# Class activity: Peak over threshold algorithm development

In [2]:
# copy code from HW2 to get the ozone data
%matplotlib inline
from IPython.display import HTML
from IPython.display import display

# import packages that we need to read the data files
# convert date strings into numerical values for plotting time series
# pandas is a powerful (but a bit more complicated) package to work with spreadsheet data
import pandas as pd
import datetime as dt 

# our two main packages for data analysis
import numpy as np
import matplotlib.pyplot as plt

In [3]:
def load_data(city,pollutant='ozone'):
    """A supporting function to load ozone data from a csv file
    
    Args:
        city (str): A string for the city name (must match string in file names).
        pollutant (str): name of the pollutant (defaults to 'ozone')
            Use this second parameter to assign another string 
    
    Returns:
        x (numpy array): an array with the dates (values have type datetime)
        y (numpy array): an array with the concentration
    """
    path="/nfs/home11/staff/timm/Public/Data/hw2/"
    filename=city+'_'+pollutant+'.csv'
    try:
        open(path+filename,'r')
        is_file=True
        print (80*"+")
        print ("Load data for "+city+" pollutant: "+pollutant)
        print ("Local file is "+path+filename)
        print (80*"+")
    except:
        print("Warning: could not open file "+path+filename)
        is_file=False
    
    if is_file:
        df=pd.read_csv(path+filename)
        print(80*"-")
        print ("+ Name of data columns in the Pandas Dataframe:")
        for name in df.columns:
            print (name)
        print(80*"-")
        ########################################################
        # pre-processing of the data
        ########################################################
        
        ########################################################
        # 1. convert the date data (type string) into numerical 
        # values (useful for plotting in plt.plot)
        ########################################################
        
        dates=df['Date'] # extracts the column named 'Date' from dataframe
        datelist=[]
        n=0
        for d in dates: # dates is iterable
            # take the string and convert into a numerical value
            value=dt.datetime.strptime(d,'%m/%d/%y')
            datelist.append(value)
            n=n+1
        
        # 
        x=np.array(datelist) # convert the list with datetimevalues into numpy array
        
        ########################################################
        # 2. extract column with the ozone concentration data
        ########################################################
        # gets data in a type numpy array
        y=df['Daily Max 8-hour Ozone Concentration'].values 
        # units we expect to be the same in each row, so we get one cell value
        unit=df['UNITS'][0] 
        print ("Loaded the data successfully!")
        print ("Number of days in file: "+str(n))
        print ("Dates:"+str(x[0])+" to "+str(x[-1]))
        print ("Concentration values range from: ")
        print ("%12.4f to %12.4f" % (np.nanmin(y), +np.nanmax(y)))
        print ("Units: "+unit)
        
    else:
        print ("do else")
        
        x,y = np.nan, np.nan
    
    return x,y

In [4]:
help(load_data)

Help on function load_data in module __main__:

load_data(city, pollutant='ozone')
    A supporting function to load ozone data from a csv file
    
    Args:
        city (str): A string for the city name (must match string in file names).
        pollutant (str): name of the pollutant (defaults to 'ozone')
            Use this second parameter to assign another string 
    
    Returns:
        x (numpy array): an array with the dates (values have type datetime)
        y (numpy array): an array with the concentration



Note: we have five data files for these cities:

| City          | String to use in function  load_data()  |
|---------------|-----------------------------------------|
| New York City | 'nyc'                                   |
| Los Angeles   | 'los_angeles'                           |
| Houston       | 'houston_tx'                            |
| Philadelphia  | 'philadelphia_pa'                       |
| Phoenix       | 'phoenix_az'                            |


In [5]:
x,y = load_data('nyc')

++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Load data for nyc pollutant: ozone
Local file is /nfs/home11/staff/timm/Public/Data/hw2/nyc_ozone.csv
++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
--------------------------------------------------------------------------------
+ Name of data columns in the Pandas Dataframe:
Date
Site ID
Daily Max 8-hour Ozone Concentration
UNITS
SITE_LATITUDE
SITE_LONGITUDE
Unnamed: 6
Unnamed: 7
--------------------------------------------------------------------------------
Loaded the data successfully!
Number of days in file: 365
Dates:2019-01-01 00:00:00 to 2019-12-31 00:00:00
Concentration values range from: 
      0.0070 to       0.0810
Units: ppm


In [None]:
# your code