In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
#Obtain the county file names, in a list, from the State parent folder
#Takes the parent directory (string) as parameter
#returns the list containing all the file names

def get_filenames(statename):
    parentfolder = "C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/" + statename
    filenames = []
    filenames = os.listdir(parentfolder)
    return filenames

In [3]:
#Concatenate Data frames

def df_county_concatenation(statename):
    statepath = "C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/" + statename
    countydfs = []
    countylist = []
    countylist = os.listdir(statepath)
    for x in range(0, len(countylist)):
        df1 = pd.read_csv(statepath + "/" + countylist[x])
        countydfs.append(df1)
    result = pd.concat(countydfs)
    return result
    

In [5]:
'''takes in a pandas dataframe with column "Units of Measure" and "Arithmetic Mean", and replaces ml/m3 into ppm in the original dataframe'''
def replace_units(df):
    tf = df["Units of Measure"] == ("Micrograms/cubic meter (LC)" or "Micrograms/cubic meter (25 C)")
    df['Arithmetic Mean'] = np.where(tf, df['Arithmetic Mean']*0.001, df['Arithmetic Mean'])

In [6]:
def replace_units_ppb_to_ppm(df):
    tf = df["Units of Measure"] == "Parts per billion"
    df['Arithmetic Mean'] = np.where(tf, df['Arithmetic Mean']*0.001, df['Arithmetic Mean'])

In [7]:
def filter_unique_sitekeys(str_parameter, original_df):
    #Filters out for unique site keys
    returning_df = original_df.groupby(["Site_keys"], axis=0).mean()
    uniquemeans = list(returning_df["Arithmetic Mean"])
    site_keys = original_df["Site_keys"].unique()
    pollutant = np.repeat(str_parameter, len(uniquemeans))
    LU = []

    for i in site_keys:
        ind = list(original_df["Site_keys"] == i).index(True)
        LU.append(list(original_df["LU"])[ind])

    d = {"Parameter Name": pollutant, "Site_keys": site_keys, "LU": LU, "ppm": uniquemeans}
    returning_df = pd.DataFrame(data = d)
    cols = ["Parameter Name", "Site_keys",  "LU", "ppm"]
    returning_df = returning_df[cols]
    return returning_df
    

In [8]:
#Before anything can be done:
#The component fil(pm_Statename) must be downloaded and put in PM25_Components Folder
#The unzipped state folder must be downloaded and located in StateData
pm25_param_name = "Suspended particulate (TSP)"
#NOTE: in round 1, the param name for particulate was "PM 2.5 - Local Conditions"
no2_param_name = "Nitrogen dioxide (NO2)"
so2_param_name = "Sulfur dioxide"

In [232]:
state = "Rhode Island"
counties = get_filenames(state)
fullstate = df_county_concatenation(state)

In [233]:
print(counties)

['Providence0.csv', 'Providence1.csv', 'Washington0.csv']


In [234]:
fullstate.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/States/"+ state+ ".csv")
print("file written")

file written


In [235]:
#After the files are written, we must read in the pandas dataframe and make appropriate conversions into ppm
replace_units(fullstate)

In [236]:
fullstate["Parameter Name"].unique()

array(['Suspended particulate (TSP)', 'Lead (TSP) STP',
       'Sulfate (TSP) STP', 'Arsenic (TSP) STP', 'Beryllium (TSP) STP',
       'Barium (TSP) STP', 'Cadmium (TSP) STP', 'Chromium (TSP) STP',
       'Cobalt (TSP) STP', 'Copper (TSP) STP', 'Iron (TSP) STP',
       'Manganese (TSP) STP', 'Molybdenum (TSP) STP', 'Nickel (TSP) STP',
       'Vanadium (TSP) STP', 'Zinc (TSP) STP', 'Nitrate (TSP) STP',
       'Soil index (COH)', 'Sulfur dioxide', 'Carbon monoxide', 'Ozone',
       'Wind Speed - Scalar', 'Wind Direction - Scalar',
       'Nitric oxide (NO)', 'Nitrogen dioxide (NO2)',
       'Beta radiation (TSP)', 'Alpha radiation (TSP)'], dtype=object)

In [237]:
#Filter our full state data frame to PM 2.5 Local Conditions -- this was for round 1
#micropercubic = fullstate.loc[fullstate['Units of Measure'] == "Micrograms/cubic meter (25 C)"]
#local = micropercubic.loc[micropercubic['Parameter Name'] == pm25_param_name]
#localconditions = local.loc[local["Pollutant Standard"] == 'PM25 Annual 2012']
#localconditions_filtereddf = localconditions.loc[:, ["Parameter Name", "Site_keys",  "LU", "Arithmetic Mean"]]

In [238]:
#Filter our full state data frame to suspended particulate -- round 2
micropercubic = fullstate.loc[fullstate['Units of Measure'] == "Micrograms/cubic meter (25 C)"]
local = micropercubic.loc[micropercubic['Parameter Name'] == pm25_param_name]
localconditions_filtereddf = local.loc[:, ["Parameter Name", "Site_keys",  "LU", "Arithmetic Mean"]]

In [239]:
#Filters out for unique site keys
localconditions_filtereddf_unique = filter_unique_sitekeys(pm25_param_name, localconditions_filtereddf)
localconditions_filtereddf_unique

Unnamed: 0,Parameter Name,Site_keys,LU,ppm
0,Suspended particulate (TSP),[ 41.7826 -71.472282],"{'Commercial and Services': 0.135, 'Residentia...",59.236364
1,Suspended particulate (TSP),[ 41.841767 -71.360055],"{'Residential': 0.7964601769911505, 'Streams a...",52.172414
2,Suspended particulate (TSP),[ 41.816489 -71.368943],"{'Commercial and Services': 0.03, 'Residential...",56.807018
3,Suspended particulate (TSP),[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...,58.610169
4,Suspended particulate (TSP),[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential...",83.353982
5,Suspended particulate (TSP),[ 41.782878 -71.472282],"{'Commercial and Services': 0.115, 'Residentia...",65.357143
6,Suspended particulate (TSP),[ 41.879267 -71.382 ],{'Commercial and Services': 0.3489583333333333...,50.280702
7,Suspended particulate (TSP),[ 41.877322 -71.3845 ],{'Commercial and Services': 0.1560693641618497...,56.940299
8,Suspended particulate (TSP),[ 41.8226 -71.410057],"{'Commercial and Services': 0.88, 'Industrial'...",51.829268
9,Suspended particulate (TSP),[ 41.820933 -71.407557],"{'Commercial and Services': 0.72, 'Streams and...",49.461538


In [240]:
localconditions_filtereddf_unique.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_States_Final/PM25_LocalConditions/" + state+ "_localconditions.csv")
print("file written")

file written


In [170]:
#combine local conditions and component into a single df
componentpms = pd.read_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_States_Final/PM25_Components/pm_" + state +".csv")
state_pm = [localconditions_filtereddf_unique, componentpms]
state_pm = pd.concat(state_pm)
cols = ["Parameter Name", "Site_keys",  "LU", "ppm"]
state_pm = state_pm[cols]
state_pm

FileNotFoundError: File b'C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_States_Final/PM25_Components/pm_Maryland.csv' does not exist

In [None]:
state_pm.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_States_Final/PM25_By_State/PM25_" + state +".csv")
print("file written")

In [241]:
sulfurdioxide = fullstate.loc[fullstate['Parameter Name'] == so2_param_name]
sulfurdioxide

Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration,...,10th Percentile,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change,Site_keys,LU
25,44,7,11,42401,1,41.832322,-71.416445,WGS84,Sulfur dioxide,1 HOUR,...,7.6,,"HEALTH DEPT. PARKING LOT, STATE STREET",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...
26,44,7,11,42401,1,41.832322,-71.416445,WGS84,Sulfur dioxide,1 HOUR,...,1.1,,"HEALTH DEPT. PARKING LOT, STATE STREET",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...
27,44,7,11,42401,1,41.832322,-71.416445,WGS84,Sulfur dioxide,24-HR BLK AVG,...,3.2,,"HEALTH DEPT. PARKING LOT, STATE STREET",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...
28,44,7,11,42401,1,41.832322,-71.416445,WGS84,Sulfur dioxide,3-HR BLK AVG,...,1.6,,"HEALTH DEPT. PARKING LOT, STATE STREET",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...
32,44,7,12,42401,1,41.825556,-71.405278,NAD83,Sulfur dioxide,1 HOUR,...,9.9,Rockefeller Library,"ROCKEFELLER LIBRARY, PROSPECT STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential..."
33,44,7,12,42401,1,41.825556,-71.405278,NAD83,Sulfur dioxide,1 HOUR,...,2.6,Rockefeller Library,"ROCKEFELLER LIBRARY, PROSPECT STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential..."
34,44,7,12,42401,1,41.825556,-71.405278,NAD83,Sulfur dioxide,24-HR BLK AVG,...,4.1,Rockefeller Library,"ROCKEFELLER LIBRARY, PROSPECT STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential..."
35,44,7,12,42401,1,41.825556,-71.405278,NAD83,Sulfur dioxide,3-HR BLK AVG,...,2.6,Rockefeller Library,"ROCKEFELLER LIBRARY, PROSPECT STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential..."
50,44,7,1007,42401,1,41.820933,-71.407557,WGS84,Sulfur dioxide,1 HOUR,...,7.6,,"TRAILER, 120 DYER STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.820933 -71.407557],"{'Commercial and Services': 0.72, 'Streams and..."
51,44,7,1007,42401,1,41.820933,-71.407557,WGS84,Sulfur dioxide,1 HOUR,...,1.1,,"TRAILER, 120 DYER STREET.",Rhode Island,Providence,Providence,"Providence-Warwick, RI-MA",2013-02-16,[ 41.820933 -71.407557],"{'Commercial and Services': 0.72, 'Streams and..."


In [242]:
replace_units_ppb_to_ppm(sulfurdioxide)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [243]:
#Sample Duration: 1 Hour
#Metric Used: observed values
sulfurdioxide_filtereddf = sulfurdioxide.loc[sulfurdioxide['Sample Duration'] == "1 HOUR"]
sulfurdioxide_filtereddf = sulfurdioxide_filtereddf.loc[sulfurdioxide_filtereddf['Metric Used'] == "Observed Values"]
cols = ["Parameter Name", "Site_keys",  "LU", "Arithmetic Mean"]
sulfurdioxide_filtereddf = sulfurdioxide_filtereddf[cols]
sulfurdioxide_filtereddf_final = filter_unique_sitekeys(so2_param_name, sulfurdioxide_filtereddf)
sulfurdioxide_filtereddf_final

Unnamed: 0,Parameter Name,Site_keys,LU,ppm
0,Sulfur dioxide,[ 41.832322 -71.416445],{'Commercial and Services': 0.3382352941176471...,0.015484
1,Sulfur dioxide,[ 41.825556 -71.405278],"{'Commercial and Services': 0.93, 'Residential...",0.013782
2,Sulfur dioxide,[ 41.820933 -71.407557],"{'Commercial and Services': 0.72, 'Streams and...",0.013829


In [244]:
nitrogendioxide = fullstate.loc[fullstate['Parameter Name'] == no2_param_name]
replace_units_ppb_to_ppm(nitrogendioxide)
#Sample Duration: 1 Hour
#Metric Used: observed values
nitrogendioxide_filtereddf = nitrogendioxide.loc[nitrogendioxide['Sample Duration'] == "1 HOUR"]
nitrogendioxide_filtereddf = nitrogendioxide_filtereddf.loc[nitrogendioxide_filtereddf['Metric Used'] == "Observed values"]
cols = ["Parameter Name", "Site_keys",  "LU", "Arithmetic Mean"]
nitrogendioxide_filtereddf = nitrogendioxide_filtereddf[cols]
nitrogendioxide_filtereddf_final = filter_unique_sitekeys(no2_param_name, nitrogendioxide_filtereddf)
nitrogendioxide_filtereddf_final

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Parameter Name,Site_keys,LU,ppm
0,Nitrogen dioxide (NO2),[ 41.820933 -71.407557],"{'Commercial and Services': 0.72, 'Streams and...",0.035406


In [245]:
sulfurdioxide_filtereddf_final.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/SO2_States_Final/SO2_"+state + ".csv")
nitrogendioxide_filtereddf_final.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/NO2_States_Final/NO2_"+state + ".csv")
print("files written")

files written


In [246]:
#Compile the PM2.5, SO2, and NO2 data frames into a single data frame, one for each pollutant
PM25_path = "C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_States_Final/PM25_By_State"
PM25_files = os.listdir(PM25_path)

SO2_path = "C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/SO2_States_Final"
SO2_files = os.listdir(SO2_path)

NO2_path = "C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/NO2_States_Final"
NO2_files = os.listdir(NO2_path)

In [248]:
NO2_files

['NO2_Alabama.csv',
 'NO2_Arizona.csv',
 'NO2_California.csv',
 'NO2_Connecticut.csv',
 'NO2_Delaware.csv',
 'NO2_Florida.csv',
 'NO2_Georgia.csv',
 'NO2_Illinois.csv',
 'NO2_Maine.csv',
 'NO2_Maryland.csv',
 'NO2_Massachusetts.csv',
 'NO2_New Hampshire.csv',
 'NO2_New Jersey.csv',
 'NO2_New York.csv',
 'NO2_Pennsylvania.csv',
 'NO2_Rhode Island.csv']

In [249]:
PM25_list = []
SO2_list = []
NO2_list = []

#for x in range(0, len(PM25_files)):
#    df1 = pd.read_csv(PM25_path + "/" + PM25_files[x])
#    PM25_list.append(df1)

for y in range(0, len(SO2_files)):
    df2 = pd.read_csv(SO2_path + "/" + SO2_files[y])
    SO2_list.append(df2)

for z in range(0, len(NO2_files)):
    df3 = pd.read_csv(NO2_path + "/" + NO2_files[z])
    NO2_list.append(df3)
        
#PM25_result = pd.concat(PM25_list)
SO2_result = pd.concat(SO2_list)
NO2_result = pd.concat(NO2_list)

cols = ["Parameter Name", "Site_keys",  "LU", "ppm"]
#PM25_result = PM25_result[cols]
SO2_result = SO2_result[cols]
NO2_result = NO2_result[cols]

In [250]:
#Write the concatenated dataframes to files
#PM25_result.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/PM25_COMPLETE.csv")
SO2_result.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/SO2_COMPLETE.csv")
NO2_result.to_csv("C:/Users/Anthony DePinho/Documents/REU 2017/TRiCAM_BostonAQ/StateData/NO2_COMPLETE.csv")
print("files written")

files written
