In [1]:
import pandas as pd
from read_data import ReadData

# Show up to 100 columns when a DataFrame is rendered, so wide merged frames are not elided.
pd.options.display.max_columns = 100

In [2]:
"""
WIND:
    61101 - Speed - Scalar
    61103 - Resultant
TEMP:
    62104 - 24 hr Max
    68105 - Avg temperature
Relative Humidity:
    62201 - Relative Humidity
Pressure:
    64101 - Barometric pressure
    68108 - Avg pressure
Dew point:
    62103
"""

# Short label -> parameter code for each daily file to load.
# The label is embedded in the output column name ('Arithmetic Mean <label>'),
# so the aggregation cell refers to these exact spellings. "Pressue" is a typo,
# but it must be renamed together with 'Arithmetic Mean Pressue' downstream.
# NOTE(review): TEMP uses 62101, which the notes string at the top of this cell
# does not list (it mentions 62104 and 68105) - confirm the intended series.
parameter_map = {
    "WIND": 61101,
#     "WIND_RES": 61103,  (disabled: resultant wind is not loaded)
    "TEMP": 62101,
    "RH": 62201,
    "Pressue": 64101
}

# Directory holding the daily CSV files; joined into "<path>/daily_<code>_2020_<state_code>.csv".
path = "api_data/R_data"
# State code used as the file-name suffix (presumably the FIPS code for California - verify).
state_code = "06"
# Values matched against the 'County Code' column to select rows.
# Despite the name these are county codes, not cities.
filter_city_list = ['037', '111']

In [3]:
# Load one daily file per entry in parameter_map, keep the rows whose
# 'County Code' is in filter_city_list, and outer-merge the per-parameter
# 'Arithmetic Mean' columns into one wide frame keyed by (id, Date Local, County).
# Alongside, track the counties that appear in every file.
#
# None is used as the "nothing seen yet" sentinel for both accumulators:
#   * an empty set is a valid intersection result and must not be re-seeded
#     from the next file (the old `if not cities` check did that);
#   * an empty filtered frame must still contribute its column, otherwise the
#     later aggregation fails on a missing 'Arithmetic Mean <label>' column
#     (the old `if combined_df.empty` check dropped it).
cities = None
combined_df = None
merge_keys = ["id", "Date Local", "County"]

for parameter, code in parameter_map.items():
    fname = "{}/daily_{}_2020_{}.csv".format(path, code, state_code)
    print(fname)
    df = ReadData(parameter, year='2020', filename=fname).get_pandas_obj()

    # Counties present in this file, taken before filtering by county code.
    counties_in_file = set(df['County'].unique().tolist())
    cities = counties_in_file if cities is None else cities & counties_in_file

    # Select rows and columns in one .loc call and rename, instead of adding a
    # column to a filtered slice (which can raise SettingWithCopyWarning).
    new_parameter_col = 'Arithmetic Mean {}'.format(parameter)
    resultant_df = df.loc[
        df['County Code'].isin(filter_city_list),
        ['id', 'Date Local', 'County', 'Arithmetic Mean'],
    ].rename(columns={'Arithmetic Mean': new_parameter_col})

    if combined_df is None:
        combined_df = resultant_df
    else:
        combined_df = combined_df.merge(resultant_df, on=merge_keys, how="outer")

# Keep the original types when parameter_map is empty.
if cities is None:
    cities = set()
if combined_df is None:
    combined_df = pd.DataFrame()

print(cities)

api_data/R_data/daily_61101_2020_06.csv
api_data/R_data/daily_62101_2020_06.csv
api_data/R_data/daily_62201_2020_06.csv
api_data/R_data/daily_64101_2020_06.csv
{'San Bernardino', 'San Diego', 'Inyo', 'Los Angeles', 'Ventura'}


In [4]:
# Collapse combined_df to one row per (date, county), keeping the largest
# daily mean of each parameter across the rows in that group, then switch to
# short snake_case column names.
df = (
    combined_df
    .groupby(['Date Local', "County"])
    .agg({
        'Arithmetic Mean Pressue': 'max',
        'Arithmetic Mean RH': 'max',
        'Arithmetic Mean TEMP': 'max',
        'Arithmetic Mean WIND': 'max',
    })
    .reset_index()
    .rename(columns={
        'Date Local': "date",
        'County': "county",
        'Arithmetic Mean Pressue': "mean_pressure",
        'Arithmetic Mean RH': "mean_rh",
        'Arithmetic Mean TEMP': "mean_temp",
        'Arithmetic Mean WIND': "mean_wind",
    })
)

In [5]:
# One row per distinct date (first occurrence kept), numbered 1..N in the
# order the dates appear in df. The explicit .copy() makes tdf an independent
# frame, so adding the column no longer raises SettingWithCopyWarning.
tdf = df.drop_duplicates('date', keep='first').copy()
tdf['date_number'] = list(range(1, len(tdf) + 1))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [6]:
# Add a numeric county flag (1 for 'Los Angeles', 2 for anything else) and
# attach each date's sequence number from tdf.
# NOTE: this rebinds df; re-running the cell without re-running the cells
# above merges date_number a second time.
df = (
    df
    .assign(city=df['county'].map(lambda name: 1 if name == 'Los Angeles' else 2))
    .merge(tdf[['date', 'date_number']], on='date')
)

In [None]:
# Load the 2020 "PM2" frame and keep only the rows for the configured state.
pol_df = ReadData("PM2", year='2020').get_pandas_obj()
# Filter on the shared state_code setting (currently "06") rather than a second
# hard-coded literal, so the pollutant data cannot drift from the weather data.
pol_df = pol_df[pol_df['State Code'] == state_code]

In [25]:
# Reload the 2020 "PM2" frame, keep only the rows for the configured state,
# and list the counties that have data.
pol_df = ReadData("PM2", year='2020').get_pandas_obj()
# Filter on the shared state_code setting (currently "06") rather than a second
# hard-coded literal, so the pollutant data cannot drift from the weather data.
pol_df = pol_df[pol_df['State Code'] == state_code]
pol_df['County'].unique()
# Alternative view (disabled): row counts per county, largest first.
# pol_df.groupby(['County', 'State Code', 'State']).count().sort_values(by="id", ascending=False)

array(['Ventura', 'Santa Cruz', 'Santa Barbara', 'San Luis Obispo',
       'Tulare', 'Sutter', 'San Diego', 'San Bernardino', 'Fresno',
       'Los Angeles', 'Kings', 'Kern', 'Inyo', 'Imperial', 'Riverside',
       'Plumas', 'Placer', 'San Benito', 'Sacramento', 'Orange', 'Merced',
       'Nevada', 'Monterey', 'Mono', 'Colusa', 'Lake', 'Madera',
       'Stanislaus', 'Shasta', 'Yolo'], dtype=object)

In [None]:
# Keep only the selected county codes, then reduce to one row per
# (date, county) holding the largest 'First Max Value' in that group.
pol_df = (
    pol_df
    .loc[pol_df['County Code'].isin(filter_city_list)]
    .groupby(['Date Local', "County"])
    .agg({'First Max Value': 'max'})
    .reset_index()
)

In [None]:
# Per-county rolling mean of 'First Max Value' over 7, 14 and 21 rows, stored
# as pollutant_7 / pollutant_14 / pollutant_21.
# transform() returns a Series indexed like pol_df, so it can be assigned
# directly; the earlier reset_index() round-trip only lined up when pol_df had
# a default 0..n-1 index.
# The first (window - 1) rows of each county are NaN (rolling's default min_periods).
# NOTE(review): the window counts rows, not calendar days; it equals N days only
# if each county has exactly one row per consecutive date - confirm.
for window in (7, 14, 21):
    pol_df['pollutant_{}'.format(window)] = (
        pol_df
        .groupby(['County'])['First Max Value']
        .transform(lambda values: values.rolling(window).mean())
    )

In [None]:
# Take the 7-row rolling pollutant value per (date, county), rename to the
# short column names used by df, and inner-join it with the weather features.
res_df_1 = (
    pol_df[['Date Local', "County", "pollutant_7"]]
    .rename(columns={'Date Local': 'date', "County": 'county', "pollutant_7": 'pollutant'})
    .merge(df, on=['date', 'county'])
)

In [None]:
# Read the county-level COVID file and parse its 'date' column (YYYY-MM-DD)
# into datetime64 values.
covid_data = (
    pd.read_csv("data/covid_data/covid_us_county.csv")
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format="%Y-%m-%d"))
)

In [None]:
# Add one to every case and death count.
# NOTE: this overwrites the columns in place, so re-running the cell adds
# another 1 each time; re-run the load cell first.
# NOTE(review): presumably this keeps the counts strictly positive for a later
# log or ratio - confirm the intent.
for count_col in ('cases', 'deaths'):
    covid_data[count_col] = covid_data[count_col] + 1

In [None]:
# For each count column, store the previous row's value within the same
# 'county' group as <column>_shifted. The first row of each group has no
# predecessor and is filled with 1.
# NOTE(review): "previous row" means "previous day" only if the rows are
# already in date order within each county - confirm.
# NOTE(review): the group key is the county name alone; counties that share a
# name would fall into one group - verify against the data.
for count_col in ('cases', 'deaths'):
    previous_row_value = covid_data.groupby('county')[count_col].shift(1)
    covid_data[count_col + '_shifted'] = previous_row_value.fillna(1)

In [None]:
# Inner-join the pollutant and weather rows with the COVID counts on (date, county).
# covid_data['date'] was parsed to datetime64 when the file was loaded; the left
# key is coerced the same way so both sides share a dtype. This is a no-op if
# res_df_1['date'] is already datetime, and avoids a dtype-mismatch error if it
# is still a string.
final_df = res_df_1.assign(date=pd.to_datetime(res_df_1['date'])).merge(
    covid_data[['date', 'county', 'cases_shifted', 'cases']],
    on=["date", "county"],
)

In [None]:
# Write the merged table to CSV without the row index.
output_csv = "r_files/pollutant_pm2.csv"
final_df.to_csv(output_csv, index=False)