In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
from matplotlib.ticker import StrMethodFormatter

In [2]:
# Load the risk-forecasting table and parse its 'time' column into datetimes.
# NOTE(review): the recorded run of this cell ended in FileNotFoundError for this
# path. '~data/...' is not the same as '~/data/...' (under the usual '~user'
# convention it would mean the home directory of a user named "data") -- confirm
# the intended data directory before re-running.
riskforecasting = pd.read_csv(
    filepath_or_buffer='~data/pollution_risk_forecasting.csv'
).assign(time=lambda frame: pd.to_datetime(frame['time']))
riskforecasting

FileNotFoundError: [Errno 2] No such file or directory: '~data/pollution_risk_forecasting.csv'

In [None]:
# Load the optics table and display it.
# NOTE(review): '~data/...' is not the same as '~/data/...'; confirm the
# intended data directory.
optics = pd.read_csv(filepath_or_buffer="~data/optics.csv")
optics

In [None]:
# Load the transp table and display it.
# NOTE(review): '~data/...' is not the same as '~/data/...'; confirm the
# intended data directory.
transp = pd.read_csv(filepath_or_buffer="~data/transp.csv")
transp

In [None]:
# Load the plankton table and display it.
# NOTE(review): '~data/...' is not the same as '~/data/...'; confirm the
# intended data directory.
plankton = pd.read_csv(filepath_or_buffer="~data/plankton.csv")
plankton

In [None]:
# Quick size arithmetic. (365 * 2 + 366) is presumably the number of days in
# three years, one of them a leap year.
# NOTE(review): the meaning of 15, 15 and 430 is not stated in the notebook --
# confirm and consider naming them as constants.
15 ** 2 * (365 * 2 + 366) * 430

In [None]:
# Parse the 'time' column of each source table into datetimes, in place, so the
# three tables share a dtype for the merge keys used later.
for source_table in (optics, transp, plankton):
    source_table['time'] = pd.to_datetime(source_table['time'])

In [None]:
# Outer-join the three source tables on their shared keys so that a row present
# in any one of them is kept in the combined table.
merge_keys = ['time', 'lat', 'lon', 'site']
feature = (
    optics
    .merge(transp, on=merge_keys, how='outer')
    .merge(plankton, on=merge_keys, how='outer')
)
feature

In [None]:
# Keep only the analysis columns, ordered by time / site / position.
#
# The sort has to happen BEFORE the column selection: 'lat' and 'lon' are not in
# the kept columns, so sorting afterwards (as this cell originally did) raises
# KeyError. The index is reset last so that it matches the sorted row order.
feature_columns = ['time', 'site', 'BBP', 'CDM', 'SPM', 'KD490', 'ZSD', 'CHL']

# Only sort by keys that are still present, so re-running this cell (after
# 'lat'/'lon' have already been dropped) does not fail.
sort_keys = [key for key in ['time', 'site', 'lat', 'lon'] if key in feature.columns]

feature = (
    feature
    .sort_values(by=sort_keys)
    .loc[:, feature_columns]
    .reset_index(drop=True)
)
feature

In [None]:
# Number of distinct timestamps in the feature table (missing values, if any,
# count as one distinct value, as with len(unique())).
feature['time'].nunique(dropna=False)

In [None]:
# Number of distinct sites in the feature table (missing values, if any, count
# as one distinct value, as with len(unique())).
feature['site'].nunique(dropna=False)

In [None]:
# Persist the combined feature table without the row index.
# NOTE(review): '~data/...' is not the same as '~/data/...'; confirm the
# intended output directory.
feature.to_csv(path_or_buf='~data/feature_data(s3).csv', index=False)

In [None]:
# Overlaid 20-bin histograms of the six feature columns, one series per column.
hist_columns = ['BBP', 'CDM', 'SPM', 'KD490', 'ZSD', 'CHL']

plt.figure(figsize=(16, 4))
plt.hist(feature[hist_columns], bins=20)
plt.legend(hist_columns)
plt.show()

## Risk Levels vs. Features

In [None]:
# Attach the feature measurements to each risk forecast. This is an inner join,
# so only (site, time) pairs present in both tables are kept.
df = riskforecasting.merge(feature, how='inner', on=['site', 'time'])
df

### Histogram of each risk level across BBP, CDM, SPM, KD490, ZSD, and CHL

Divide the values into intervals of width 10, and group all values above 100 into a single interval.

In [None]:
features = ['BBP', 'CDM', 'SPM', 'KD490', 'ZSD', 'CHL']

# Take an explicit copy: the *_Range columns added below must be written to an
# independent frame, not to a slice of `df` (which triggers
# SettingWithCopyWarning and may silently not take effect).
df_risk = df[['riskLevelLabel'] + features].copy()

# Bin width of 10, matching the description above and the later cells, which
# keep the first ten bins (0-100) and fold the rest into one "(100, ...]" row.
# The original step of 5 made those first ten bins cover only 0-50.
BIN_WIDTH = 10

for f in features:
    # Upper edge: the column maximum rounded down to a multiple of 100, plus 101,
    # so the last bin edge always lies at or above the maximum.
    upper_edge = df_risk[f].max() // 100 * 100 + 101
    # Bins are right-closed, so the first one is (0, 10]; values <= 0 get NaN.
    df_risk[f + '_Range'] = pd.cut(df_risk[f], bins=np.arange(0, upper_edge, BIN_WIDTH))
df_risk

In [None]:
def _range_counts(frame, names):
    """Count rows per value bin for each feature.

    Parameters
    ----------
    frame : DataFrame
        Must contain a '<name>_Range' column for every entry of `names`.
    names : list of str
        Feature names; each becomes one column of the result.

    Returns
    -------
    DataFrame
        One row per bin and one column per feature. Rows are sorted by bin so
        that later positional slicing (first ten bins vs. the rest) is
        well-defined; bins a feature does not have are NaN after alignment.
    """
    per_feature = {name: frame[name + '_Range'].value_counts() for name in names}
    return pd.DataFrame(per_feature).sort_index()


range_features = ['BBP', 'CDM', 'SPM', 'KD490', 'ZSD', 'CHL']

# Rows labelled as increased risk (riskLevelLabel == 1).
df_increased = df_risk[df_risk['riskLevelLabel'] == 1].drop('riskLevelLabel', axis=1)
counts_increased = _range_counts(df_increased, range_features)

# Rows labelled as normal risk (riskLevelLabel == 0). The original built these
# counts from df_increased (copy-paste slip), so both panels of the later plot
# showed the same data; they must come from df_normal.
df_normal = df_risk[df_risk['riskLevelLabel'] == 0].drop('riskLevelLabel', axis=1)
counts_normal = _range_counts(df_normal, range_features)

In [None]:
# Collapse every bin after the first ten into a single overflow row, so the
# table has ten regular rows plus one "everything above" row.
new_row = counts_normal.iloc[10:].sum(axis=0).to_frame().T
counts_normal = pd.concat([counts_normal.iloc[:10], new_row])

# Relabel the overflow row by assigning a whole new index. The original wrote
# through `index.values[-1]`, which only works while `.values` happens to be a
# writable view of the index's storage; an Index is meant to be immutable.
# The label's "550,0" typo is corrected to "550.0".
# NOTE(review): the 550 upper bound is hard-coded -- confirm it matches the data.
counts_normal.index = list(counts_normal.index[:-1]) + ['(100.0, 550.0]']

In [None]:
# Collapse every bin after the first ten into a single overflow row, so the
# table has ten regular rows plus one "everything above" row.
new_row = counts_increased.iloc[10:].sum(axis=0).to_frame().T
counts_increased = pd.concat([counts_increased.iloc[:10], new_row])

# Relabel the overflow row by assigning a whole new index. The original wrote
# through `index.values[-1]`, which only works while `.values` happens to be a
# writable view of the index's storage; an Index is meant to be immutable.
# The label's "550,0" typo is corrected to "550.0".
# NOTE(review): the 550 upper bound is hard-coded -- confirm it matches the data.
counts_increased.index = list(counts_increased.index[:-1]) + ['(100.0, 550.0]']

In [None]:
# Two stacked bar charts sharing the x axis: per-bin counts for increased-risk
# rows on top, normal-risk rows below.
fig, axs = plt.subplots(nrows=2, ncols=1, figsize=(16, 8), sharex=True)

panels = [
    (axs[0], counts_increased, 'Count \n(increased risk)'),
    (axs[1], counts_normal, 'Count \n(normal risk)'),
]

# Draw both panels first, then decorate them.
for panel_ax, panel_counts, _ in panels:
    panel_counts.plot(kind='bar', ax=panel_ax)

# Only the bottom panel carries the x label, since the x axis is shared.
axs[1].set_xlabel('Value Range')

for panel_ax, _, panel_ylabel in panels:
    # Horizontal y label, right-aligned so it sits beside the axis.
    panel_ax.set_ylabel(panel_ylabel, rotation=0, ha='right')

for panel_ax, _, _ in panels:
    panel_ax.legend(title='Column')

# Tilt the bin labels so neighbouring intervals do not overlap.
axs[1].set_xticklabels(axs[1].get_xticklabels(), rotation=70)
plt.tight_layout()
plt.show()