In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import folium
from folium import plugins
import fancyimpute

Using TensorFlow backend.


In [2]:
# Directory holding one pipe-delimited "combined_wide_<year>.csv" file per year.
path ='/public/npatil4/dataMining/data/daily/combined_wide/'
# `years` was never defined anywhere in the notebook, so this cell raised
# NameError on a fresh kernel.
# NOTE(review): 2009-2015 is inferred from the seven frames loaded below and
# the first DateLocal value (2009-01-01) -- confirm against the files on disk.
years = range(2009, 2016)
files = [path+'combined_wide_'+str(year)+'.csv' for year in years]

In [3]:
# Read every yearly pipe-delimited file into its own DataFrame (one per entry in `files`).
data = list(pd.read_csv(csv_path, sep='|') for csv_path in files)

In [4]:
# (rows, columns) of each yearly frame, in the same order as `files`.
[yearly_frame.shape for yearly_frame in data]

[(148428, 18),
 (160416, 18),
 (157975, 18),
 (168291, 18),
 (184197, 18),
 (196085, 18),
 (206363, 18)]

In [5]:
# Stack the per-year frames into a single frame with a fresh 0..n-1 index.
# Note: this rebinds `data` from a list of frames to one DataFrame.
data = pd.concat(objs=data, ignore_index=True)

In [6]:
# Parse the date column, then derive calendar features from it.
# The three new columns are appended at the end of the frame.
data['DateLocal'] = pd.to_datetime(data['DateLocal'])
data['year'] = data['DateLocal'].dt.year
data['weekNo'] = data['DateLocal'].dt.week
data['Weekday'] = data['DateLocal'].dt.weekday

In [7]:
# Show the first two records with columns laid out as rows (easier to read for a wide frame).
data.head(2).T

Unnamed: 0,0,1
DateLocal,2009-01-01 00:00:00,2009-01-01 00:00:00
CityName,Aberdeen,Adjuntas
CountyName,Brown,Adjuntas
StateName,South Dakota,Puerto Rico
Latitude,45.4625,18.17538
Longitude,-98.48611,-66.72599
Barometricpressure,,
Carbonmonoxide,,
DewPoint,,
NitrogenDioxide,,


In [8]:
# Boolean frame with the same shape as `data`: True wherever a value is missing.
data_missing = data.isnull()

In [9]:
# Percentage of missing values per measurement column.
# The positional slice 6:-4 skips the six leading columns and the last four
# columns.  `.iloc` replaces the deprecated `.ix` indexer, and the vectorised
# `.sum()` replaces applying the Python builtin `sum` column by column.
100 * data_missing.iloc[:, 6:-4].sum() / data_missing.shape[0]

Barometricpressure    81.756326
Carbonmonoxide        76.663324
DewPoint              98.023335
NitrogenDioxide       70.833923
OutdoorTemperature    66.338505
Ozone                 54.405752
PM                     0.000000
RelativeHumidity      78.592762
Sulfurdioxide         72.999251
WindSpeed             71.541185
dtype: float64

In [10]:
# Number of missing measurements in each row (same positional slice as the
# percentage summary), averaged per (county, city) pair.
# `.iloc` replaces the deprecated `.ix`; `.sum(axis=1)` replaces a row-wise
# `apply(sum, 1)`, which ran the Python builtin once per row.
data_missing_count = (
    data_missing.iloc[:, 6:-4]
    .sum(axis=1)
    .groupby([data.CountyName, data.CityName])
    .mean()
)
# Keep only the locations that average at least 8 missing measurements per
# row, and turn the (county, city) index back into ordinary columns.
data_missing_count = pd.DataFrame(data_missing_count[data_missing_count>=8]).reset_index()

In [11]:
# Mean of every measurement per monitoring location, over all dates.
# The last four columns are dropped by position before grouping; `.iloc`
# replaces the deprecated `.ix` indexer for this positional slice.
# NOTE(review): on pandas >= 2.0 the non-numeric columns left after the slice
# would need to be dropped (or numeric_only=True passed) before `.mean()`.
data_pol = data.iloc[:, :-4].groupby(
    ['CityName', 'CountyName', 'Latitude', 'Longitude'],
    as_index=False).mean()

In [12]:
# Centre the map on the mean coordinates of all locations in `data_pol`.
pollution_heatmap = folium.Map(location=[data_pol['Latitude'].mean(),
                                         data_pol['Longitude'].mean()],
                               zoom_start=4)
# One [latitude, longitude, weight] triple per location, weighted by its PM value.
# Built with a vectorised column selection instead of iterating rows, and
# attached with `add_child` (the non-deprecated spelling of `add_children`).
pollution_heatmap.add_child(
    plugins.HeatMap(
        data_pol[['Latitude', 'Longitude', 'PM']].values.tolist()
    ))
pollution_heatmap.save("heatmap.html")
# Last expression: render the map inline.
pollution_heatmap

In [13]:
# Mean of every measurement per (date, location) pair.
# Note: this rebinds `data_pol`, replacing the per-location frame used for the heatmap.
# The last four columns are dropped by position before grouping; `.iloc`
# replaces the deprecated `.ix` indexer for this positional slice.
# NOTE(review): on pandas >= 2.0 the non-numeric columns left after the slice
# would need to be dropped (or numeric_only=True passed) before `.mean()`.
data_pol = data.iloc[:, :-4].groupby(
    ['DateLocal', 'CityName', 'CountyName', 'Latitude', 'Longitude'],
    as_index=False).mean()


In [14]:
# Measurement columns selected for imputation and used to label the imputed
# outputs.  The order matters: it becomes the column order of the output files.
columns = [
    'Carbonmonoxide', 'DewPoint', 'NitrogenDioxide', 'OutdoorTemperature',
    'Ozone', 'PM', 'RelativeHumidity', 'Sulfurdioxide',
]

In [15]:
# Select the measurement columns by label; all rows of `data_pol` are kept in order.
# `.loc` replaces the deprecated `.ix` indexer for this label-based selection.
X_incomplete = data_pol.loc[:, columns]
X_incomplete.head()

Unnamed: 0,Carbonmonoxide,DewPoint,NitrogenDioxide,OutdoorTemperature,Ozone,PM,RelativeHumidity,Sulfurdioxide
0,,,,,,5.2,,
1,,,,,,3.9,,
2,,,,,,7.9,,2.576708
3,,,,,,7.9,,
4,,,,,,4.95,,


In [16]:
# (rows, columns) of the incomplete measurement matrix.
X_incomplete.shape

(1221755, 8)

In [None]:
# Persist the incomplete matrix as pipe-delimited text; the row index is written too.
X_incomplete.to_csv(path_or_buf='../data/daily/combined_wide/combined_wide_all_missing.txt', sep='|')


In [None]:
from fancyimpute import BiScaler, KNN, NuclearNormMinimization, SoftImpute, MICE

# Common prefix of every imputed-output file; each method appends its own tag.
filled_path = '../data/daily/combined_wide/combined_wide_all_filled_'


def _save_filled(filled_values, suffix):
    """Label an imputed matrix with `columns` and write it to disk.

    Parameters
    ----------
    filled_values : array-like
        Completed matrix returned by an imputer's ``complete`` call, with one
        column per entry of ``columns``.
    suffix : str
        Method tag appended to ``filled_path`` (for example ``'si'``).

    Returns
    -------
    pandas.DataFrame
        The labelled frame that was written to
        ``filled_path + suffix + '.txt'`` (pipe-delimited, no index column).
    """
    filled_frame = pd.DataFrame(filled_values)
    filled_frame.columns = columns
    filled_frame.to_csv(filled_path + suffix + '.txt', sep='|', index=False)
    return filled_frame


print('SoftImpute')
X_filled_si = _save_filled(
    SoftImpute(n_power_iterations=5,
               max_iters=250,
               init_fill_method='mean').complete(X_incomplete),
    'si')

print('MICE')
# The original cell called `MICE2`, which is neither imported nor defined in
# this notebook (NameError on a fresh kernel); the imported `MICE` is used.
# NOTE(review): the original then unpacked the result as a 3-tuple
# (data, mask, replacements) and merged it with np.putmask, which suggests
# `MICE2` was a locally patched imputer.  The stock `MICE.complete` is
# expected to return the completed matrix directly -- confirm, or import the
# patched class explicitly if its behaviour is required.
X_filled_mice = _save_filled(
    MICE(n_pmm_neighbors=100,
         n_imputations=250,
         init_fill_method='median',
         verbose=False).complete(X_incomplete),
    'mice')

# The three writes below originally used `path_filled`, which is never
# defined; the prefix variable is `filled_path`.
print('KNN')
X_filled_knn = _save_filled(KNN(k=100).complete(X_incomplete), 'knn')

print('nnm')
X_filled_nnm = _save_filled(NuclearNormMinimization().complete(X_incomplete), 'nnm')

In [22]:
# Load the SoftImpute-completed matrix (pipe-delimited) as the modelling frame.
data_model = pd.read_csv(
    filepath_or_buffer='../data/daily/combined_wide/combined_wide_all_filled_si.txt',
    sep='|',
)
data_model.head()

Unnamed: 0,Carbonmonoxide,DewPoint,NitrogenDioxide,OutdoorTemperature,Ozone,PM,RelativeHumidity,Sulfurdioxide
0,0.063618,0.425478,1.502567,1.523767,0.006282,5.2,1.377421,0.442094
1,0.047714,0.319108,1.126925,1.142825,0.004712,3.9,1.033066,0.331571
2,0.102734,0.61201,2.501234,2.330388,0.010028,7.9,2.128413,2.576708
3,0.096651,0.646399,2.282745,2.314953,0.009544,7.9,2.09262,0.671643
4,0.06056,0.405022,1.430328,1.450508,0.00598,4.95,1.311199,0.42084


In [23]:
# Attach the city label and observation date from `data_pol`.
# Assignment matches rows by index label.
# NOTE(review): assumes data_model rows are in the same order as data_pol -- confirm.
data_model['CityName'] = data_pol['CityName']
data_model['CityName'] = data_model['CityName'].astype('category')
data_model['date'] = data_pol['DateLocal']

# Calendar features derived from the date, stored as categoricals.
data_model['weekNo'] = data_model['date'].dt.weekofyear.astype('category')
data_model['monthNo'] = data_model['date'].dt.month.astype('category')

# Mean PM per (city, month) and per (city, week), broadcast back onto every row.
data_model['pm_mean_month'] = (
    data_model.groupby(['CityName', 'monthNo'])['PM'].transform('mean'))
data_model['pm_mean_weekNo'] = (
    data_model.groupby(['CityName', 'weekNo'])['PM'].transform('mean'))