In [1]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import csv
import math
from sklearn import preprocessing

# Render figures, and save them, at 300 dpi.
plt.rcParams['savefig.dpi'] = 300 
plt.rcParams['figure.dpi'] = 300
# Apply the ggplot style sheet, then set the default figure size.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (10.0, 8.0)
# Seed NumPy's global random number generator so that
# results are reproducible from run to run.
np.random.seed(1)

# Show matplotlib figures inline, and load the autoreload extension in
# mode 2 (reload all modules before executing code).
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Weather

In [2]:
# Load the weather observations. The CSV carries a saved row index as an
# 'Unnamed: 0' column, which is discarded.
weather = (
    pd.read_csv('weatherData_0401to0501_clean.csv')
    .drop(columns=['Unnamed: 0'])
)
weather.head()

Unnamed: 0,id,station_id,time,weather,temperature,pressure,humidity,wind_speed,wind_direction
0,2050685,shunyi_meo,2018-04-01 17:00:00,Hail,16.5,1002.3,71.0,1.2,121.0
1,2050686,hadian_meo,2018-04-01 17:00:00,Hail,13.5,999.9,71.0,1.0,47.0
2,2050687,yanqing_meo,2018-04-01 17:00:00,Hail,12.2,949.6,49.0,0.7,65.0
3,2050688,miyun_meo,2018-04-01 17:00:00,Hail,13.5,997.5,75.0,0.4,70.0
4,2050689,huairou_meo,2018-04-01 17:00:00,Hail,16.2,997.2,69.0,1.0,1.0


In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of `weather`.
weather.describe()

Unnamed: 0,id,temperature,pressure,humidity,wind_speed,wind_direction
count,12274.0,12274.0,12274.0,12274.0,12274.0,12274.0
mean,2580181.0,14.700248,999.0626,50.420075,2.340863,167.547499
std,307516.7,6.880294,19.113281,24.59119,1.705213,105.20334
min,2050685.0,-2.6,944.2,6.0,0.0,0.0
25%,2315845.0,10.0,996.2,28.0,1.1,73.0
50%,2577116.0,14.9,1005.4,48.0,1.9,175.0
75%,2839251.0,19.9,1011.9,70.0,3.1,247.0
max,3118041.0,31.7,1028.3,99.0,13.2,360.0


In [4]:
# One-hot encode the categorical `weather` column: one 0/1 indicator column
# per distinct weather label.
weather_coded = pd.get_dummies(weather['weather'])
weather_coded.head()
# Not merged into `weather` here: the `weather` column is replaced by these
# indicator columns later, when `weather_done` is assembled.

Unnamed: 0,Cloudy,Hail,Light Rain,Overcast,Rain,Sleet,Sunny/clear,Thundershower
0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0
4,0,1,0,0,0,0,0,0


In [5]:
# Bin the wind bearing (degrees) into eight 45-degree-wide sectors coded
# 1.0 .. 8.0:
#   * sector 1 is centred on 0/360: bearings below 22.5 or from 337.5 upwards;
#   * sector k (k = 2..8) covers [22.5 + 45*(k-2), 22.5 + 45*(k-1)).
#
# The intervals are half-open so that every bearing receives a code,
# including values lying exactly on a sector boundary (22.5, 67.5, ...,
# 337.5). The computation is column-wise: no per-row lookups and no
# assumption that the row index is 0..n-1.
SECTOR_WIDTH = 45.0
HALF_SECTOR = SECTOR_WIDTH / 2

bearing = weather['wind_direction']
in_first_sector = (bearing < HALF_SECTOR) | (bearing >= 360.0 - HALF_SECTOR)

# For 22.5 <= bearing < 337.5 the shifted quotient lies in [1, 8), so
# floor(...) + 1 yields the codes 2..8. Missing bearings stay NaN because
# both comparisons above are False for NaN.
sector_code = np.floor((bearing + HALF_SECTOR) / SECTOR_WIDTH) + 1.0

# Single-column frame named 'wind_direction', holding float sector codes.
wind_direction_norm = sector_code.mask(in_first_sector, 1.0).to_frame('wind_direction')

In [6]:
# Number of rows that fall into each wind-direction sector code.
wind_direction_norm['wind_direction'].value_counts()

6.0    1871
2.0    1828
5.0    1744
3.0    1730
1.0    1666
8.0    1400
4.0    1026
7.0    1009
Name: wind_direction, dtype: int64

In [7]:
# One-hot encode the sector codes. The resulting column labels are the float
# codes themselves (1.0 .. 8.0), not strings.
wind_direction_coded = pd.get_dummies(wind_direction_norm['wind_direction'])
wind_direction_coded.head()
# Not merged into `weather` here: the `wind_direction` column is replaced by
# these indicator columns later, when `weather_done` is assembled.

Unnamed: 0,1.0,2.0,3.0,4.0,5.0,6.0,7.0,8.0
0,0,0,0,1,0,0,0,0
1,0,1,0,0,0,0,0,0
2,0,1,0,0,0,0,0,0
3,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0


In [8]:
# Min-max scale each continuous weather variable using fixed bounds (wider
# than the min/max observed in this file) instead of data-derived ones:
#   temperature: [-25, 45]   pressure: [935, 1045]
#   humidity:    [0, 100]    wind speed: [0, 16]
# Each result is a single-column DataFrame that keeps the original column name.
temperature_norm = weather[['temperature']].sub(-25).div(45 - (-25))
pressure_norm = weather[['pressure']].sub(935).div(1045 - 935)
humidity_norm = weather[['humidity']].div(100)
wind_speed_norm = weather[['wind_speed']].div(16)

In [9]:
# Summary statistics of the scaled temperature column.
temperature_norm.describe()

Unnamed: 0,temperature
count,12274.0
mean,0.567146
std,0.09829
min,0.32
25%,0.5
50%,0.57
75%,0.641429
max,0.81


In [10]:
# Summary statistics of the scaled pressure column.
pressure_norm.describe()

Unnamed: 0,pressure
count,12274.0
mean,0.582387
std,0.173757
min,0.083636
25%,0.556364
50%,0.64
75%,0.699091
max,0.848182


In [11]:
# Summary statistics of the scaled humidity column.
humidity_norm.describe()

Unnamed: 0,humidity
count,12274.0
mean,0.504201
std,0.245912
min,0.06
25%,0.28
50%,0.48
75%,0.7
max,0.99


In [12]:
# Summary statistics of the scaled wind-speed column.
wind_speed_norm.describe()

Unnamed: 0,wind_speed
count,12274.0
mean,0.146304
std,0.106576
min,0.0
25%,0.06875
50%,0.11875
75%,0.19375
max,0.825


In [13]:
# Assemble the processed weather table: remove every raw column that has a
# scaled or one-hot replacement, then append the replacements (aligned on the
# row index) in a fixed order. `weather` itself is left unmodified.
weather_done = (
    weather
    .drop(columns=['temperature', 'pressure', 'humidity',
                   'wind_speed', 'wind_direction', 'weather'])
    .join(temperature_norm)
    .join(pressure_norm)
    .join(humidity_norm)
    .join(wind_speed_norm)
    .join(wind_direction_coded)
    .join(weather_coded)
)

In [14]:
# Preview the first rows of the assembled weather table.
weather_done.head()

Unnamed: 0,id,station_id,time,temperature,pressure,humidity,wind_speed,1.0,2.0,3.0,...,7.0,8.0,Cloudy,Hail,Light Rain,Overcast,Rain,Sleet,Sunny/clear,Thundershower
0,2050685,shunyi_meo,2018-04-01 17:00:00,0.592857,0.611818,0.71,0.075,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,2050686,hadian_meo,2018-04-01 17:00:00,0.55,0.59,0.71,0.0625,0,1,0,...,0,0,0,1,0,0,0,0,0,0
2,2050687,yanqing_meo,2018-04-01 17:00:00,0.531429,0.132727,0.49,0.04375,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,2050688,miyun_meo,2018-04-01 17:00:00,0.55,0.568182,0.75,0.025,0,0,1,...,0,0,0,1,0,0,0,0,0,0
4,2050689,huairou_meo,2018-04-01 17:00:00,0.588571,0.565455,0.69,0.0625,1,0,0,...,0,0,0,1,0,0,0,0,0,0


In [15]:
# Write the processed weather table to disk. With default arguments the row
# index is written too, as a leading column with an empty header.
weather_done.to_csv('weather_norm.csv')

# Air

In [16]:
# Load the air-quality observations. The CSV carries a saved row index as an
# 'Unnamed: 0' column, which is discarded.
air = (
    pd.read_csv('airData_0401to0501_clean.csv')
    .drop(columns=['Unnamed: 0'])
)
air.head()

Unnamed: 0,id,station_id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
0,2943158,dongsi_aq,2018-04-01 17:00:00,145.0,208.0,102.0,1.0,44.0,14.0
1,2943159,tiantan_aq,2018-04-01 17:00:00,132.0,174.0,95.0,1.0,24.0,2.0
2,2943160,guanyuan_aq,2018-04-01 17:00:00,138.0,187.0,86.0,0.9,52.0,10.0
3,2943161,wanshouxigong_aq,2018-04-01 17:00:00,132.0,155.0,75.0,0.9,44.0,11.0
4,2943162,aotizhongxin_aq,2018-04-01 17:00:00,149.0,195.0,105.0,1.0,34.0,9.0


In [17]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns of `air`.
air.describe()

Unnamed: 0,id,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
count,22925.0,22925.0,22925.0,22925.0,22925.0,22925.0,22925.0
mean,2961962.0,60.628375,121.738713,42.723664,0.708203,77.163969,6.601701
std,10856.39,57.793019,96.622337,30.028965,0.497271,53.591873,7.589781
min,2943158.0,3.0,6.0,2.0,0.1,2.0,2.0
25%,2952586.0,18.5,49.0,20.0,0.3,38.0,2.0
50%,2961954.0,39.0,102.0,35.0,0.6,73.0,3.5
75%,2971399.0,90.0,176.0,60.0,0.9,102.0,9.0
max,2980767.0,396.0,2030.0,249.0,4.3,342.0,300.0


In [18]:
# Per column: True if that column of `air` contains at least one missing value.
air.isna().any()

id                    False
station_id            False
time                  False
PM25_Concentration    False
PM10_Concentration    False
NO2_Concentration     False
CO_Concentration      False
O3_Concentration      False
SO2_Concentration     False
dtype: bool

In [19]:
# Scale each pollutant concentration by a fixed per-pollutant divisor. Each
# result is a single-column DataFrame that keeps the original column name.
PM25_norm = air[['PM25_Concentration']].div(1000)
PM10_norm = air[['PM10_Concentration']].div(3000)
NO2_norm = air[['NO2_Concentration']].div(300)
CO_norm = air[['CO_Concentration']].div(15)
O3_norm = air[['O3_Concentration']].div(500)
SO2_norm = air[['SO2_Concentration']].div(300)

In [20]:
# Assemble the processed air-quality table: remove the raw concentration
# columns, then append their scaled versions (aligned on the row index) in
# the same order. `air` itself is left unmodified.
air_done = (
    air
    .drop(columns=['PM25_Concentration', 'PM10_Concentration',
                   'NO2_Concentration', 'CO_Concentration',
                   'O3_Concentration', 'SO2_Concentration'])
    .join(PM25_norm)
    .join(PM10_norm)
    .join(NO2_norm)
    .join(CO_norm)
    .join(O3_norm)
    .join(SO2_norm)
)

In [21]:
# Summary statistics of the processed air-quality table (scaled concentrations).
air_done.describe()

Unnamed: 0,id,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
count,22925.0,22925.0,22925.0,22925.0,22925.0,22925.0,22925.0
mean,2961962.0,0.060628,0.04058,0.142412,0.047214,0.154328,0.022006
std,10856.39,0.057793,0.032207,0.100097,0.033151,0.107184,0.025299
min,2943158.0,0.003,0.002,0.006667,0.006667,0.004,0.006667
25%,2952586.0,0.0185,0.016333,0.066667,0.02,0.076,0.006667
50%,2961954.0,0.039,0.034,0.116667,0.04,0.146,0.011667
75%,2971399.0,0.09,0.058667,0.2,0.06,0.204,0.03
max,2980767.0,0.396,0.676667,0.83,0.286667,0.684,1.0


In [22]:
# Preview the first rows of the processed air-quality table.
air_done.head()

Unnamed: 0,id,station_id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
0,2943158,dongsi_aq,2018-04-01 17:00:00,0.145,0.069333,0.34,0.066667,0.088,0.046667
1,2943159,tiantan_aq,2018-04-01 17:00:00,0.132,0.058,0.316667,0.066667,0.048,0.006667
2,2943160,guanyuan_aq,2018-04-01 17:00:00,0.138,0.062333,0.286667,0.06,0.104,0.033333
3,2943161,wanshouxigong_aq,2018-04-01 17:00:00,0.132,0.051667,0.25,0.06,0.088,0.036667
4,2943162,aotizhongxin_aq,2018-04-01 17:00:00,0.149,0.065,0.35,0.066667,0.068,0.03


In [23]:
# Write the processed air-quality table to disk. With default arguments the
# row index is written too, as a leading column with an empty header.
air_done.to_csv('air_norm.csv')

# Time

In [24]:
# Number of weather rows per station. Unequal counts mean that some stations
# have fewer rows than others.
weather_done['station_id'].value_counts()

xiayunling_meo     687
mentougou_meo      684
shijingshan_meo    683
beijing_meo        683
zhaitang_meo       683
yanqing_meo        683
pingchang_meo      683
shangdianzi_meo    682
pinggu_meo         682
huairou_meo        682
shunyi_meo         682
fangshan_meo       681
daxing_meo         681
chaoyang_meo       681
fengtai_meo        680
tongzhou_meo       680
hadian_meo         680
miyun_meo          677
Name: station_id, dtype: int64

In [25]:
# Group the processed weather rows by station, so that a single station's
# rows can be pulled out with get_group().
weather_station = weather_done.groupby('station_id')

In [26]:
# Rows of `weather_done` that belong to station 'hadian_meo'; the original
# row labels are kept.
hadian_meo = weather_station.get_group('hadian_meo')

In [27]:
# Preview the first rows for station 'hadian_meo'.
hadian_meo.head()

Unnamed: 0,id,station_id,time,temperature,pressure,humidity,wind_speed,1.0,2.0,3.0,...,7.0,8.0,Cloudy,Hail,Light Rain,Overcast,Rain,Sleet,Sunny/clear,Thundershower
1,2050686,hadian_meo,2018-04-01 17:00:00,0.55,0.59,0.71,0.0625,0,1,0,...,0,0,0,1,0,0,0,0,0,0
19,2052219,hadian_meo,2018-04-01 18:00:00,0.568571,0.588182,0.64,0.08125,0,1,0,...,0,0,0,1,0,0,0,0,0,0
37,2053752,hadian_meo,2018-04-01 19:00:00,0.571429,0.587273,0.69,0.10625,0,1,0,...,0,0,0,1,0,0,0,0,0,0
55,2055285,hadian_meo,2018-04-01 20:00:00,0.55,0.584545,0.77,0.0625,0,1,0,...,0,0,0,1,0,0,0,0,0,0
73,2056818,hadian_meo,2018-04-01 21:00:00,0.544286,0.586364,0.82,0.05625,0,1,0,...,0,0,0,1,0,0,0,0,0,0


In [28]:
# Number of air-quality rows per station.
air_done['station_id'].value_counts()

dingling_aq          655
yungang_aq           655
gucheng_aq           655
yongdingmennei_aq    655
xizhimenbei_aq       655
mentougou_aq         655
yanqin_aq            655
miyunshuiku_aq       655
daxing_aq            655
wanliu_aq            655
miyun_aq             655
qianmen_aq           655
yongledian_aq        655
beibuxinqu_aq        655
dongsihuan_aq        655
pingchang_aq         655
huairou_aq           655
dongsi_aq            655
fangshan_aq          655
nongzhanguan_aq      655
pinggu_aq            655
aotizhongxin_aq      655
tiantan_aq           655
tongzhou_aq          655
badaling_aq          655
guanyuan_aq          655
yufa_aq              655
shunyi_aq            655
liulihe_aq           655
nansanhuan_aq        655
yizhuang_aq          655
zhiwuyuan_aq         655
fengtaihuayuan_aq    655
donggaocun_aq        655
wanshouxigong_aq     655
Name: station_id, dtype: int64

In [29]:
# Put the 'hadian_meo' weather series on a complete hourly grid and fill the
# hours for which the station has no row.
weather_station = weather_done.groupby('station_id')
hadian_meo = weather_station.get_group('hadian_meo')
times = pd.date_range('2018-04-01 17:00:00', '2018-05-01 00:00:00', freq = 'H')
hadian_meo.index = pd.DatetimeIndex(hadian_meo['time'])
# Grid hours without an observation become all-NaN rows.
hadian_meo_time = hadian_meo.reindex(times)

# Fill each kind of column in the way that suits it. A single frame-wide
# interpolate() would blend the 0/1 indicator columns (and `id`) into
# fractional values, and would leave `station_id` / `time` missing on the
# inserted rows.
measured_cols = ['temperature', 'pressure', 'humidity', 'wind_speed']
label_cols = ['id', 'station_id', 'time']
indicator_cols = [col for col in hadian_meo_time.columns
                  if col not in measured_cols and col not in label_cols]

hadian_meo_time_ok = hadian_meo_time.copy()
# Continuous measurements: linear interpolation between neighbouring hours.
hadian_meo_time_ok[measured_cols] = hadian_meo_time[measured_cols].interpolate()
# One-hot indicators: carry the last observed category forward (and back-fill
# a gap at the very start) so that every row stays a valid 0/1 encoding.
hadian_meo_time_ok[indicator_cols] = hadian_meo_time[indicator_cols].ffill().bfill()
# Labels: the station is constant; `time` is rebuilt from the grid itself for
# inserted rows. Assumes `time` strings use the 'YYYY-MM-DD HH:MM:SS' layout
# seen in the data preview - confirm if the source format changes.
hadian_meo_time_ok['station_id'] = 'hadian_meo'
hadian_meo_time_ok['time'] = hadian_meo_time['time'].fillna(
    pd.Series(times.strftime('%Y-%m-%d %H:%M:%S'), index=times))
# `id` is deliberately left missing on inserted rows: no source record exists.

In [30]:
# Preview the first rows of the gap-filled hourly series for 'hadian_meo'.
hadian_meo_time_ok.head()

Unnamed: 0,id,station_id,time,temperature,pressure,humidity,wind_speed,1.0,2.0,3.0,...,7.0,8.0,Cloudy,Hail,Light Rain,Overcast,Rain,Sleet,Sunny/clear,Thundershower
2018-04-01 17:00:00,2050686.0,hadian_meo,2018-04-01 17:00:00,0.55,0.59,0.71,0.0625,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-04-01 18:00:00,2052219.0,hadian_meo,2018-04-01 18:00:00,0.568571,0.588182,0.64,0.08125,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-04-01 19:00:00,2053752.0,hadian_meo,2018-04-01 19:00:00,0.571429,0.587273,0.69,0.10625,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-04-01 20:00:00,2055285.0,hadian_meo,2018-04-01 20:00:00,0.55,0.584545,0.77,0.0625,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2018-04-01 21:00:00,2056818.0,hadian_meo,2018-04-01 21:00:00,0.544286,0.586364,0.82,0.05625,0.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Put the 'wanliu_aq' air-quality series on a complete hourly grid and fill
# the hours for which the station has no row.
air_station = air_done.groupby('station_id')
wanliu_aq = air_station.get_group('wanliu_aq')
# NOTE(review): this grid has 505 hourly slots (21 days + 1 hour), while the
# station has 655 rows in `air_done`, so reindexing drops at least 150
# observations. Confirm that ending on 2018-04-22 17:00 is intentional.
# This also rebinds `times`, which the weather cell set to a different range.
times = pd.date_range('2018-04-01 17:00:00', '2018-04-22 17:00:00', freq = 'H')
wanliu_aq.index = pd.DatetimeIndex(wanliu_aq['time'])
# Grid hours without an observation become all-NaN rows.
wanliu_aq_time = wanliu_aq.reindex(times)

# Interpolate only the pollutant measurements. A single frame-wide
# interpolate() would also fabricate fractional `id` values and would leave
# `station_id` / `time` missing on the inserted rows.
pollutant_cols = ['PM25_Concentration', 'PM10_Concentration', 'NO2_Concentration',
                  'CO_Concentration', 'O3_Concentration', 'SO2_Concentration']

wanliu_aq_time_ok = wanliu_aq_time.copy()
wanliu_aq_time_ok[pollutant_cols] = wanliu_aq_time[pollutant_cols].interpolate()
# Labels: the station is constant; `time` is rebuilt from the grid itself for
# inserted rows. Assumes `time` strings use the 'YYYY-MM-DD HH:MM:SS' layout
# seen in the data preview - confirm if the source format changes.
wanliu_aq_time_ok['station_id'] = 'wanliu_aq'
wanliu_aq_time_ok['time'] = wanliu_aq_time['time'].fillna(
    pd.Series(times.strftime('%Y-%m-%d %H:%M:%S'), index=times))
# `id` is deliberately left missing on inserted rows: no source record exists.

In [32]:
# Preview the first rows of the gap-filled hourly series for 'wanliu_aq'.
wanliu_aq_time_ok.head()

Unnamed: 0,id,station_id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration
2018-04-01 17:00:00,2943164.0,wanliu_aq,2018-04-01 17:00:00,0.167,0.074333,0.446667,0.066667,0.026,0.026667
2018-04-01 18:00:00,2943220.0,wanliu_aq,2018-04-01 18:00:00,0.172,0.072333,0.486667,0.073333,0.004,0.02
2018-04-01 19:00:00,2943276.0,wanliu_aq,2018-04-01 19:00:00,0.215,0.086667,0.443333,0.1,0.014,0.02
2018-04-01 20:00:00,2943332.0,wanliu_aq,2018-04-01 20:00:00,0.224,0.088333,0.41,0.106667,0.004,0.02
2018-04-01 21:00:00,2943388.0,wanliu_aq,2018-04-01 21:00:00,0.236,0.083667,0.35,0.106667,0.004,0.016667
