# Project 4: 
# West Nile Virus Prediction
## Predict West Nile virus in mosquitos across the city of Chicago

In [1]:
# import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OneHotEncoder, OrdinalEncoder
from sklearn import metrics
from sklearn.metrics import mean_squared_error

## 4. Data Cleaning

#### a. Data Dictionary

In [2]:
# Read the raw Kaggle weather observations into a DataFrame.
weather = pd.read_csv(
    filepath_or_buffer='../DataSets/raw data (from Kaggle)/weather.csv',
)

In [3]:
# Preview the first five rows of the raw weather data.
weather.head(n=5)

Unnamed: 0,Station,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,...,CodeSum,Depth,Water1,SnowFall,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed
0,1,2007-05-01,83,50,67,14,51,56,0,2,...,,0,M,0.0,0.0,29.1,29.82,1.7,27,9.2
1,2,2007-05-01,84,52,68,M,51,57,0,3,...,,M,M,M,0.0,29.18,29.82,2.7,25,9.6
2,1,2007-05-02,59,42,51,-3,42,47,14,0,...,BR,0,M,0.0,0.0,29.38,30.09,13.0,4,13.4
3,2,2007-05-02,60,43,52,M,42,47,13,0,...,BR HZ,M,M,M,0.0,29.44,30.08,13.3,2,13.4
4,1,2007-05-03,66,46,56,2,40,48,9,0,...,,0,M,0.0,0.0,29.39,30.12,11.7,7,11.9


In [4]:
# List the column labels of the weather frame.
weather.columns

Index(['Station', 'Date', 'Tmax', 'Tmin', 'Tavg', 'Depart', 'DewPoint',
       'WetBulb', 'Heat', 'Cool', 'Sunrise', 'Sunset', 'CodeSum', 'Depth',
       'Water1', 'SnowFall', 'PrecipTotal', 'StnPressure', 'SeaLevel',
       'ResultSpeed', 'ResultDir', 'AvgSpeed'],
      dtype='object')

In [5]:
# Check the dtype pandas inferred for each column; measurement columns that
# come back as `object` hold non-numeric markers and need cleaning before use.
weather.dtypes

Station          int64
Date            object
Tmax             int64
Tmin             int64
Tavg            object
Depart          object
DewPoint         int64
WetBulb         object
Heat            object
Cool            object
Sunrise         object
Sunset          object
CodeSum         object
Depth           object
Water1          object
SnowFall        object
PrecipTotal     object
StnPressure     object
SeaLevel        object
ResultSpeed    float64
ResultDir        int64
AvgSpeed        object
dtype: object

#### Missing values

In [6]:
# Drop 'Water1', 'Depth' and 'SnowFall' from the weather frame.
# 'Water1' is 'M' (missing) in every previewed row; 'Depth' and 'SnowFall'
# are removed together with it.
weather = weather.drop(columns=['Water1', 'Depth', 'SnowFall'])

In [7]:
# Normalise the dataset's placeholder markers, then fill the gaps.
# 'M' and '-' become NaN so the forward fill can copy the value from the row above.
weather = weather.replace(['M', '-'], np.nan).ffill()
# Cells holding only a single blank are relabelled with the placeholder 'N'.
weather = weather.replace(' ', 'N')
# 'T' / '  T' entries in PrecipTotal are recorded as zero precipitation.
weather['PrecipTotal'] = weather['PrecipTotal'].replace(['T', '  T'], 0.00)

In [8]:
# Cast every column except the station id, the date and the weather codes to float64.
numericdata = [col for col in weather.columns if col not in ('Station', 'Date', 'CodeSum')]
weather = weather.astype({col: 'float64' for col in numericdata})

#### Averaging the readings of Station 1 and Station 2

In [9]:
# Collapse the per-station rows into a single row per date.
#
# Numeric readings are averaged across the stations. numeric_only=True keeps the
# string column 'CodeSum' out of the mean; without it pandas >= 2.0 raises a
# TypeError instead of silently dropping the column.
weather_consolidate = (
    weather.groupby('Date')
    .mean(numeric_only=True)
    .drop(columns='Station')
    .reset_index()
)

# Weather codes: gather the codes reported by every station on a date into one set.
# Keying the aggregation on 'Date' (rather than pairing station rows by position)
# keeps each set attached to the right day even if a station has no row for it.
codes_by_date = (
    weather.groupby('Date')['CodeSum']
    .agg(' '.join)
    .map(lambda codes: set(codes.split()))
)
# NOTE(review): the 'N' placeholder (no code reported) stays in the set even when
# the other station reported real codes, e.g. {'HZ', 'N'} -- confirm this is intended.
weather_consolidate['CodeSum'] = weather_consolidate['Date'].map(codes_by_date)

In [10]:
# Preview the first five consolidated (one row per date) records.
weather_consolidate.head()

Unnamed: 0,Date,Tmax,Tmin,Tavg,Depart,DewPoint,WetBulb,Heat,Cool,Sunrise,Sunset,PrecipTotal,StnPressure,SeaLevel,ResultSpeed,ResultDir,AvgSpeed,CodeSum
0,2007-05-01,83.5,51.0,67.5,14.0,51.0,56.5,0.0,2.5,448.0,1849.0,0.0,29.14,29.82,2.2,26.0,9.4,{N}
1,2007-05-02,59.5,42.5,51.5,-3.0,42.0,47.0,13.5,0.0,447.0,1850.0,0.0,29.41,30.085,13.15,3.0,13.4,"{BR, HZ}"
2,2007-05-03,66.5,47.0,57.0,2.0,40.0,49.0,8.0,0.0,446.0,1851.0,0.0,29.425,30.12,12.3,6.5,12.55,"{HZ, N}"
3,2007-05-04,72.0,50.0,58.0,4.0,41.5,50.0,7.0,0.0,444.0,1852.0,0.0,29.335,30.045,10.25,7.5,10.6,"{RA, N}"
4,2007-05-05,66.0,53.5,60.0,5.0,38.5,49.5,5.0,0.0,443.0,1853.0,0.0,29.43,30.095,11.45,7.0,11.75,{N}


In [11]:
# Persist the consolidated weather table as CSV, leaving out the row index.
# NOTE(review): this writes under '../Datasets/' while the raw file is read from
# '../DataSets/' -- on a case-sensitive filesystem these are different folders; confirm.
# NOTE(review): 'CodeSum' holds Python sets, which are written as their string
# representation -- readers of this file need to parse them back.
weather_consolidate.to_csv(path_or_buf='../Datasets/weather_clean.csv', index=False)