# Project 4!!

In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import roc_auc_score,confusion_matrix,classification_report
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

%matplotlib inline

# Widen pandas' display limits so long tables and wide frames are shown in full
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

In [2]:
# Read the train, test and weather tables from disk
train_df, test_df, weather_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../dataset/train.csv',
        '../dataset/test.csv',
        '../dataset/weather_final.csv',
    )
)

In [3]:
# Report (rows, columns) for the train and test frames, in that order
for frame in (train_df, test_df):
    print(frame.shape)

(10506, 12)
(116293, 11)


In [4]:
# List the column labels of the train and test frames, in that order
for frame in (train_df, test_df):
    print(frame.columns)

Index(['Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy',
       'NumMosquitos', 'WnvPresent'],
      dtype='object')
Index(['Id', 'Date', 'Address', 'Species', 'Block', 'Street', 'Trap',
       'AddressNumberAndStreet', 'Latitude', 'Longitude', 'AddressAccuracy'],
      dtype='object')


In [5]:
# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent
0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0
2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0
3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0
4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0


In [6]:
# Preview the first five test rows
test_df.head(n=5)

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9


In [7]:
# Count training rows that are exact repeats of an earlier row (all columns equal)
train_dupes = train_df.loc[train_df.duplicated()]
print(train_dupes.count())

Date                      813
Address                   813
Species                   813
Block                     813
Street                    813
Trap                      813
AddressNumberAndStreet    813
Latitude                  813
Longitude                 813
AddressAccuracy           813
NumMosquitos              813
WnvPresent                813
dtype: int64


In [8]:
# Count test rows that are exact repeats of an earlier row (all columns equal)
test_dupes = test_df.loc[test_df.duplicated()]
print(test_dupes.count())

Id                        0
Date                      0
Address                   0
Species                   0
Block                     0
Street                    0
Trap                      0
AddressNumberAndStreet    0
Latitude                  0
Longitude                 0
AddressAccuracy           0
dtype: int64


In [9]:
# Sum the numeric columns per (Date, Address, Species, WnvPresent) group.
# numeric_only=True keeps the result identical across pandas versions: from pandas 2.0
# the default would also "sum" (concatenate) the string columns.
train_df.groupby(by=['Date','Address','Species','WnvPresent']).sum(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Block,Latitude,Longitude,AddressAccuracy,NumMosquitos
Date,Address,Species,WnvPresent,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX PIPIENS/RESTUANS,0,11,41.867108,-87.654224,8,1
2007-05-29,"1100 Roosevelt Road, Chicago, IL 60608, USA",CULEX RESTUANS,0,11,41.867108,-87.654224,8,2
2007-05-29,"1100 South Peoria Street, Chicago, IL 60608, USA",CULEX RESTUANS,0,11,41.862292,-87.648860,8,1
2007-05-29,"1100 West Chicago Avenue, Chicago, IL 60642, USA",CULEX RESTUANS,0,11,41.896282,-87.655232,8,1
2007-05-29,"1500 North Long Avenue, Chicago, IL 60651, USA",CULEX RESTUANS,0,15,41.907645,-87.760886,8,1
2007-05-29,"1500 West Webster Avenue, Chicago, IL 60614, USA",CULEX RESTUANS,0,15,41.921600,-87.666455,8,2
2007-05-29,"1700 West 95th Street, Chicago, IL 60643, USA",CULEX RESTUANS,0,17,41.720848,-87.666014,9,3
2007-05-29,"2100 North Stave Street, Chicago, IL 60647, USA",CULEX PIPIENS/RESTUANS,0,21,41.919343,-87.694259,8,1
2007-05-29,"2200 North Cannon Drive, Chicago, IL 60614, USA",CULEX PIPIENS/RESTUANS,0,22,41.921965,-87.632085,8,2
2007-05-29,"2200 North Cannon Drive, Chicago, IL 60614, USA",CULEX RESTUANS,0,22,41.921965,-87.632085,8,3


In [10]:
# Keep the first row per (Date, Address, Species, Trap, Block, WnvPresent) combination.
# reset_index() keeps the old row labels as an 'index' column; it is removed again
# after the weather merge.
dedup_keys = ['Date','Address','Species','Trap','Block','WnvPresent']
train_df = train_df.drop_duplicates(subset=dedup_keys).reset_index()

In [11]:
# (rows, columns) of the training frame after de-duplication
train_df.shape

(8610, 13)

In [12]:
# Species that appear in at least one WNV-positive record
train_df.loc[train_df['WnvPresent'].eq(1), 'Species'].unique()

array(['CULEX PIPIENS/RESTUANS', 'CULEX PIPIENS', 'CULEX RESTUANS'],
      dtype=object)

In [13]:
# Species that appear in WNV-negative records, to compare against the positive list
train_df.loc[train_df['WnvPresent'].eq(0), 'Species'].unique()

array(['CULEX PIPIENS/RESTUANS', 'CULEX RESTUANS', 'CULEX PIPIENS',
       'CULEX SALINARIUS', 'CULEX TERRITANS', 'CULEX TARSALIS',
       'CULEX ERRATICUS'], dtype=object)

In [14]:
# # Find probability of occurrence for each mosquito
# prob_1 = train_df[train_df['Species'] == 'CULEX PIPIENS/RESTUANS']['WnvPresent'].sum()/train_df[train_df['Species'] == 'CULEX PIPIENS/RESTUANS']['WnvPresent'].count()
# prob_2 = train_df[train_df['Species'] == 'CULEX PIPIENS']['WnvPresent'].sum()/train_df[train_df['Species'] == 'CULEX PIPIENS']['WnvPresent'].count()
# prob_3 = train_df[train_df['Species'] == 'CULEX RESTUANS']['WnvPresent'].sum()/train_df[train_df['Species'] == 'CULEX RESTUANS']['WnvPresent'].count()

In [15]:
# # Ordinal encode species
# ord_list = []
# for i in train_df['Species']:
#     if i == 'CULEX PIPIENS/RESTUANS':
#         ord_list.append(prob_1)
#     elif i == 'CULEX PIPIENS':
#         ord_list.append(prob_2)
#     elif i == 'CULEX RESTUANS':
#         ord_list.append(prob_3)
#     else:
#         ord_list.append(0)

# train_df['Species_Enc'] = ord_list

In [16]:
# # Ordinal encode species
# ord_list = []
# for i in test_df['Species']:
#     if i == 'CULEX PIPIENS/RESTUANS':
#         ord_list.append(2)
#     elif i == 'CULEX PIPIENS':
#         ord_list.append(3)
#     elif i == 'CULEX RESTUANS':
#         ord_list.append(1)
#     else:
#         ord_list.append(0)

# test_df['Species_Enc'] = ord_list

In [17]:
# One-hot encode Species, keeping only the three species seen with WNV in training
wnv_species = ['CULEX PIPIENS/RESTUANS','CULEX PIPIENS','CULEX RESTUANS']
train_species = pd.get_dummies(train_df['Species']).loc[:, wnv_species]
test_species = pd.get_dummies(test_df['Species']).loc[:, wnv_species]

In [18]:
# Append the species indicator columns side by side (rows are aligned on the index)
train_df = pd.concat([train_df, train_species], axis='columns', sort=False)
test_df = pd.concat([test_df, test_species], axis='columns', sort=False)

In [19]:
# Assign every training record to its nearest weather station.
# Distance is the straight-line (Pythagorean) distance in degrees of latitude/longitude,
# i.e. an approximation that is only used to pick between the two stations.
# Station 1: CHICAGO O'HARE INTERNATIONAL AIRPORT Lat: 41.995 Lon: -87.933 Elev: 662 ft. above sea level
# Station 2: CHICAGO MIDWAY INTL ARPT Lat: 41.786 Lon: -87.752 Elev: 612 ft. above sea level
train_df['diststat1'] = np.sqrt((train_df['Latitude'] - 41.995) ** 2 + (train_df['Longitude'] - (-87.933)) ** 2)
train_df['diststat2'] = np.sqrt((train_df['Latitude'] - 41.786) ** 2 + (train_df['Longitude'] - (-87.752)) ** 2)
# Vectorised comparison: avoids a Python-level loop with label lookups, and no longer
# depends on the index being exactly 0..n-1. Ties go to station 1, as before.
train_df['Station'] = np.where(train_df['diststat1'] > train_df['diststat2'], 2, 1)
train_df.head()


Unnamed: 0,index,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,NumMosquitos,WnvPresent,CULEX PIPIENS/RESTUANS,CULEX PIPIENS,CULEX RESTUANS,diststat1,diststat2,Station
0,0,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,1,0,0,0.138026,0.17566,1
1,1,2007-05-29,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0,0,1,0.138026,0.17566,1
2,2,2007-05-29,"6200 North Mandell Avenue, Chicago, IL 60646, USA",CULEX RESTUANS,62,N MANDELL AVE,T007,"6200 N MANDELL AVE, Chicago, IL",41.994991,-87.769279,9,1,0,0,0,1,0.163721,0.209704,1
3,3,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX PIPIENS/RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,1,0,1,0,0,0.11019,0.201691,1
4,4,2007-05-29,"7900 West Foster Avenue, Chicago, IL 60656, USA",CULEX RESTUANS,79,W FOSTER AVE,T015,"7900 W FOSTER AVE, Chicago, IL",41.974089,-87.824812,8,4,0,0,0,1,0.11019,0.201691,1


In [20]:
# Assign every test record to its nearest weather station, using the same
# straight-line distance (in degrees) and station coordinates as for the training data.
# Station 1 (O'Hare): Lat 41.995, Lon -87.933; Station 2 (Midway): Lat 41.786, Lon -87.752

test_df['diststat1'] = np.sqrt((test_df['Latitude'] - 41.995) ** 2 + (test_df['Longitude'] - (-87.933)) ** 2)
test_df['diststat2'] = np.sqrt((test_df['Latitude'] - 41.786) ** 2 + (test_df['Longitude'] - (-87.752)) ** 2)
# Vectorised comparison instead of a per-row Python loop; ties go to station 1, as before.
test_df['Station'] = np.where(test_df['diststat1'] > test_df['diststat2'], 2, 1)
test_df.head()

Unnamed: 0,Id,Date,Address,Species,Block,Street,Trap,AddressNumberAndStreet,Latitude,Longitude,AddressAccuracy,CULEX PIPIENS/RESTUANS,CULEX PIPIENS,CULEX RESTUANS,diststat1,diststat2,Station
0,1,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS/RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,1,0,0,0.138026,0.17566,1
1,2,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX RESTUANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,0,1,0.138026,0.17566,1
2,3,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX PIPIENS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,1,0,0.138026,0.17566,1
3,4,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX SALINARIUS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,0,0,0.138026,0.17566,1
4,5,2008-06-11,"4100 North Oak Park Avenue, Chicago, IL 60634,...",CULEX TERRITANS,41,N OAK PARK AVE,T002,"4100 N OAK PARK AVE, Chicago, IL",41.95469,-87.800991,9,0,0,0,0.138026,0.17566,1


In [21]:
# Add the day-of-year (1-366) of each record's date to both frames
for frame in (train_df, test_df):
    parsed_dates = pd.to_datetime(frame['Date'], format='%Y-%m-%d')
    frame['dateofyear'] = parsed_dates.dt.dayofyear

In [22]:
# Attach the weather readings of the matching station and date to each training row
# (inner join), drop the 'index' column left over from reset_index(), and save a copy.
train_weather_df = (
    train_df
    .merge(weather_df, on=['Station','Date'])
    .drop(columns=['index'])
)
train_weather_df.to_csv('../dataset/train_weather.csv')

In [23]:
# Attach the weather readings of the matching station and date to each test row.
# NOTE(review): this is an inner join, so any test row without a weather match would be
# dropped and the submission would come up short -- confirm row counts match test_df.
test_weather_df = pd.merge(test_df,weather_df,on=['Station','Date'])
# test_weather_df.drop(axis=1,columns=['Water1','Depth'],inplace=True)
# Save the merged *test* frame (the training frame used to be written to this path by mistake).
test_weather_df.to_csv('../dataset/test_weather.csv')

In [24]:
# Inspect the dtype of every column in the merged training frame
train_weather_df.dtypes

Date                       object
Address                    object
Species                    object
Block                       int64
Street                     object
Trap                       object
AddressNumberAndStreet     object
Latitude                  float64
Longitude                 float64
AddressAccuracy             int64
NumMosquitos                int64
WnvPresent                  int64
CULEX PIPIENS/RESTUANS      uint8
CULEX PIPIENS               uint8
CULEX RESTUANS              uint8
diststat1                 float64
diststat2                 float64
Station                     int64
dateofyear                  int64
Tmax                        int64
Tmin                        int64
Tavg                      float64
Depart                    float64
DewPoint                    int64
WetBulb                   float64
Heat                      float64
Cool                      float64
Sunrise                     int64
Sunset                      int64
CodeSum       

In [25]:
# Absolute correlation of each numeric column with WnvPresent, strongest first (top 31).
# Numeric/boolean columns are selected explicitly: pandas >= 2.0 no longer silently skips
# string columns in corr() and raises instead.
numeric_cols = train_weather_df.select_dtypes(include=['number', 'bool'])
np.abs(numeric_cols.corr()[['WnvPresent']]).sort_values(by='WnvPresent',ascending=False).head(31).T

Unnamed: 0,WnvPresent,NumMosquitos,dateofyear,Month,Sunrise,CULEX PIPIENS,CULEX RESTUANS,DewPoint,WetBulb,Station,Tmin,Longitude,Tavg,Cool,Depart,diststat1,Sunset,Tmax,Heat,ResultSpeed,Year,AvgSpeed,diststat2,Latitude,PrecipTotal,CULEX PIPIENS/RESTUANS,Block,AddressAccuracy,StnPressure,SeaLevel,ResultDir
WnvPresent,1.0,0.298589,0.100332,0.096551,0.096179,0.095826,0.09217,0.08854,0.087747,0.080604,0.078567,0.074075,0.07106,0.067323,0.063704,0.061033,0.05857,0.055976,0.054174,0.048979,0.042496,0.034714,0.033516,0.030907,0.021415,0.018785,0.011303,0.007886,0.007118,0.006683,0.001152


In [26]:
# # Get the correlation

# corr = train_weather_df.corr()

# # Generate a mask for the upper triangle
# mask = np.zeros_like(corr, dtype=np.bool)
# mask[np.triu_indices_from(mask)] = True

# # Set up the matplotlib figure
# f, ax = plt.subplots(figsize=(20, 20))

# # Generate a custom diverging colormap
# cmap = sns.diverging_palette(220, 10, as_cmap=True)

# # Draw the heatmap with the mask and correct aspect ratio
# sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,annot=True,
#             square=True, linewidths=.5, cbar_kws={"shrink": .5}).set_title('Correlation')

In [27]:
# Base predictors: date, location, species indicators and weather readings
feat = [
    'dateofyear', 'Latitude', 'Longitude', 'AddressAccuracy',
    'CULEX PIPIENS/RESTUANS', 'CULEX PIPIENS', 'CULEX RESTUANS',
    'Heat', 'Cool', 'WetBulb', 'PrecipTotal',
    'Sunrise', 'Sunset', 'Tmin', 'Tmax', 'DewPoint',
]

# Same columns from both frames; the target only exists in the training frame
X_subset = train_weather_df.loc[:, feat]
y = train_weather_df.loc[:, 'WnvPresent']
X_kaggle_subset = test_weather_df.loc[:, feat]

In [28]:
# Preview the first five rows of the selected predictors
X_subset.head(n=5)

Unnamed: 0,dateofyear,Latitude,Longitude,AddressAccuracy,CULEX PIPIENS/RESTUANS,CULEX PIPIENS,CULEX RESTUANS,Heat,Cool,WetBulb,PrecipTotal,Sunrise,Sunset,Tmin,Tmax,DewPoint
0,149,41.95469,-87.800991,9,1,0,0,0.0,9.0,65.0,0.0,421,1917,60,88,58
1,149,41.95469,-87.800991,9,0,0,1,0.0,9.0,65.0,0.0,421,1917,60,88,58
2,149,41.994991,-87.769279,9,0,0,1,0.0,9.0,65.0,0.0,421,1917,60,88,58
3,149,41.974089,-87.824812,8,1,0,0,0.0,9.0,65.0,0.0,421,1917,60,88,58
4,149,41.974089,-87.824812,8,0,0,1,0.0,9.0,65.0,0.0,421,1917,60,88,58


In [29]:
# Expand the predictors with pairwise interaction terms (no squared terms, no bias column)
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_train_poly = poly.fit_transform(X_subset)
# Reuse the transformer fitted on the training data; the Kaggle set must only be
# transformed, never used to re-fit a preprocessing step.
X_kaggle_poly = poly.transform(X_kaggle_subset)

In [30]:
# Wrap the expanded arrays in DataFrames with readable column names ("a b" for a*b).
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# use whichever the installed version provides.
_poly_names = getattr(poly, 'get_feature_names_out', None) or poly.get_feature_names
poly_train = pd.DataFrame(X_train_poly, columns=_poly_names(X_subset.columns))
poly_kaggle = pd.DataFrame(X_kaggle_poly, columns=_poly_names(X_kaggle_subset.columns))

In [31]:
# Add the target next to the expanded features so correlations can be computed
poly_train = poly_train.assign(WnvPresent=train_weather_df['WnvPresent'])

In [32]:
# Absolute correlation of each expanded feature with WnvPresent, strongest first (top 31)
poly_target_corr = poly_train.corr()[['WnvPresent']].abs()
poly_target_corr.sort_values(by='WnvPresent', ascending=False).head(31).T

Unnamed: 0,WnvPresent,WetBulb Sunrise,dateofyear DewPoint,Sunrise DewPoint,dateofyear WetBulb,Sunrise Tmin,dateofyear Tmin,Sunrise Sunset,dateofyear Tmax,Sunrise Tmax,dateofyear Sunset,CULEX PIPIENS Cool,CULEX PIPIENS DewPoint,CULEX PIPIENS Tmin,CULEX PIPIENS WetBulb,dateofyear Latitude,dateofyear Longitude,dateofyear,CULEX PIPIENS Tmax,CULEX PIPIENS Sunrise,dateofyear CULEX PIPIENS,Latitude Sunrise,Longitude Sunrise,Sunrise,Latitude CULEX PIPIENS,Longitude CULEX PIPIENS,CULEX PIPIENS Sunset,CULEX PIPIENS,AddressAccuracy CULEX PIPIENS,dateofyear Sunrise,CULEX RESTUANS Sunset
WnvPresent,1.0,0.142523,0.141273,0.139731,0.139305,0.137021,0.13653,0.120361,0.118866,0.115865,0.115044,0.105437,0.105344,0.103297,0.102892,0.100957,0.100865,0.100332,0.098708,0.097986,0.097448,0.097054,0.096952,0.096179,0.095906,0.095871,0.095829,0.095826,0.093898,0.092736,0.092481


In [33]:
# feature_list = [
#                 'dateofyear CULEX PIPIENS/RESTUANS',
#                 'dateofyear CULEX PIPIENS',
#                 'dateofyear CULEX RESTUANS',
#                 'Latitude CULEX PIPIENS/RESTUANS',
#                 'Latitude CULEX PIPIENS',
#                 'Latitude CULEX RESTUANS',
#                 'Longitude CULEX PIPIENS/RESTUANS', 
#                 'Longitude CULEX PIPIENS',
#                 'Longitude CULEX RESTUANS',              
#                'WetBulb','Sunrise','Sunset','Tmin','Tmax','DewPoint']

# feature_list = ['WetBulb Sunrise',
#                'dateofyear DewPoint',
#                'Sunrise DewPoint',
#                'Sunrise Tmin',
#                'dateofyear Tmin',
#                'Sunrise Sunset',
#                'dateofyear Tmax',
#                'Sunrise Tmax',
#                'dateofyear Sunset']

# Final model inputs: species indicators interacted with day of year, sunrise time and
# longitude, plus two sunrise-based terms.
feature_list = [
    'dateofyear CULEX PIPIENS/RESTUANS', 'dateofyear CULEX PIPIENS', 'dateofyear CULEX RESTUANS',
    'CULEX PIPIENS/RESTUANS Sunrise', 'CULEX PIPIENS Sunrise', 'CULEX RESTUANS Sunrise',
    'Longitude CULEX PIPIENS/RESTUANS', 'Longitude CULEX PIPIENS', 'Longitude CULEX RESTUANS',
    'WetBulb Sunrise', 'Sunrise',
]

# The target y is reused from the feature-subset cell (train_weather_df['WnvPresent'])
X = poly_train.loc[:, feature_list]
X_kaggle = poly_kaggle.loc[:, feature_list]

In [67]:
# Short key -> unfitted estimator/transformer instance used to build pipeline steps.
# NOTE(review): grid_search always puts 'ss' (StandardScaler) in front of the model;
# MultinomialNB is expected to reject the negative values this produces -- confirm before using 'nb'.
model_dict = dict(
    ss=StandardScaler(),
    lr=LogisticRegression(solver='lbfgs'),
    nb=MultinomialNB(),
    knn=KNeighborsClassifier(),
    dt=DecisionTreeClassifier(),
    rf=RandomForestClassifier(),
    et=ExtraTreesClassifier(),
    ada_dt=AdaBoostClassifier(),
    ada_rf=AdaBoostClassifier(base_estimator=RandomForestClassifier()),
    gboost=GradientBoostingClassifier(),
)

# Short key -> human-readable name printed in the grid-search report
model_full = dict(
    ss='Standard Scaler',
    lr='Logistic Regression',
    knn='KNearestNeighbor',
    nb='Multinomial NB',
    dt='Decision Tree',
    rf='Random Forest',
    et='Extra Tree',
    ada_dt='AdaBoost - Decision Tree',
    ada_rf='AdaBoost - Random Forest',
    gboost='Gradient Boosting Classifier',
)

# Short model key -> GridSearchCV parameter grid.
# Grid keys follow the pipeline convention '<step name>__<parameter>'.
param_dict = dict(
    knn={'knn__n_neighbors': [2, 3, 4, 5]},
    lr={'lr__max_iter': [100, 200]},
    nb={},
    dt={
        'dt__max_depth': [5, 7],
        'dt__min_samples_split': [10, 15],
        'dt__min_samples_leaf': [3, 4],
    },
    rf={
        'rf__n_estimators': [500, 1000, 2000],
        'rf__min_samples_split': [2, 3],
        'rf__max_depth': [2, 3],
        'rf__min_samples_leaf': [3, 4],
    },
    et={
        'et__n_estimators': [1000, 2000],
        'et__min_samples_split': [2, 3],
    },
    ada_dt={
        'ada_dt__n_estimators': [50, 100, 200],
        'ada_dt__learning_rate': [0.9, 1],
    },
    ada_rf={
        'ada_rf__n_estimators': [50, 100, 200],
        'ada_rf__learning_rate': [0.9, 1],
        'ada_rf__base_estimator__max_depth': [3],
        'ada_rf__base_estimator__min_samples_leaf': [4],
        'ada_rf__base_estimator__min_samples_split': [2],
        'ada_rf__base_estimator__n_estimators': [1000],
    },
    gboost={
        'gboost__n_estimators': [50, 100],
        'gboost__max_depth': [2, 3, 4],
        'gboost__learning_rate': [0.1, 0.5, 1],
    },
)

def prepare_pipeline(list_of_models):
    """
    Build a Pipeline whose steps are looked up by key in model_dict.

    Parameters
    ----------
    list_of_models: list[str]
        Keys of model_dict, in the order the steps should run.
        Each key also becomes the name of its pipeline step.

    Returns
    -------
    Pipeline
        Pipeline made of (key, model_dict[key]) steps
    """
    steps = []
    for step_name in list_of_models:
        steps.append((step_name, model_dict[step_name]))
    return Pipeline(steps)

def add_params(name,pipe_dict):
    """
    Copy the grid-search parameters of one model into an existing dictionary

    Parameters
    ----------
    name: str
        Key of the model in param_dict whose parameter grid should be added.
    pipe_dict: Dictionary
        Dictionary to extend; it is modified in place and existing keys are overwritten.

    Returns
    -------
    Dictionary
        The same pipe_dict object, now including the parameters for `name`
    """
    pipe_dict.update(param_dict[name])
    return pipe_dict

def grid_search(model,train_data=X,train_target=y):
    """
    Tune one classifier with GridSearchCV and print its ROC AUC scores

    The data is split 75/25 (stratified on the target, random_state=42). A pipeline of
    StandardScaler followed by the chosen model is tuned on the 75% part with 3-fold
    cross-validation, then scored on the held-out 25%.

    Parameters
    ----------
    model: str
        Key of the classifier to tune. It must exist in model_dict, model_full and param_dict.

    train_data: DataFrame or array-like
        Feature matrix. Defaults to the notebook-level X as it was when this cell ran.

    train_target: Series or array-like
        Target aligned with train_data. Defaults to the notebook-level y as it was when this cell ran.

    Returns
    -------
    None
        Results are printed: "Train Score" is the best mean cross-validated ROC AUC,
        "Test Score" is the ROC AUC on the held-out split.
    """
    # Split and stratify on the target that was passed in (previously the global y was
    # always used, so a custom train_target was silently ignored).
    X_train, X_test, y_train, y_test = train_test_split(
        train_data, train_target, test_size=0.25, stratify=train_target, random_state=42
    )
    pipe_params = add_params(model, {})
    pipe = prepare_pipeline(['ss', model])
    gs = GridSearchCV(pipe, param_grid=pipe_params, cv=3, n_jobs=-1, scoring='roc_auc')
    gs.fit(X_train, y_train)
    print(f'Using {model_full[model]}:')
    print(f'Train Score: {round(gs.best_score_,4)}')
    print(f'Test Score: {round(gs.score(X_test,y_test),4)}')
    print(f'Using the following parameters: {gs.best_params_}')


In [79]:
## Fit on the full training data, score the Kaggle set and store the submission as csv
def predict_kaggle(model,output,X=X,y=y,X_kaggle=X_kaggle):
    """
    Fit a model on all training data and write a Kaggle submission file

    Parameters
    ----------
    model: estimator
        Unfitted classifier providing fit() and predict_proba().
    output: str
        File name (without extension) of the csv written to ../KaggleSubmission/.
    X, y:
        Training features and target. Default to the notebook-level X and y.
    X_kaggle:
        Features to predict on. Rows must line up with test_weather_df, which supplies the Id column.

    Returns
    -------
    None
        The submission holds the Id and the predicted probability of the positive class.
    """
    model.fit(X, y)
    # Column 1 of predict_proba is the probability of the second class (WnvPresent == 1)
    wnv_probability = model.predict_proba(X_kaggle)[:, 1]
    submission = pd.DataFrame({'Id': test_weather_df['Id'], 'WnvPresent': wnv_probability})
    submission.to_csv('../KaggleSubmission/' + output + '.csv', index=False)

In [35]:
# Tune the random forest
grid_search(model='rf')

Using Random Forest:
Train Score: 0.8007
Test Score: 0.8174
Using the following parameters: {'rf__max_depth': 3, 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 2, 'rf__n_estimators': 500}


In [80]:
# Random forest submission.
# NOTE(review): these settings differ from the best parameters recorded by the grid search
# output (n_estimators=500, min_samples_leaf=3, min_samples_split=2) -- confirm which is intended.
rf_final = RandomForestClassifier(
    n_estimators=1000,
    min_samples_leaf=4,
    min_samples_split=3,
    max_depth=3,
)
predict_kaggle(model=rf_final, output='rf_prediction')

In [37]:
# Tune the logistic regression
grid_search(model='lr')

Using Logistic Regression:
Train Score: 0.7265
Test Score: 0.711
Using the following parameters: {'lr__max_iter': 100}


In [81]:
# Logistic regression submission using the tuned max_iter
lr_final = LogisticRegression(solver='lbfgs', max_iter=100)
predict_kaggle(model=lr_final, output='lr_prediction')



In [41]:
# Tune AdaBoost with its default base learner
grid_search(model='ada_dt')

Using AdaBoost - Decision Tree:
Train Score: 0.8144
Test Score: 0.8444
Using the following parameters: {'ada_dt__learning_rate': 1, 'ada_dt__n_estimators': 100}


In [82]:
# AdaBoost submission using the tuned learning rate and number of estimators
ada_final = AdaBoostClassifier(learning_rate=1, n_estimators=100)
predict_kaggle(model=ada_final, output='ada_prediction')

In [47]:
# Tune the gradient boosting classifier
grid_search(model='gboost')

Using Gradient Boosting Classifier:
Train Score: 0.8174
Test Score: 0.838
Using the following parameters: {'gboost__learning_rate': 0.1, 'gboost__max_depth': 3, 'gboost__n_estimators': 50}


In [83]:
# Gradient boosting submission using the tuned parameters
gboost_final = GradientBoostingClassifier(
    n_estimators=50,
    max_depth=3,
    learning_rate=0.1,
)
predict_kaggle(model=gboost_final, output='gboost_prediction')

In [68]:
# Tune AdaBoost with a random forest as base learner
grid_search(model='ada_rf')

Using AdaBoost - Random Forest:
Train Score: 0.8125
Test Score: 0.841
Using the following parameters: {'ada_rf__base_estimator__max_depth': 3, 'ada_rf__base_estimator__min_samples_leaf': 4, 'ada_rf__base_estimator__min_samples_split': 2, 'ada_rf__base_estimator__n_estimators': 1000, 'ada_rf__learning_rate': 0.9, 'ada_rf__n_estimators': 50}


In [84]:
# AdaBoost-over-random-forest submission using the tuned parameters
ada_rf_base = RandomForestClassifier(
    max_depth=3,
    min_samples_leaf=4,
    min_samples_split=2,
    n_estimators=1000,
)
ada_rf_final = AdaBoostClassifier(
    base_estimator=ada_rf_base,
    learning_rate=0.9,
    n_estimators=50,
)
predict_kaggle(model=ada_rf_final, output='ada_rf_prediction')