# 6. Modeling

### Import modules and data

In [70]:
!pip install -U imbalanced-learn

Collecting imbalanced-learn
[?25l  Downloading https://files.pythonhosted.org/packages/80/a4/900463a3c0af082aed9c5a43f4ec317a9469710c5ef80496c9abc26ed0ca/imbalanced_learn-0.3.3-py3-none-any.whl (144kB)
[K    100% |████████████████████████████████| 153kB 6.4MB/s ta 0:00:01
Installing collected packages: imbalanced-learn
Successfully installed imbalanced-learn-0.3.3


In [71]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np

In [72]:
np.random.seed(42)

In [73]:
X = pd.read_csv('../data/train_weather_merged.csv')
test = pd.read_csv('../data/test_weather_merged.csv')
y = pd.read_csv('../data/target.csv')

In [74]:
X.set_index('date', inplace=True)

In [75]:
test.set_index('date', inplace=True)

In [76]:
X.shape, test.shape, y.shape

((10413, 179), (116293, 179), (10412, 1))

For some reason our csv reader is not reading in the last row, so let's add it manually for now:

In [77]:
y = y.append({'0': 0}, ignore_index=True)

In [78]:
y.shape

(10413, 1)

#### Check for imbalanced classes

We should always check for imbalanced classes and account for them.

In [129]:
y[y.columns[0]].value_counts()

0    9862
1     551
Name: 0, dtype: int64

In [118]:
y[y.columns[0]].value_counts(normalize=True)

0    0.947085
1    0.052915
Name: 0, dtype: float64

Our classes are really imbalanced! Only 5.5% of our data is of the positive class. We will have to account for this later.

### Train/test split

In [119]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y)

In [120]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [121]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7809, 179), (2604, 179), (7809, 1), (2604, 1))

### Balance Classes by oversampling (SMOTE)

In [122]:
sm = SMOTE(ratio = 'minority')
X_res, y_res = sm.fit_sample(X_train_sc, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [123]:
print(X_res.shape)
print(y_res.shape)

(14792, 179)
(14792,)


In [130]:
pd.Series(y_res).value_counts()

1    7396
0    7396
dtype: int64

Our number of observations increase because SMOTE creates new random samples in which our target equals 1. It also reduces the number of 0 classes have and makes them equal to the number of 1's we have.

### 2nd Train/test split on our re-balanced data

In [132]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_res, 
                                                            y_res,
                                                            stratify=y_res)

In [133]:
X_train_2.shape, X_test_2.shape, y_train_2.shape, y_test_2.shape

((11094, 179), (3698, 179), (11094,), (3698,))

### Random Forest

In [137]:
rf = RandomForestClassifier()
rf.fit(X_train_2, y_train_2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [138]:
y_preds = rf.predict(X_test_sc)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.95      0.96      0.96      2466
          1       0.19      0.15      0.17       138

avg / total       0.91      0.92      0.92      2604



Unnamed: 0,Pred -,Pred +
Act -,2377,89
Act +,117,21


In [150]:
zipped = list(zip(X_train.columns, rf.feature_importances_))
sorted_importance = sorted(zipped, key=lambda tup: tup[1], reverse=True)
sorted_importance[0:20]

[('sunset', 0.09272385784407308),
 ('sunrise', 0.07954371818833295),
 ('tavg_30', 0.06399959004799173),
 ('month', 0.0637359019937076),
 ('species_culex pipiens', 0.04949109167270172),
 ('latitude', 0.046974169731718925),
 ('longitude', 0.040296335476727695),
 ('trap_T900', 0.03924378276447027),
 ('tavg', 0.036547742166047184),
 ('avgspeed', 0.03376178684130411),
 ('cool', 0.02894961740404368),
 ('sealevel', 0.02812243037005609),
 ('tmax', 0.026503521253003204),
 ('tmin', 0.026311031076807885),
 ('stnpressure', 0.026065633043752007),
 ('wetbulb', 0.02166844474933921),
 ('resultdir', 0.021641467856978226),
 ('day', 0.02142645151470674),
 ('species_culex pipiens/restuans', 0.02113180919166059),
 ('resultspeed', 0.021050112077753997)]

In [151]:
proba_pairs = rf.predict_proba(X_test_sc)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

0.6926930897892498