# 6. Modeling

### Import modules and data

In [2]:
!pip install -U imbalanced-learn

Requirement already up-to-date: imbalanced-learn in /Users/namanbhandari/anaconda3/lib/python3.6/site-packages (0.3.3)


In [3]:
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE

import pandas as pd
import numpy as np

In [4]:
np.random.seed(42)

In [5]:
X = pd.read_csv('../data/train_weather_merged.csv')
test = pd.read_csv('../data/test_weather_merged.csv')
y = pd.read_csv('../data/target.csv')

In [6]:
X.set_index('date', inplace=True)

In [7]:
test.set_index('date', inplace=True)

In [8]:
X.shape, test.shape, y.shape

((10413, 179), (116293, 179), (10412, 1))

For some reason our csv reader is not reading in the last row, so let's add it manually for now:

In [9]:
y = y.append({'0': 0}, ignore_index=True)

In [10]:
y.shape

(10413, 1)

#### Check for imbalanced classes

We should always check for imbalanced classes and account for them.

In [11]:
y[y.columns[0]].value_counts()

0    9862
1     551
Name: 0, dtype: int64

In [12]:
y[y.columns[0]].value_counts(normalize=True)

0    0.947085
1    0.052915
Name: 0, dtype: float64

Our classes are really imbalanced! Only 5.5% of our data is of the positive class. We will have to account for this later.

### Train/test split

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    stratify=y)

In [14]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [15]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((7809, 179), (2604, 179), (7809, 1), (2604, 1))

### Balance Classes by oversampling (SMOTE)

In [16]:
sm = SMOTE(ratio = 'minority')
X_res, y_res = sm.fit_sample(X_train_sc, y_train)

  y = column_or_1d(y, warn=True)


In [17]:
print(X_res.shape)
print(y_res.shape)

(14792, 179)
(14792,)


In [18]:
pd.Series(y_res).value_counts()

1    7396
0    7396
dtype: int64

Our number of observations increase because SMOTE creates new random samples in which our target equals 1. It also reduces the number of 0 classes have and makes them equal to the number of 1's we have.

### 2nd Train/test split on our re-balanced data

In [19]:
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_res, 
                                                            y_res,
                                                            stratify=y_res)

In [20]:
X_train_2.shape, X_test_2.shape, y_train_2.shape, y_test_2.shape

((11094, 179), (3698, 179), (11094,), (3698,))

### Random Forest

In [21]:
rf = RandomForestClassifier()
rf.fit(X_train_2, y_train_2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [22]:
y_preds = rf.predict(X_test_sc)
print(classification_report(y_test, y_preds))
pd.DataFrame(confusion_matrix(y_test, y_preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.95      0.97      0.96      2466
          1       0.18      0.11      0.14       138

avg / total       0.91      0.93      0.92      2604



Unnamed: 0,Pred -,Pred +
Act -,2399,67
Act +,123,15


Top 20 features:

In [23]:
zipped = list(zip(X_train.columns, rf.feature_importances_))
sorted_importance = sorted(zipped, key=lambda tup: tup[1], reverse=True)
sorted_importance[0:20]

[('sunset', 0.0952808176319204),
 ('sunrise', 0.04833919804496684),
 ('species_culex restuans', 0.048114408283381684),
 ('tavg_30', 0.04784691531477939),
 ('longitude', 0.04462278801878504),
 ('species_culex pipiens/restuans', 0.044061955206931486),
 ('avgspeed', 0.0427418122817518),
 ('tmin', 0.037205958914909194),
 ('wetbulb', 0.035022892486726215),
 ('month', 0.03474433884707335),
 ('tavg', 0.033723350946946194),
 ('tmax', 0.03318338245545421),
 ('trap_T900', 0.032664685405053975),
 ('resultspeed', 0.03167449860424964),
 ('latitude', 0.030844150734318322),
 ('cool', 0.028381444130743137),
 ('resultdir', 0.028208557635213783),
 ('sealevel', 0.027647371673611355),
 ('br', 0.02480440656771956),
 ('stnpressure', 0.024751775242240943)]

In [24]:
proba_pairs = rf.predict_proba(X_test_sc)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

0.6870849348237479

### Kaggle Submission

In [35]:
index = np.arange(1, len(test) + 1)

In [36]:
index

array([     1,      2,      3, ..., 116291, 116292, 116293])

In [38]:
predictions = rf.predict(test)
predictions
kaggle = pd.DataFrame(data=predictions, index=index, columns=['WnvPresent'])
kaggle.index.name = 'Id'
kaggle.reset_index(inplace=True)

In [39]:
kaggle.shape

(116293, 2)

In [40]:
kaggle.to_csv('../data/rf_model.csv', index=False)