In [3]:

import pandas as pd
import numpy as np


amazon = pd.read_csv("../assets/train.csv")

amazon.head()


amazon.isnull().sum()

amazon.iloc[:,1:].corr()


import seaborn as sns
sns.heatmap(amazon.corr())



import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

amazon.iloc[:,1:].corr()


from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import make_pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier
from sklearn.preprocessing import StandardScaler

# Features are every column after the first one; the target is the ACTION column.
X = amazon[amazon.columns[1:].tolist()]
y = amazon["ACTION"]
print(X.shape)

# Unpruned decision tree used as the base learner of the bagging ensemble.
# A fixed random_state makes a Restart & Run All reproduce the same model.
tr = DecisionTreeClassifier(max_depth=None, random_state=42)

# Pipeline: scale each feature to unit variance (no centering), project with
# LDA, then fit bagged trees on the projection. make_pipeline names the steps
# after the lowercased class names, which the grid search below relies on.
model = make_pipeline(
    StandardScaler(with_mean=False),
    LinearDiscriminantAnalysis(),
    BaggingClassifier(tr, random_state=42),
)

model.fit(X,y)

test = pd.read_csv("../assets/test.csv")
sample = pd.read_csv("../assets/sampleSubmission.csv")

test.isnull().sum()

# Same slicing as for the training frame: drop the first column, keep the rest.
X_test = test[test.columns[1:].tolist()]

X.shape

X_test.shape

# Fail early with a readable message if train and test disagree on the number
# of features, instead of failing inside the model.
assert X_test.shape[1] == X.shape[1], "train and test feature counts differ"

amazon.ACTION.value_counts()

# Models in this notebook are compared with ROC AUC, which scores how well the
# predictions *rank* the rows. Hard labels from predict() throw that ranking
# away, so write the probability of the positive class (column 1, i.e.
# model.classes_[1]) instead.
predictions = model.predict_proba(X_test)[:, 1]
sample['Action'] = predictions
sample.to_csv('../results/bagging_dt_LDA.csv', index=False)

# List the tunable parameter names of the pipeline (step name + "__" + parameter).
model.get_params().keys()

# sklearn.grid_search was removed in scikit-learn 0.20; sklearn.model_selection
# (available since 0.18) is its replacement.
from sklearn.model_selection import GridSearchCV

# LDA can produce at most min(n_classes - 1, n_features) components. The
# roc_auc scoring used below implies a binary target (TODO confirm ACTION has
# exactly two classes), so 1 is the only valid value: larger values were
# silently clipped by old scikit-learn releases (four identical fits per
# n_estimators) and raise a ValueError in recent ones.
params = {
    'baggingclassifier__n_estimators':[50,100,250,500],
    'lineardiscriminantanalysis__n_components':[1]
}

# Displays the default LDA configuration; the instance itself is not used.
LinearDiscriminantAnalysis()

gs = GridSearchCV(model, params, cv=5, verbose=1, n_jobs=-1,scoring="roc_auc")
gs.fit(X, y)

# The search optimises ROC AUC, a ranking metric, so submit the positive-class
# probability of the refitted best pipeline rather than hard labels.
predictions = gs.predict_proba(X_test)[:, 1]
sample['Action'] = predictions
sample.to_csv('../results/bagging_dt_LDA_gs.csv', index=False)

# sklearn.cross_validation was removed in scikit-learn 0.20; the same tools
# live in sklearn.model_selection (available since 0.18).
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Five stratified, shuffled folds with a fixed seed so every model below is
# scored on exactly the same splits on every run. The splits are materialised
# as a list of (train_indices, test_indices) pairs: an explicit iterable of
# splits is accepted as `cv` by cross_val_score and GridSearchCV alike and,
# unlike a generator, can be reused any number of times.
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=42).split(X, y))

def score(model, name):
    """Print the cross-validated ROC AUC of ``model``.

    The estimator is evaluated with ``cross_val_score`` on the notebook-level
    ``X`` and ``y`` using the notebook-level ``cv`` folds, and the mean and
    standard deviation of the fold scores are printed on one line.

    Parameters
    ----------
    model : estimator handed to ``cross_val_score``.
    name : label printed in front of the score.

    Returns
    -------
    None
    """
    s = cross_val_score(model, X, y, cv=cv,scoring="roc_auc")
    # print(...) with a single pre-formatted string behaves the same on
    # Python 2 and Python 3, unlike the Python 2-only print statement.
    print("{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3)))

#Import our ensembles
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier

#Initialize our random forest. n_estimators is not specified, so the number of trees is the library default,
#which depends on the installed scikit-learn version (10 before 0.22, 100 from 0.22 on).
#random_state is fixed so that the importances and scores below are reproducible.
rf = RandomForestClassifier(class_weight='balanced', random_state=42)

#Let's fit our X and y
rf.fit(X,y)

#The ensembles we are going to look at expose a feature_importances_ attribute: one normalized score per
#feature indicating how much the fitted model relies on it. To make the array readable, each score is printed
#next to its feature name. The names come from X, the frame the model was fitted on, so the labels are
#guaranteed to line up with the scores.
for feature_name, importance in zip(X.columns.tolist(), rf.feature_importances_):
    print("{} {}".format(feature_name, importance))

#Let's look at the cross val score of our Random Forest
score(rf, "Random Forest Classifier")

#Initialize adaboost (seeded for reproducibility), fit it, and check the feature importances like we did for
#the Random Forest Classifier
ada = AdaBoostClassifier(random_state=42)
ada.fit(X,y)

#Feature names are taken from X, the frame the model was fitted on, so labels and scores line up.
for feature_name, importance in zip(X.columns.tolist(), ada.feature_importances_):
    print("{} {}".format(feature_name, importance))

#Let's look at the cross val score of our AdaBoost
score(ada, "Adaptive Boosting Classifier")

#Lastly, let's do this for the gradient boosting classifier (seeded for reproducibility)
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X,y)

#Feature names are taken from X, the frame the model was fitted on, so labels and scores line up.
for feature_name, importance in zip(X.columns.tolist(), gb.feature_importances_):
    print("{} {}".format(feature_name, importance))

#Cross val score of our gradient boosting model
score(gb, "Gradient Boosting Classifier")

#It seems like our Random Forest has the best score, so before using the feature importances, let's run the test data
#through it so that we can see what Random Forest outputs and see if we can play around with the features.
#The models were compared on ROC AUC, which measures how well predictions rank the rows, so we write the
#probability of the positive class (column 1, i.e. rf.classes_[1]) rather than hard labels from predict().
predictions = rf.predict_proba(X_test)[:, 1]
sample['Action'] = predictions
sample.to_csv('../results/rf.csv', index=False)

# Hyper-parameter grid for the random forest: every combination is
# cross-validated on the shared `cv` folds and ranked by ROC AUC.
params = {'n_estimators':[3, 5, 10, 50],
          'criterion': ['gini', 'entropy'],
          'max_depth': [3, 5, 10, 20],
          'min_samples_split': [2,5],
          'class_weight':[None, 'balanced']}


gsrf = GridSearchCV(rf,params, n_jobs=-1,cv=cv,scoring="roc_auc")
gsrf.fit(X, y)

# The search optimises ROC AUC, a ranking metric, so submit the positive-class
# probability of the refitted best forest rather than hard labels.
predictions = gsrf.predict_proba(X_test)[:, 1]
sample['Action'] = predictions
sample.to_csv('../results/gs_rf.csv', index=False)