# Importing Numpy Arrays

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict

In [3]:
X = np.load('/tmp/X.npy')
X_test = np.load('/tmp/X_test.npy')
y = np.load('/tmp/y.npy')

In [70]:
(y == 0).sum(),(y == 1).sum(),(y == 1).sum()*100/len(y)

(573518, 21694, 3.6447517859182947)

# 1) SGDClassifier

In [12]:
from sklearn.linear_model import SGDClassifier

In [13]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X, y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [15]:
y_pred = sgd_clf.predict(X_test)

In [19]:
(y_pred == 0).sum(),(y_pred == 1).sum()

(892816, 0)

### a) Stratified 3 KFold Cross Vlidation

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)

for train_index, test_index in skfolds.split(X, y):
    clone_clf = clone(sgd_clf)
    X_train_folds = X[train_index]
    y_train_folds = (y[train_index])
    X_test_fold = X[test_index]
    y_test_fold = (y[test_index])
    clone_clf.fit(X, y)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))



0.963549305713




0.963554162214




0.963553978518


In [22]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X, y, cv=3, scoring="accuracy")



array([ 0.96354931,  0.96355416,  0.96355398])

In [23]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X, y, cv=3)



In [24]:
confusion_matrix(y, y_train_pred)

array([[573518,      0],
       [ 21694,      0]])

In [29]:
precision_score(y, y_train_pred),recall_score(y, y_train_pred)

  'precision', 'predicted', average, warn_for)


(0.0, 0.0)

# 2) DecisionTreeClassifier

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [79]:
max_depth = 10
tree_clf = DecisionTreeClassifier(max_depth=max_depth)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [80]:
export_graphviz(
            tree_clf,
            out_file="tree.dot",
            #feature_names=iris.feature_names[2:],
            #class_names=iris.target_names,
            rounded=True,
            filled=True
        )

In [81]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [45]:
confusion_matrix(y, y_train_pred)

array([[573503,     15],
       [ 21691,      3]])

In [46]:
precision_score(y, y_train_pred),recall_score(y, y_train_pred)

(0.16666666666666666, 0.00013828708398635567)

# 3) SVC

In [72]:
from sklearn.svm import SVC

In [73]:
svm_clf = SVC(probability=True)

In [74]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [78]:
confusion_matrix(y, y_train_pred)

array([[573506,     12],
       [ 21688,      6]])

# 4) LogisticRegression

In [82]:
from sklearn.linear_model import LogisticRegression

In [83]:
log_reg = LogisticRegression()
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [84]:
confusion_matrix(y, y_train_pred)

array([[573017,    501],
       [ 21640,     54]])

In [85]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)

In [None]:
X_poly.shape

In [None]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

# Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [6]:
rnd_clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
len(rnd_clf.feature_importances_)

In [None]:
sub = rnd_clf.predict(X_test)

In [11]:
y_train_pred = cross_val_predict(rnd_clf, X, y, cv=5)

In [13]:
confusion_matrix(y, y_train_pred)

array([[573518,      0],
       [ 21694,      0]])