# Importing Numpy Arrays

In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.scorer import make_scorer

In [3]:
X = np.load('/tmp/X.npy')
y = np.load('/tmp/y.npy')
X_test = np.load('/tmp/X_test.npy')

In [4]:
X.shape, y.shape

((595212, 114), (595212,))

In [5]:
X_small = np.load('/tmp/X_small.npy')
y_small = np.load('/tmp/y_small.npy')

In [6]:
X_small.shape, y_small.shape

((17856, 114), (17856,))

In [7]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

#my_scorer = make_scorer(normalized_gini, greater_is_better=True)

In [8]:
porto_test_id = pd.read_csv('data/test.csv',usecols=['id'])
def make_submission(name, pred):
    s = porto_test_id['id'].to_frame()
    s['target'] = pred
    s.to_csv('Submissions/'+name+'.csv',index=False)

In [39]:
def binarize(pred,threshold):
    s= pred
    s[s<threshold] = 0
    s[s>=threshold] = 1
    s.astype(int)
    return s

# 1) SGDClassifier

In [8]:
from sklearn.linear_model import SGDClassifier

In [9]:
sgd_clf = SGDClassifier(random_state=42)

In [13]:
sgd_clf.fit(X, y)



SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=None, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=42, shuffle=True,
       tol=None, verbose=0, warm_start=False)

In [15]:
y_pred = sgd_clf.predict(X_test)

In [19]:
(y_pred == 0).sum(),(y_pred == 1).sum()

(892816, 0)

### a) Stratified 3 KFold Cross Vlidation

In [21]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

skfolds = StratifiedKFold(n_splits=3, random_state=42)

for train_index, test_index in skfolds.split(X, y):
    clone_clf = clone(sgd_clf)
    X_train_folds = X[train_index]
    y_train_folds = (y[train_index])
    X_test_fold = X[test_index]
    y_test_fold = (y[test_index])
    clone_clf.fit(X, y)
    y_pred = clone_clf.predict(X_test_fold)
    n_correct = sum(y_pred == y_test_fold)
    print(n_correct / len(y_pred))



0.963549305713




0.963554162214




0.963553978518


In [22]:
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X, y, cv=3, scoring="accuracy")



array([ 0.96354931,  0.96355416,  0.96355398])

In [10]:
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X, y, cv=3)



In [11]:
confusion_matrix(y, y_train_pred)

array([[28500,   175],
       [ 1076,     9]])

In [15]:
pred = y_train_pred[:10]
y = y[:10]

In [22]:
y

array([1, 0, 1, 1, 0, 0, 0, 0, 0, 0])

In [37]:
pred

array([ 0.8 ,  0.4 ,  0.49,  0.9 ,  0.6 ,  0.2 ,  0.4 ,  0.4 ,  0.7 ,  0.1 ])

In [47]:
np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)

array([[ 1.  ,  0.8 ,  0.  ],
       [ 0.  ,  0.4 ,  1.  ],
       [ 1.  ,  0.49,  2.  ],
       [ 1.  ,  0.9 ,  3.  ],
       [ 0.  ,  0.6 ,  4.  ],
       [ 0.  ,  0.2 ,  5.  ],
       [ 0.  ,  0.4 ,  6.  ],
       [ 0.  ,  0.4 ,  7.  ],
       [ 0.  ,  0.7 ,  8.  ],
       [ 0.  ,  0.9 ,  9.  ]])

In [29]:
precision_score(y, y_train_pred),recall_score(y, y_train_pred)

  'precision', 'predicted', average, warn_for)


(0.0, 0.0)

In [48]:
g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)

In [49]:
print(-1*g[:,1])

[-0.8  -0.4  -0.49 -0.9  -0.6  -0.2  -0.4  -0.4  -0.7  -0.9 ]


In [50]:
np.lexsort((g[:,2], -1*g[:,1]))

array([3, 9, 0, 8, 4, 2, 1, 6, 7, 5])

In [53]:
g=g[np.lexsort((g[:,2], -1*g[:,1]))]

In [54]:
print(g)

[[ 1.    0.9   3.  ]
 [ 0.    0.9   9.  ]
 [ 1.    0.8   0.  ]
 [ 0.    0.7   8.  ]
 [ 0.    0.6   4.  ]
 [ 1.    0.49  2.  ]
 [ 0.    0.4   1.  ]
 [ 0.    0.4   6.  ]
 [ 0.    0.4   7.  ]
 [ 0.    0.2   5.  ]]


In [58]:
g[:,0].cumsum().sum()

23.0

In [59]:
g[:,0].sum()

3.0

In [65]:
y.get_label()

AttributeError: 'numpy.ndarray' object has no attribute 'get_label'

In [60]:
def gini(pred, y):
    g = np.asarray(np.c_[y, pred, np.arange(len(y)) ], dtype=np.float)
    g = g[np.lexsort((g[:,2], -1*g[:,1]))]
    gs = g[:,0].cumsum().sum() / g[:,0].sum()
    gs -= (len(y) + 1) / 2.
    return gs / len(y)

def normalized_gini(pred, y):
    return gini(pred, y) / gini(y, y)

In [64]:
gini(pred,y), gini(y,y), gini(y,pred)/gini(pred,pred)

(0.2166666666666667, 0.34999999999999998, 0.041373926619827861)

# 2) DecisionTreeClassifier

In [39]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_graphviz

In [79]:
max_depth = 10
tree_clf = DecisionTreeClassifier(max_depth=max_depth)
tree_clf.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [80]:
export_graphviz(
            tree_clf,
            out_file="tree.dot",
            #feature_names=iris.feature_names[2:],
            #class_names=iris.target_names,
            rounded=True,
            filled=True
        )

In [81]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [45]:
confusion_matrix(y, y_train_pred)

array([[573503,     15],
       [ 21691,      3]])

In [46]:
precision_score(y, y_train_pred),recall_score(y, y_train_pred)

(0.16666666666666666, 0.00013828708398635567)

# 3) SVC

In [72]:
from sklearn.svm import SVC

In [73]:
svm_clf = SVC(probability=True)

In [74]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [78]:
confusion_matrix(y, y_train_pred)

array([[573506,     12],
       [ 21688,      6]])

# 4) LogisticRegression

In [82]:
from sklearn.linear_model import LogisticRegression

In [83]:
log_reg = LogisticRegression()
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

In [84]:
confusion_matrix(y, y_train_pred)

array([[573017,    501],
       [ 21640,     54]])

In [85]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
poly_features = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly_features.fit_transform(X)

In [None]:
X_poly.shape

In [None]:
y_train_pred = cross_val_predict(tree_clf, X, y, cv=5)

# 4) Random Forest

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)

In [12]:
rnd_clf.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=16,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [15]:
sub = rnd_clf.predict_proba(X_test)[:,1]

In [20]:
sub_cross = cross_val_predict(rnd_clf, X, y, cv=5, method='predict_proba')[:,1]

In [43]:
s = np.arange(10)/10
print(s)

[ 0.   0.1  0.2  0.3  0.4  0.5  0.6  0.7  0.8  0.9]


In [47]:
binarize(s, 0.1)

array([ 0.,  0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.,  1.])

In [45]:
make_submission('df5',sub)

In [18]:
y_train_pred = cross_val_predict(rnd_clf, X, y, cv=5, method='predict_proba')

In [23]:
max(y_train_pred[:,1])

0.11168642889703877

In [13]:
confusion_matrix(y, y_train_pred)

array([[573518,      0],
       [ 21694,      0]])

### Grid Search

In [57]:
param_grid = [
        {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
        {'bootstrap': [False], 'n_estimators': [3, 10], 'max_features': [2, 3, 4]},
]

In [59]:
param_grid[0]['n_estimators']

[3, 10, 30]

In [26]:
forest_clf = RandomForestClassifier()

In [47]:
grid_search = GridSearchCV(forest_clf, param_grid, cv=5, scoring=my_scorer)

In [None]:
grid_search.fit(X, y)

In [49]:
grid_search.best_params_

{'max_features': 8, 'n_estimators': 3}

In [None]:
X_train, X_t, y_train, y_t = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

In [50]:
t = grid_search.predict_proba(X_test)

In [18]:
import itertools
from sklearn.model_selection import train_test_split

def RNDSearch(param_dic, X, y, cv = 5):
    gini=-10
    X_train, X_t, y_train, y_t = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)
    
    for i,p in enumerate (itertools.product(param_dic['n_estimators'],param_dic['max_features'])):
        
        clf = RandomForestClassifier(n_estimators=p[0],max_features=p[1], n_jobs=-1)
        
        # pred = cross_val_predict(clf, X, y, cv=cv)
        
        clf.fit(X_train,y_train)
        pred = clf.predict_proba(X_t)[:,1]
        
        norm_gini = normalized_gini(pred, y_t)
        if gini < norm_gini :
            gini = norm_gini
            best = p
        print(i+1, ': (n_estimators , max_features) =', p, '=> CV-gini = ', gini)
    return best

In [19]:
param = {'n_estimators': [50], 'max_features': [50]}
est = RNDSearch(param,X,y)

1 : (n_estimators , max_features) = (50, 50) => CV-gini =  0.152783672596


In [106]:
clf = RandomForestClassifier(max_depth=10, random_state=0,n_jobs=-1)

In [107]:
clf.fit(X_t, y_t)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [110]:
pred = clf.predict_proba(X_test)[:,1]