<h1> Santander by Random Forest </h1>

In [15]:
import pandas as pd 
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from scipy import stats
from matplotlib.legend_handler import HandlerLine2D
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
import scipy

In [16]:
def read_input():
    #Basic Input
    data = pd.read_csv("train.csv", header=0) 
    data = data.sample(frac=1).reset_index(drop=True)
    #get the 2d array from pandas
    np_data= data.values
    
    labels = np_data[:,1].astype('int')
    train= np_data[:,2:].astype(np.float32)
    print("Train Shape", train.shape)
    test = pd.read_csv("test.csv", header=0).values 
    ids = test[:,0]
    test= test[:,1:].astype(np.float32)
    print("Test Shape", test.shape)
    return train,labels, test, ids



def remove_outlier(train, labels):
    z = np.abs(stats.zscore(train))
    threshold = 3
    x,y=np.where(z > 3)
    lst= np.unique(x)
    all_indices = [i for i in range(0, train.shape[0])]
    non_outlier_indices = list(set(all_indices)-set(lst))
    train2 = train[non_outlier_indices,:]
    labels2 = labels[non_outlier_indices]
    return train2, labels2

def normalize(train):
    scaler = StandardScaler()
    scaler.fit(train)

    return scaler
def min_max_normalize(train):
    scaler = MinMaxScaler()
    scaler.fit(train)
    return scaler

#Append few extra features/columns
def append_features(train):
    min_col = np.min(train, axis=1).reshape((-1,1))
    max_col = np.max(train, axis=1).reshape((-1,1))
    mean_col = np.mean(train, axis=1).reshape((-1,1))
    std_col = np.std(train, axis=1).reshape((-1,1))
    sum_col = np.sum(train, axis=1).reshape((-1,1))
    med_col = np.median(train, axis=1).reshape((-1,1))
    skew_col = scipy.stats.skew(train, axis=1).reshape((-1,1))
    kurtosis_col = scipy.stats.kurtosis(train, axis=1).reshape((-1,1))
    

    new_train = train.copy()
    
    new_train=np.column_stack((new_train, min_col))
    new_train=np.column_stack((new_train, max_col))
    new_train=np.column_stack((new_train, mean_col))
    new_train=np.column_stack((new_train, std_col))
    new_train=np.column_stack((new_train, sum_col))
    new_train=np.column_stack((new_train, med_col))
    new_train=np.column_stack((new_train, skew_col))
    new_train=np.column_stack((new_train, kurtosis_col))
    
    lst= [0.01,0.05,0.10,0.25,0.50,0.60, 0.70, 0.80, 0.9, 0.95]
    for l in lst:
        quintile_col = np.quantile(train,l, axis=1)
        new_train=np.column_stack((new_train, med_col))
    
    total_features= new_train.shape[0]
    
#     new_train_2 = new_train**2
#     new_train_3 = new_train**2
    
#     new_train=np.column_stack((new_train, new_train_2))
#     new_train=np.column_stack((new_train, new_train_3))
        
    return new_train
#Append few extra features/columns
def append_bin_features(train):
    
    min_col = np.min(train, axis=1).reshape((-1,1))
    max_col = np.max(train, axis=1).reshape((-1,1))
    mean_col = np.mean(train, axis=1).reshape((-1,1))
    std_col = np.std(train, axis=1).reshape((-1,1))
    sum_col = np.sum(train, axis=1).reshape((-1,1))
    med_col = np.median(train, axis=1).reshape((-1,1))
    skew_col = scipy.stats.skew(train, axis=1).reshape((-1,1))
    kurtosis_col = scipy.stats.kurtosis(train, axis=1).reshape((-1,1))
    

    new_train = train.copy()
    
    new_train=np.column_stack((new_train, min_col))
    new_train=np.column_stack((new_train, max_col))
    new_train=np.column_stack((new_train, mean_col))
    new_train=np.column_stack((new_train, std_col))
    new_train=np.column_stack((new_train, sum_col))
    new_train=np.column_stack((new_train, med_col))
    new_train=np.column_stack((new_train, skew_col))
    new_train=np.column_stack((new_train, kurtosis_col))
    
    
    lst= [0.01,0.05,0.10,0.25,0.50,0.60, 0.70, 0.80, 0.9, 0.95]
    for l in lst:
        quintile_col = np.quantile(train,l, axis=1)
        new_train=np.column_stack((new_train, med_col))
    
    total_features= new_train.shape[0]
    
    
    bin_train = train.copy()
    scaler = MinMaxScaler()
    scaler.fit(bin_train)
    min_max_train = np.round(scaler.transform(bin_train) * 10.0)
    
    scaler2 = MinMaxScaler()
    scaler2.fit(min_max_train)
    min_max_train = scaler2.transform(min_max_train)
    
    new_train=np.column_stack((new_train, min_max_train))
    
    
    return new_train

def classify(X_train, X_test, y_train, y_test, nest=25, max_depth=15, min_samples_leaf=80,max_features='auto',class_weight={0:1,1:9}):
    classifier=RandomForestClassifier(n_estimators=nest, max_depth=max_depth, min_samples_leaf=min_samples_leaf,max_features=max_features, random_state=42, class_weight=class_weight,n_jobs=-1,verbose=True)
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_train)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_train, predictions)
    roc_auc_tr = auc(false_positive_rate, true_positive_rate)
    pred_test = classifier.predict(X_test)
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, pred_test)
    roc_auc_test = auc(false_positive_rate, true_positive_rate)
    return roc_auc_tr, roc_auc_test, classifier



In [17]:
#Reading Input
org_train, org_labels, org_test,ids = read_input_balanced()

Train Shape (359804, 200)
Test Shape (200000, 200)


In [18]:

#try a basic random forest and see how does it work
#Split the data into train and test (validation)
X_train, X_test, y_train, y_test = train_test_split(org_train, org_labels, test_size=0.20)
tr_auc,test_auc,classifier= classify(X_train, X_test, y_train, y_test, nest =50, class_weight={0:1,1:5})
print("Training and Test AUC:", tr_auc, test_auc)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    1.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training and Test AUC: 0.5000104296312777 0.5000138577090436


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.2s finished


In [19]:
#Outlier Removal
train,labels= remove_outlier(org_train, org_labels)
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.20)
tr_auc,test_auc,classifier= classify(X_train, X_test, y_train, y_test,nest=25)
print("Training and Test AUC:", tr_auc, test_auc)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   57.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training and Test AUC: 0.5 0.5


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.1s finished


In [21]:
#outlier Removal + normalization

train,labels= remove_outlier(org_train, org_labels)
scaler= normalize(train)
train = scaler.transform(train)
print(train.shape, labels.shape)

X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.20)
tr_auc,test_auc,classifier= classify(X_train, X_test, y_train, y_test,nest=25)
print("Training and Test AUC:", tr_auc, test_auc)

(342743, 200) (342743,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   43.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training and Test AUC: 0.5 0.5


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.0s finished


In [27]:
#outlier Removal+advanced features without bin + normalization
train,labels= remove_outlier(org_train, org_labels)
scaler= normalize(train)
new_train = append_features(scaler.transform(train))
print(new_train.shape, labels.shape)
# test= scaler.transform(org_test)

X_train, X_test, y_train, y_test = train_test_split(new_train, labels, test_size=0.20)
tr_auc,test_auc,classifier= classify(X_train, X_test, y_train, y_test,nest=25)
print("Training and Test AUC:", tr_auc, test_auc)

(188969, 216) (188969,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   25.4s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  25 out of  25 | elapsed:    0.5s finished
[Parallel(n_jobs=12)]: Using backend ThreadingBackend with 12 concurrent workers.


Training and Test AUC: 0.8132638502234497 0.7171743445215831


[Parallel(n_jobs=12)]: Done  25 out of  25 | elapsed:    0.1s finished


In [22]:
#outlier Removal+advanced features with bin + normalization
train,labels= remove_outlier(org_train, org_labels)
scaler= normalize(train)
new_train = append_bin_features(scaler.transform(train))
print(new_train.shape, labels.shape)
# test= scaler.transform(org_test)

X_train, X_test, y_train, y_test = train_test_split(new_train, labels, test_size=0.20)
tr_auc,test_auc,classifier= classify(X_train, X_test, y_train, y_test,nest=25)
print("Training and Test AUC:", tr_auc, test_auc)

(342743, 418) (342743,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   44.9s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.7s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training and Test AUC: 0.5034320269018389 0.502377225024642


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.1s finished


In [23]:
#WithoutBin
#train,labels= remove_outlier(org_train, org_labels)
scaler= normalize(train)
new_train = append_features(scaler.transform(train))
print(new_train.shape, labels.shape)
new_test = append_features(scaler.transform(org_test))

tr_auc,test_auc,classifier= classify(new_train, new_train, labels, labels,nest=25)
print("Training and Test AUC:", tr_auc, test_auc)


test_predictions = classifier.predict(new_test)
columns=['ID_code','target']
query_output = pd.DataFrame(columns=columns)
query_output['ID_code']= ids
query_output['target']=test_predictions
query_output.to_csv('output_random_forest.csv', index=False,sep=',')   

(342743, 218) (342743,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed:   59.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


Training and Test AUC: 0.5036308535128362 0.5036308535128362


[Parallel(n_jobs=4)]: Done  25 out of  25 | elapsed:    0.3s finished


In [26]:
#WithBin
train,labels= remove_outlier(org_train, org_labels)
scaler= normalize(np.concatenate((train, org_test), axis=0))
new_train = append_bin_features(scaler.transform(train))
print(new_train.shape, labels.shape)
new_test = append_bin_features(scaler.transform(org_test))

tr_auc,test_auc,classifier= classify(new_train, new_train, labels, labels,nest=75,class_weight={0:1,1:1})
print("Training and Test AUC:", tr_auc, test_auc)


test_predictions = classifier.predict(new_test)
columns=['ID_code','target']
query_output = pd.DataFrame(columns=columns)
query_output['ID_code']= ids
query_output['target']=test_predictions
query_output.to_csv('output_random_forest.csv', index=False,sep=',')   

(342743, 418) (342743,)


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  75 out of  75 | elapsed:  3.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    2.4s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    2.7s finished


Training and Test AUC: 0.8674572835122331 0.8674572835122331


[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done  75 out of  75 | elapsed:    1.6s finished


In [9]:
pwd

'C:\\Users\\thm_m\\Desktop\\ML\\ML'

In [10]:
import os
os.chdir('C:\\Users\\thm_m\\Desktop\\ML')