
Kaggle competition: Santander Customer Satisfaction (https://www.kaggle.com/c/santander-customer-satisfaction)




In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.cross_validation import train_test_split, cross_val_score

# cleaners

def clean_constants(data):
    """Return ``data`` without the columns whose standard deviation is exactly 0.

    Each column is tested on its own via ``Series.std()``; the input frame is
    not modified, a new frame is returned by ``drop``.
    """
    zero_spread = [name for name in data.columns if data[name].std() == 0]
    return data.drop(zero_spread, axis=1)

def clean_duplicated(data):
    """Return ``data`` without columns that exactly repeat an earlier column.

    Columns are compared pairwise with ``np.array_equal`` on their values; of
    every group of identical columns the left-most one is kept. The input
    frame is not modified, a new frame is returned by ``drop``.

    NOTE(review): ``np.array_equal`` is called with its defaults, under which
    NaN does not compare equal to NaN, so columns containing NaN are never
    reported as duplicates -- confirm that is acceptable for the data used.
    """
    columns = data.columns
    remove = []
    flagged = set()  # positions already known to duplicate an earlier column
    for i in range(len(columns)-1):
        if i in flagged:
            # Anything equal to this column was already matched against the
            # column it duplicates, so scanning again cannot find new hits.
            continue
        values = data[columns[i]].values
        for j in range(i+1, len(columns)):
            if j in flagged:
                # Already scheduled for removal; avoids listing it twice.
                continue
            if np.array_equal(values, data[columns[j]].values):
                flagged.add(j)
                remove.append(columns[j])
    return data.drop(remove, axis=1)

def export(data, predictions, path):
    """Write a submission CSV with the columns ``ID`` and ``TARGET`` to ``path``.

    data        -- frame providing the ``ID`` column.
    predictions -- values stored in the ``TARGET`` column (expected to hold
                   one value per row of ``data``).
    path        -- destination of the CSV file; the index is not written.
    """
    # Named `submission` so the local no longer shadows this function's name.
    submission = pd.DataFrame({ "ID": data['ID'], "TARGET": predictions },
                              columns=["ID", "TARGET"])
    # Honour the caller-supplied path instead of a hard-coded file name.
    submission.to_csv(path, index=False)

    
# model

# Read both competition files; only the training frame is cleaned below, the
# test frame is later restricted to the same feature columns.
test  = pd.read_csv("./test.csv")
train = pd.read_csv("./train.csv")

print("%s %s" % ("Loaded data: ", train.shape))

# Apply the cleaners in order, reporting the frame's shape after each step.
for step_label, step in (("Cleaned constants: ", clean_constants),
                         ("Cleaned dups: ", clean_duplicated)):
    train = step(train)
    print("%s %s" % (step_label, train.shape))

# Model inputs: every remaining column except the label and the row identifier.
features = train.columns[~train.columns.isin(['TARGET', 'ID'])]



Loaded data:  (76020, 371)
Cleaned constants:  (76020, 337)
Cleaned dups:  (76020, 308)


### DecisionTree

In [3]:
from sklearn import tree

# Fixed estimator seed: with max_features < 1.0 every split draws a random
# subset of the features, so unseeded runs score the same settings differently
# and the rows printed below could neither be reproduced nor compared fairly.
RANDOM_STATE = 42

# Candidate settings, one per row: [max_features, max_depth].
args  = [
    # vary max_features at max_depth 7
    [.65, 7],
    [.7,  7],
    [.75, 7],
    [.85, 7],
    [.9,  7],

    # vary max_depth at max_features .8
    [.8,  4],
    [.8,  5],
    [.8,  6],
    [.8,  7],
    [.8,  8],
]

print("Starting...")
for i, (max_features, max_depth) in enumerate(args):
    clf    = tree.DecisionTreeClassifier(max_features=max_features, max_depth=max_depth,
                                         random_state=RANDOM_STATE)
    # 5-fold cross-validated ROC AUC on the cleaned training frame.
    scores = cross_val_score(clf, train[features], train.TARGET, cv=5, scoring='roc_auc')
    # Printed as "<row index>: <mean> +- <2 * std over the folds>".
    print("%i: %f +- %f" % (i, scores.mean(), scores.std()*2))

Starting...
0: 0.813482 +- 0.020543
1: 0.811429 +- 0.016863
2: 0.811543 +- 0.020739
3: 0.815288 +- 0.017613
4: 0.815949 +- 0.014824
5: 0.803798 +- 0.021053
6: 0.814394 +- 0.022809
7: 0.813517 +- 0.017097
8: 0.812993 +- 0.019044
9: 0.814533 +- 0.026988


### AdaBoost

In [11]:
from sklearn.ensemble import AdaBoostClassifier

# Grid rows are [n_estimators, learning_rate]; only the learning rate varies.
args  = [[65, rate] for rate in (.33, .35, .37, .39, .41)]

print("Starting...")
for i, (n_estimators, learning_rate) in enumerate(args):
    clf    = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate)
    # 5-fold cross-validated ROC AUC, folds evaluated in parallel (n_jobs=-1).
    scores = cross_val_score(clf, train[features], train.TARGET,
                             scoring='roc_auc', cv=5, n_jobs=-1)
    # Printed as "<row index>: <mean> +- <2 * std over the folds>".
    print("%i: %f +- %f" % (i, scores.mean(), scores.std()*2))

Starting...
0: 0.832366 +- 0.013132
1: 0.832376 +- 0.012151
2: 0.832387 +- 0.012229
3: 0.832726 +- 0.013069
4: 0.832200 +- 0.012924


### GradientBoosting

In [10]:
from sklearn.ensemble import GradientBoostingClassifier

# Grid rows are [max_depth, learning_rate, n_estimators].
args  = [
    [5, 0.03, 350],
]

print("Starting...")
for i, (max_depth, learning_rate, n_estimators) in enumerate(args):
    params = dict(max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators)
    clf    = GradientBoostingClassifier(**params)
    # 5-fold cross-validated ROC AUC, folds evaluated in parallel (n_jobs=-1).
    scores = cross_val_score(clf, train[features], train.TARGET,
                             scoring='roc_auc', cv=5, n_jobs=-1)
    # Printed as "<row index>: <mean> +- <2 * std over the folds>".
    print("%i: %f +- %f" % (i, scores.mean(), scores.std()*2))


Starting...
0: 0.835969 +- 0.018303


### Model

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# Hyperparameters of the submitted model, named once so the estimator and the
# printed label cannot drift apart.
N_ESTIMATORS  = 65
LEARNING_RATE = .39

clf = AdaBoostClassifier(n_estimators=N_ESTIMATORS, learning_rate=LEARNING_RATE)

# 5-fold cross-validated ROC AUC of the chosen configuration.
scores = cross_val_score(clf, train[features], train.TARGET, cv=5, scoring='roc_auc')
# The score is labelled with the model's own settings. The previous label was
# the loop index `i` left over from whichever search cell ran last, so it
# depended on execution order and failed on a kernel where no search cell ran.
print("AdaBoost(n_estimators=%i, learning_rate=%.2f): %f +- %f"
      % (N_ESTIMATORS, LEARNING_RATE, scores.mean(), scores.std()*2))

# Refit on the full training frame, then score the test frame; column 1 of
# predict_proba is the probability of the second class in clf.classes_.
clf = clf.fit(train[features], train['TARGET'])
res = clf.predict_proba(test[features])[:,1]
export(test, res, "./predictions.csv")


4: 0.832726 +- 0.013069
