In [1]:
import numpy as np
import xgboost as xgb
import pandas as pd
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier
from itertools import combinations
from numpy import array, array_equal, unique
from sklearn.cluster import KMeans

# Load the competition data; the first CSV column becomes the row index.
print('Started!')
train = pd.read_csv('train.csv', index_col=0)
test = pd.read_csv('test.csv', index_col=0)

# Split the label column off the training frame: `pop` returns the column
# and removes it from `train` in one step.
target = train.pop('TARGET')


def identify_constant_features(X):
    """Return the set of column labels of X that hold a single distinct value.

    A column counts as constant when `Series.unique()` yields exactly one
    entry for it.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    set
        Labels of the constant columns (empty set when there are none).
    """
    constant_labels = set()
    # Walk the columns by position so each lookup yields a Series.
    for position, label in enumerate(X.columns):
        distinct_values = X.iloc[:, position].unique()
        if len(distinct_values) == 1:
            constant_labels.add(label)
    return constant_labels

def identify_equal_features(X):
    """Find columns of X that duplicate an earlier column, value for value.

    Columns are compared pairwise in column order with `numpy.array_equal`.
    For every group of identical columns the first one is kept and the
    labels of the later copies are returned, so the result can be passed
    straight to `DataFrame.drop(..., axis=1)`.

    Parameters
    ----------
    X : pandas.DataFrame

    Returns
    -------
    numpy.ndarray
        1-D array with the labels of the redundant columns, each listed once
        in order of discovery. Empty when X has no duplicated columns (the
        previous implementation raised IndexError in that case because it
        sliced `[:, 1]` on a 1-D empty array).
    """
    redundant = []
    seen = set()
    for first, second in combinations(X.columns.tolist(), 2):
        # A column already known to be a copy needs no further checks:
        # anything equal to it was (or will be) matched against the column
        # it duplicates, which comes earlier in the pair ordering.
        if first in seen or second in seen:
            continue
        if array_equal(X[first], X[second]):
            seen.add(second)
            redundant.append(second)
    if not redundant:
        # Explicit dtype so the empty result is still a valid label array.
        return array([], dtype=object)
    return array(redundant)

# Remove uninformative columns. Both kinds are detected on the training
# frame only; the same labels are then dropped from train and test so the
# two frames keep an identical column layout.
constants = identify_constant_features(train)
for frame in (train, test):
    frame.drop(constants, axis=1, inplace=True)

# Duplicates are searched after the constant columns are gone from `train`.
equal_features = identify_equal_features(train)
for frame in (train, test):
    frame.drop(equal_features, axis=1, inplace=True)


# --- Feature engineering -----------------------------------------------------
# Snapshot of the column labels before any derived column is added; every
# derived feature below is computed from exactly these columns.
features = train.columns

# Per-row count of entries equal to zero across the original columns.
train.insert(1, 'SumZeros', (train[features] == 0).astype(int).sum(axis=1))
test.insert(1, 'SumZeros', (test[features] == 0).astype(int).sum(axis=1))

# Replace var38 by log(1 + var38). This happens before the PCA below, so the
# projection sees the transformed values.
train['var38'] = train['var38'].map(np.log1p)
test['var38'] = test['var38'].map(np.log1p)

# Three principal components of the column-normalised original features.
# The PCA is fitted on the training data ONLY and then applied to the test
# data with `transform`. Calling `fit_transform` on the test set (as this
# code did before) re-fits the model, so PCAOne/Two/Three would refer to
# different axes in train and test.
# NOTE(review): normalize(..., axis=0) rescales each column using that
# frame's own values, so train and test are still scaled independently --
# consider reusing the training norms for the test frame.
pca = PCA(n_components = 3)
x_train_projected = pca.fit_transform(normalize(train[features], axis=0))
x_test_projected = pca.transform(normalize(test[features], axis=0))

# Each insert goes to position 1, so the resulting column order after the
# first column is PCAThree, PCATwo, PCAOne, SumZeros.
train.insert(1, 'PCAOne', x_train_projected[:, 0])
train.insert(1, 'PCATwo', x_train_projected[:, 1])
train.insert(1, 'PCAThree', x_train_projected[:, 2])
test.insert(1, 'PCAOne', x_test_projected[:, 0])
test.insert(1, 'PCATwo', x_test_projected[:, 1])
test.insert(1, 'PCAThree', x_test_projected[:, 2])

# Feature selection: fit an extra-trees ensemble on all current columns and
# keep only those whose importance exceeds a small threshold.
etc = ExtraTreesClassifier(
    criterion='entropy',
    n_estimators=500,
    max_features=40,
    max_depth=35,
    min_samples_split=4,
    min_samples_leaf=2,
    random_state=1,
    n_jobs=-1,
)
etc.fit(train, target)

# NOTE: `features` is rebound here from column labels to importance scores.
features = etc.feature_importances_
feat_imp = pd.Series(features, index=train.columns)

# Labels of the columns that pass the importance cut-off.
l = feat_imp[feat_imp > 0.00005].index.tolist()
train, test = train[l], test[l]

# Evaluation protocol: 5 stratified folds (shuffled, fixed seed), scored with
# the 'roc_auc' scorer.
score_metric = 'roc_auc'
skf = StratifiedKFold(target, n_folds=5, shuffle=True, random_state=1)

# Despite the `etc_` prefix this is the XGBoost classifier used as the final
# model; the name is kept because later cells refer to it.
etc_new = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=350,
    max_depth=5,
    learning_rate=0.03,
    subsample=0.8,
    colsample_bytree=0.7,
    nthread=4,
    seed=4242,
    silent=True,
)

def score_model(model):
    """Cross-validate `model` on the notebook-level `train` / `target`.

    Uses the notebook-level fold definition `skf` and scorer name
    `score_metric`, and returns whatever `cross_val_score` produces
    (one score per fold).
    """
    fold_scores = cross_val_score(model, train, target,
                                  scoring=score_metric, cv=skf)
    return fold_scores

# Report the mean cross-validated score. The call form of print matches the
# `print('Started!')` used earlier in this cell and emits the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on 3).
print(score_model(etc_new).mean())

Started!
0.840699684531


In [162]:
# Fit the final model on the full training set and write the submission file.
etc_new.fit(train, target)

# Column 1 of predict_proba is the probability of the second class
# (presumably TARGET == 1 -- confirm against etc_new.classes_).
pred = etc_new.predict_proba(test)[:, 1]

# Build the submission in its own frame instead of inserting 'TARGET' into
# `test`. Mutating `test` made this cell non-re-runnable: on a second run
# predict_proba would receive an extra column and `insert` would raise
# because the column already exists. The CSV written is unchanged: the index
# of `test` followed by a single TARGET column.
submission = pd.DataFrame({'TARGET': pred}, index=test.index)
submission.to_csv('santander_etc.csv')