In [60]:
import numpy as np
import pandas as pd
from IPython.display import display
import visuals as vs
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

In [61]:
traindf = pd.read_csv('./train_data.csv' , index_col='index')
test = pd.read_csv('./test_data.csv')

In [62]:
cont_features = traindf.columns[traindf.columns.str.startswith('cont')] 
cat_features = traindf.columns[traindf.columns.str.startswith('cat')] 

In [65]:
X_train_cont = traindf[cont_features]
X_train_cat = traindf[cat_features]
y_train = traindf.target

In [64]:
X_train_cont.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            169297, 169298, 169299, 169300, 169301, 169302, 169303, 169304,
            169305, 169306],
           dtype='int64', name=u'index', length=169307)

In [66]:
target = traindf['target']
traindf = traindf.drop(['target' , 'connection_id'] , axis=1 )

In [67]:
X_train = traindf[cont_features]
X_test = test[cont_features]

In [68]:
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder(sparse=False)
for col in cat_features:
    data=traindf[[col]].append(test[[col]])
    enc.fit(data)
    # Fitting One Hot Encoding on train data
    temp = enc.transform(traindf[[col]])
    # Changing the encoded features into a data frame with new column names
    temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
            .value_counts().index])
    # In side by side concatenation index values should be same
    # Setting the index values similar to the X_train data frame
    temp=temp.set_index(traindf.index.values)
    # adding the new One Hot Encoded varibales to the train data frame
    X_train=pd.concat([X_train,temp],axis=1)
    # fitting One Hot Encoding on test data
    temp = enc.transform(test[[col]])
    # changing it into data frame and adding column names
    temp=pd.DataFrame(temp,columns=[(col+"_"+str(i)) for i in data[col]
            .value_counts().index])
    # Setting the index for proper concatenation
    temp=temp.set_index(test.index.values)
    # adding the new One Hot Encoded varibales to test data frame
    X_test=pd.concat([X_test,temp],axis=1)


In [69]:
cont_features = traindf.columns[traindf.columns.str.startswith('cont')] 
cat_features = X_train.columns[X_train.columns.str.startswith('cat')] 

In [70]:
X_train_cat = X_train[cat_features]

In [71]:
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'C' : 0.025
    }

In [72]:
from vecstack import stacking
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [73]:
models_cont = [LinearSVC(**svc_params)]

In [74]:
models_cat = [
    ExtraTreesClassifier(**et_params),
    RandomForestClassifier(**rf_params)
    AdaBoostClassifier(**ada_params)
    GradientBoostingClassifier(**gb_params)
]

In [75]:
X_train_cont_sparse = X_train_cont.as_matrix()
X_train_cat_sparse = X_train_cat.as_matrix()
y_train_sparse = y_train.as_matrix()
X_test_cat_sparse = X_test[cat_features].as_matrix()
X_test_cont_sparse = X_test[cont_features].as_matrix()

In [76]:
X_train_cont_sparse.shape

(169307, 18)

In [78]:
y_train.shape

(169307,)

In [79]:
S_train_1 , S_test_1 = stacking(models_cont , X_train_cont_sparse , y_train , X_test_cont_sparse , regression=False , metric=accuracy_score , n_folds = 4, 
    stratified = True, shuffle = True, random_state = 0, verbose = 2)

task:   [classification]
metric: [accuracy_score]

model 0: [LinearSVC]
    fold 0: [0.49166037]
    fold 1: [0.75129350]
    fold 2: [0.68841847]
    fold 3: [0.75270519]
    ----
    MEAN:   [0.67101774]



In [80]:
S_train_2 , S_test_2 = stacking(models_cat , X_train_cat_sparse , y_train , X_test_cat_sparse , regression=False , metric=accuracy_score , n_folds = 4, 
    stratified = True, shuffle = True, random_state = 0, verbose = 2)

task:   [classification]
metric: [accuracy_score]

model 0: [ExtraTreesClassifier]
    fold 0: [0.77620015]
    fold 1: [0.77617124]
    fold 2: [0.77642584]
    fold 3: [0.77687473]
    ----
    MEAN:   [0.77641799]

model 1: [RandomForestClassifier]
    fold 0: [0.77454640]
    fold 1: [0.77437569]
    fold 2: [0.77474838]
    fold 3: [0.77448849]
    ----
    MEAN:   [0.77453974]



In [81]:
model = xgb.XGBClassifier(seed = 0, learning_rate = 0.1, 
    n_estimators = 100, max_depth = 6)
    
# Fit 2-nd level model
scores = cross_val_score(model , np.c_[S_train_1, S_train_2] , y_train , cv=3)
print scores
print np.mean(scores)



[ 0.77691544  0.77666738  0.77652166]
0.776701493855


In [82]:
model.fit(np.c_[S_train_1, S_train_2] , y_train )

XGBClassifier(base_score=0.5, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=6, min_child_weight=1, n_estimators=100,
       nthread=-1, objective='multi:softprob', seed=0, silent=True,
       subsample=1)

In [83]:
# Predict
y_pred = model.predict(np.c_[X_test_cat_sparse , X_test_cont_sparse])

In [87]:
y_pred

array([0, 0, 0, ..., 0, 0, 0])

In [88]:
test['target'] = y_pred

In [22]:
test[['connection_id' , 'target']].to_csv('./result.csv' , index = False)

array([0])