In [1]:
import os
import pandas as pd
import numpy as np
import sklearn

In [2]:
os.listdir('.')

['XGB-fulltrain.csv',
 'trainingData.tsv',
 'EDA.ipynb',
 'Untitled.ipynb',
 'EDA_trainData.ipynb',
 '.ipynb_checkpoints',
 'Untitled1.ipynb',
 'dataDeck.pdf',
 'Ensemble models.ipynb',
 'challengeData.tsv',
 'sampleData.tsv',
 'EDA_Challenge.ipynb',
 'ibsaMain.dictionary.xlsx',
 'submissionFormat03092018.csv',
 'ensemble-wmean_lr_xgb_rf.csv',
 'XGB-entiretrain.csv',
 'nohup.out',
 'lost+found',
 'scoring_set.tsv',
 'Basic models.ipynb',
 'ensemble-mean_lr_xgb_rf.csv']

In [None]:
train = pd.read_table('trainingData.tsv')
test = pd.read_table('scoring_set.tsv')

In [6]:
train=train[train['challenge_data.renewed_yorn'].notnull()]

In [7]:
#remove data set name from columns names to make it uniform
columns=[]
for col in train.columns:
    columns.append(col.replace('challenge_data.',''))
train.columns=columns
columns=[]
for col in test.columns:
    columns.append(col.replace('scoring_set.',''))
test.columns=columns

In [8]:
#remove the 'challenge_data.renewed_yorn' from training and test data and move it to a seperate variable
train_y=train['renewed_yorn']
train = train.drop('renewed_yorn',axis=1)
test_y=test['renewed_yorn']
test = test.drop('renewed_yorn',axis=1)

In [9]:
#remove columns which have any null values
percent = (test.isnull().sum()/test.isnull().count())
test.columns[percent>0]
#We would ignore all these 102 columns so we are left with 135 columns
len(test.columns[percent>0])
#all_clean_columns=train.columns[percent==0]
#train_clean=train[all_clean_columns]
unclean_columns=test.columns[percent>0]
train = train.drop(unclean_columns,axis=1)
test = test.drop(unclean_columns,axis=1)

In [10]:
len(unclean_columns)

103

In [10]:
#Remove columns which have too many categories and is string type
messy_columns = []
for col in test.columns:
    if(test[col].dtype=='object' and len(test[col].unique())>20):
        messy_columns.append(col)

In [11]:
len(messy_columns)

56

In [12]:
train=train.drop(messy_columns,axis=1)
test=test.drop(messy_columns,axis=1)

In [13]:
train.shape

(226140, 77)

In [14]:
#Remove the primary key column for data fitting
train_X = train.drop('innovation_challenge_key',axis=1)
test_X = test.drop('innovation_challenge_key',axis=1)

In [15]:
#convert data to one hot encoding to handle categorical values
train_objs_num = len(train_X)
dataset = pd.concat(objs=[train_X, test_X], axis=0)
dataset = pd.get_dummies(dataset)
train_X = dataset[:train_objs_num]
test_X = dataset[train_objs_num:]

In [16]:
#Replace all NaN left with 0
train_X = train_X.fillna(0)
test_X = test_X.fillna(0)

In [17]:
train_X.isnull().any().any()

False

In [18]:
test_X.isnull().any().any()

False

In [19]:
#split the data between train and validation set
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(train_X,train_y,test_size=0.33, random_state=42)

In [47]:
from sklearn.neural_network import MLPClassifier
import time
t1 = time.time()
clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(15,10,5), random_state=1)
clf.fit(X_train, y_train) 
t2 =time.time()

In [48]:
t2-t1

19.932247638702393

In [49]:
pred=clf.predict_proba(X_valid)

In [50]:
from sklearn.metrics import log_loss
log_loss(y_valid,pred[:,1])

2.3142581658067005

In [66]:
#Time for xgboost
import xgboost as xgb
t1 = time.time()
model1 = xgb.XGBClassifier()
#model2 = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
xgb1 = model1.fit(X_train, y_train)
#xgb2 = model2.fit(X_train.iloc[0:10,], y_train[0:10])
t2 = time.time()

In [67]:
t2-t1

75.73963451385498

In [70]:
pred=xgb1.predict_proba(X_valid)

In [73]:
log_loss(y_valid,pred[:,1])

0.549033039678095

In [79]:
#submit the prediction score
submit = pd.DataFrame()
submit['INNOVATION_CHALLENGE_KEY'] = test['innovation_challenge_key']
#RENEWAL_PROBABLIITY
submit['RENEWAL_PROBABLIITY']=xgb1.predict_proba(test_X)[:,1]
submit=submit.sort_values('INNOVATION_CHALLENGE_KEY')
submit.to_csv('XGB-default.csv',index=False)

In [77]:
len(pred)

74627

In [81]:
#Let's do a little Gridsearch, Hyperparameter Tunning
model3 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
t1 = time.time()
xgb3 = model3.fit(X_train, y_train)
t2 = time.time()

In [82]:
t2-t1

564.9680795669556

In [83]:
pred=xgb1.predict_proba(X_valid)
log_loss(y_valid,pred[:,1])

0.549033039678095

In [None]:
from sklearn.model_selection import GridSearchCV
param_test = {
 'max_depth':[4,5,6],
 'min_child_weight':[4,5,6]
}
t1 = time.time()
gsearch = GridSearchCV(estimator = xgb.XGBClassifier( learning_rate=0.1, n_estimators=140, max_depth=5,
 min_child_weight=2, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test, scoring='neg_log_loss',n_jobs=4,iid=False, cv=5)
train_model4 = gsearch.fit(X_train, y_train)
pred4 = train_model4.predict(X_valid)
t1 = time.time()

In [None]:
t1