In [1]:
import os
import pandas as pd
import numpy as np
import sklearn
import itertools
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import log_loss

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import cross_val_score, train_test_split

from mlxtend.plotting import plot_learning_curves
from mlxtend.plotting import plot_decision_regions

In [2]:
train = pd.read_table('challengeData.tsv')
test = pd.read_table('scoring_set.tsv')

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
train=train[train['challenge_data.renewed_yorn'].notnull()]

In [4]:
#remove data set name from columns names to make it uniform
columns=[]
for col in train.columns:
    columns.append(col.replace('challenge_data.',''))
train.columns=columns
columns=[]
for col in test.columns:
    columns.append(col.replace('scoring_set.',''))
test.columns=columns

In [5]:
#remove the 'challenge_data.renewed_yorn' from training and test data and move it to a seperate variable
train_y=train['renewed_yorn']
train = train.drop('renewed_yorn',axis=1)
test_y=test['renewed_yorn']
test = test.drop('renewed_yorn',axis=1)

In [6]:
#remove columns which have any null values
percent = (test.isnull().sum()/test.isnull().count())
test.columns[percent>0]
#We would ignore all these 102 columns so we are left with 135 columns
len(test.columns[percent>0])
#all_clean_columns=train.columns[percent==0]
#train_clean=train[all_clean_columns]
unclean_columns=test.columns[percent>0]
train = train.drop(unclean_columns,axis=1)
test = test.drop(unclean_columns,axis=1)

In [7]:
len(unclean_columns)

103

In [8]:
#Remove columns which have too many categories and is string type
messy_columns = []
for col in test.columns:
    if(test[col].dtype=='object' and len(test[col].unique())>20):
        messy_columns.append(col)

In [9]:
len(messy_columns)

56

In [10]:
train=train.drop(messy_columns,axis=1)
test=test.drop(messy_columns,axis=1)

In [11]:
train.shape

(226140, 77)

In [12]:
#Remove the primary key column for data fitting
train_X = train.drop('innovation_challenge_key',axis=1)
test_X = test.drop('innovation_challenge_key',axis=1)

In [13]:
#convert data to one hot encoding to handle categorical values
train_objs_num = len(train_X)
dataset = pd.concat(objs=[train_X, test_X], axis=0)
dataset = pd.get_dummies(dataset)
train_X = dataset[:train_objs_num]
test_X = dataset[train_objs_num:]

In [14]:
#Replace all NaN left with 0
train_X = train_X.fillna(0)
test_X = test_X.fillna(0)

In [15]:
train_X.isnull().any().any()

False

In [16]:
test_X.isnull().any().any()

False

In [17]:
#split the data between train and validation set
X_train, X_valid, y_train, y_valid = train_test_split(train_X,train_y,test_size=0.33, random_state=42)

In [19]:
#Trying ensembeling techniques
#Starting with bagging
clf1 = DecisionTreeClassifier(criterion='entropy', max_depth=1)

bagging1 = BaggingClassifier(base_estimator=clf1, n_estimators=10, max_samples=0.8, max_features=0.8)

In [33]:
bagging1.fit(X_train,y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=0.8,
         max_samples=0.8, n_estimators=10, n_jobs=1, oob_score=False,
         random_state=None, verbose=0, warm_start=False)

In [21]:
pred = bagging1.predict_proba(X_valid)

In [22]:
log_loss(y_valid,pred[:,1])

0.5972742234071063

In [23]:
#Trying Boosting
from sklearn import svm
svc= svm.SVC(kernel='rbf',probability=True)
num_est = [1, 2, 3, 10]
label = ['AdaBoost (n_est=1)', 'AdaBoost (n_est=2)', 'AdaBoost (n_est=3)', 'AdaBoost (n_est=10)']

In [105]:
from sklearn.ensemble import AdaBoostClassifier
boosting = AdaBoostClassifier(base_estimator=svc, n_estimators=10)

In [None]:
boosting.fit(X_train,y_train)

In [None]:
pred=boosting.predict_proba(X_valid)
log_loss(y_valid,pred)

In [None]:
pred = bagging1.predict_proba(X_valid)
log_loss(y_valid,pred[:,1])

In [71]:
#Trying stacking of 3 classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier
import xgboost as xgb

clf1 = RandomForestClassifier(max_depth=4,random_state=0)
clf2 = xgb.XGBClassifier()
lr = LogisticRegression(max_iter=100)

In [72]:
clf1.fit(X_train,y_train)
clf2.fit(X_train,y_train)
lr.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [73]:
pred_clf1 = clf1.predict_proba(X_valid)
pred_clf2 = clf2.predict_proba(X_valid)
pred_lr = lr.predict_proba(X_valid)

In [74]:
pred=[]
for i in range(0,len(pred_lr)):
    pred.append(np.mean([pred_clf1[i][1],pred_clf2[i][1],pred_lr[i][1]]))

In [75]:
log_loss(y_valid,pred)

0.567010870428388

In [None]:
pred_clf1 = clf1.predict_proba(X_valid)
pred_clf2 = clf2.predict_proba(X_valid)
pred_lr = lr.predict_proba(X_valid)
pred=[]
for i in range(0,len(pred_lr)):
    pred.append(np.mean([pred_clf1[i][1],pred_clf2[i][1],pred_lr[i][1]]))

In [101]:
pred_clf1 = clf1.predict_proba(test_X)
pred_clf2 = clf2.predict_proba(test_X)
pred_lr = lr.predict_proba(test_X)
weights = [2,3,1]
pred=[]
for i in range(0,len(pred_lr)):
    x=pred_clf1[i][1]*weights[0]+pred_clf2[i][1]*weights[1]+pred_lr[i][1]*weights[2]
    pred.append(x/6)

In [102]:
#submit the prediction score
submit = pd.DataFrame()
submit['INNOVATION_CHALLENGE_KEY'] = test['innovation_challenge_key']
#RENEWAL_PROBABLIITY
submit['RENEWAL_PROBABLIITY']=pred
submit=submit.sort_values('INNOVATION_CHALLENGE_KEY')
submit.to_csv('ensemble-wmean_lr_xgb_rf.csv',index=False)

In [80]:
#trying out svm on very small data
from sklearn import svm
import time
t1=time.time()
svc= svm.SVC(kernel='rbf',probability=True)
svc.fit(X_train.iloc[0:10000,], y_train[0:10000])
t2=time.time()
pred = svc.predict_proba(X_valid)
log_loss(y_valid,pred[:,1])

0.5893879389214136

In [103]:
#Trying boosting with svm
X_train.shape

(151513, 257)