In [1]:
# Importing Libraries  #0.859331533603457
import os
import pandas as pd
import numpy as np
import math
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
warnings.filterwarnings('ignore')

In [2]:
# Load the labelled (train) and unlabelled (test) CSVs from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Number of missing values in each training column
train.isnull().sum(axis=0)

ID                                         0
Gender                                     0
DOB                                       15
Lead_Creation_Date                         0
City_Code                                814
City_Category                            814
Employer_Code                           4018
Employer_Category1                      4018
Employer_Category2                      4298
Monthly_Income                             0
Customer_Existing_Primary_Bank_Code     9391
Primary_Bank_Type                       9391
Contacted                                  0
Source                                     0
Source_Category                            0
Existing_EMI                              51
Loan_Amount                            27709
Loan_Period                            27709
Interest_Rate                          47437
EMI                                    47437
Var1                                       0
Approved                                   0
dtype: int

In [3]:
# Class balance of the target: rows per value of `Approved`
train.loc[:, 'Approved'].value_counts()

0    68693
1     1020
Name: Approved, dtype: int64

In [4]:
# Column dtypes and non-null counts of the raw training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69713 entries, 0 to 69712
Data columns (total 22 columns):
ID                                     69713 non-null object
Gender                                 69713 non-null object
DOB                                    69698 non-null object
Lead_Creation_Date                     69713 non-null object
City_Code                              68899 non-null object
City_Category                          68899 non-null object
Employer_Code                          65695 non-null object
Employer_Category1                     65695 non-null object
Employer_Category2                     65415 non-null float64
Monthly_Income                         69713 non-null float64
Customer_Existing_Primary_Bank_Code    60322 non-null object
Primary_Bank_Type                      60322 non-null object
Contacted                              69713 non-null object
Source                                 69713 non-null object
Source_Category                    

In [4]:
# Pre-processing: keep the row identifiers and the label aside, then remove
# them from the feature frames so only predictors remain.
train_id = train['ID']
test_id = test['ID']
target = train['Approved']
train = train.drop(['ID', 'Approved'], axis='columns')
test = test.drop(['ID'], axis='columns')

In [5]:
# Stack train on top of test (fresh 0..n-1 index) so the later transforms
# are applied to both sets at once
data = pd.concat(objs=[train, test], axis=0, ignore_index=True)

In [6]:
# Handling dates
import datetime

# NOTE(review): parsing relies on pandas' format inference. If the CSVs store
# day-first dates, pass dayfirst=True or an explicit format -- confirm against the data.
data['DOB'] = pd.to_datetime(data['DOB'])
data['Lead_Creation_Date'] = pd.to_datetime(data['Lead_Creation_Date'])

# Missing dates are replaced by the Unix epoch (1970-01-01). The fill value is an
# explicit Timestamp rather than the integer 0: an int fill on a datetime column
# depends on implicit coercion (newer pandas rejects it or upcasts to object, which
# breaks the `.dt` accessor used afterwards). The result is assigned back instead of
# using a chained `inplace=True`, which is not guaranteed to modify `data`.
epoch = pd.Timestamp(0)
for date_col in ('DOB', 'Lead_Creation_Date'):
    data[date_col] = data[date_col].fillna(epoch)

In [7]:
# Extracting key attributes from dates
lead_date = data['Lead_Creation_Date']

# Taking care of DOBs that land in the future (year > 2018): shift them back one
# century. Presumably these come from two-digit years in the raw data being read
# as 20xx -- confirm. The correction is applied to the date itself BEFORE deriving
# any feature, so that Age is computed from the corrected birth date (previously
# only DOB_year was fixed, after Age had already been derived, leaving negative ages).
dob = data['DOB'].copy()
wrong_century = dob.dt.year > 2018
dob.loc[wrong_century] = dob.loc[wrong_century] - pd.DateOffset(years=100)

data['DOB_month'] = dob.dt.month
data['DOB_day'] = dob.dt.day
data['DOB_year'] = dob.dt.year

data['Creation_Month'] = lead_date.dt.month
data['Creation_Day'] = lead_date.dt.day

# Age at lead creation, in whole days. `.dt.days` reads the day component
# directly instead of round-tripping the timedelta through its string form.
data['Age'] = (lead_date - dob).dt.days


In [8]:
# Dropping DOB and Lead_Creation_Date now that their derived features exist
data = data.drop(['DOB', 'Lead_Creation_Date'], axis='columns')

In [9]:
# Label Encoding
from sklearn import preprocessing

# Replace every object-dtype column by integer codes, one fresh encoder per column.
# Values are cast to str first: several of these columns contain missing values
# (float NaN) next to strings, and LabelEncoder cannot sort such a mix on Python 3.
# After the cast, missing entries form their own 'nan' category.
for col in data.select_dtypes(include=['object']).columns:
    data[col] = preprocessing.LabelEncoder().fit_transform(data[col].astype(str))


In [10]:
# Splitting back into train and test: the first len(train) rows of `data`
# are the training rows, the remainder are the test rows
train, test = data.iloc[:len(train)], data.iloc[len(train):]

In [11]:
# Model 1: XGBClassifier -- aliases (not copies) of the prepared frames and target
train1, test1, target1 = train, test, target

In [12]:
# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    train1,
    target1,
    test_size=0.2,
    random_state=0,
)

In [13]:
# Modelling imports for the XGBoost sections below.
from sklearn import preprocessing, metrics
from sklearn.model_selection import KFold
import random
# Seeds Python's built-in `random` module only.
# NOTE(review): no later cell visible here draws from `random`; the estimators
# below are given their own seed / random_state -- confirm this call is needed.
random.seed(3)
from xgboost import XGBClassifier

In [14]:
# Ratio of negative to positive labels; passed to XGBoost as scale_pos_weight
weight = float((target1 == 0).sum()) / float((target1 == 1).sum())

In [15]:
# Fitting XGBoost to the Training set
# Only scale_pos_weight is overridden (with the negative/positive ratio computed
# above); every other hyper-parameter is left at the library default.
classifier = XGBClassifier(scale_pos_weight=weight)
# Left as a bare expression so the fitted estimator's repr is shown as cell output.
classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=67.34607843137255, seed=0, silent=True,
       subsample=1)

In [None]:
# Grid search over the L1 regularisation strength (reg_alpha), scored by ROC AUC
# with 5-fold cross-validation on the full labelled set.
#
# Uses sklearn.model_selection (already imported elsewhere in this notebook) instead
# of the removed sklearn.cross_validation / sklearn.grid_search modules. The `iid`
# argument and the `grid_scores_` attribute no longer exist in current scikit-learn,
# so `iid` is dropped and results are read from `cv_results_`.
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

param = {
    'reg_alpha': [0.1, 0.2, 0.3, 0.4],
}
predictors = data.columns
gsearch = GridSearchCV(
    estimator=XGBClassifier(
        colsample_bylevel=1, colsample_bytree=1,
        gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=3,
        min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
        objective='binary:logistic', reg_lambda=1,
        # Same class-imbalance ratio as Model 1, taken from `weight` rather than
        # a copied literal so the two cannot drift apart.
        scale_pos_weight=weight, seed=0,
        subsample=1),
    param_grid=param, scoring='roc_auc', n_jobs=4, cv=5)
gsearch.fit(train[predictors], target)

# One row per candidate: parameters, mean and std of the test-fold AUC
cv_summary = pd.DataFrame(gsearch.cv_results_)[['params', 'mean_test_score', 'std_test_score']]
cv_summary, gsearch.best_params_, gsearch.best_score_

In [None]:
# Predicting the Test set results
# Hard class labels for the 20% hold-out split (X_test), not the competition
# test set. NOTE(review): y_pred is not used by any later cell visible here.
y_pred = classifier.predict(X_test)

In [None]:
def create_feature_map(features, path='classifier.fmap'):
    """Write a feature-map file with one tab-separated line per feature.

    Each line is ``<index>\\t<feature name>\\tq`` where ``index`` counts the
    features from 0 in iteration order.

    Parameters
    ----------
    features : iterable
        Feature names, in the order they should be numbered.
    path : str, optional
        Destination file; defaults to ``'classifier.fmap'`` (the file name
        this function always wrote before the parameter existed).
    """
    # The context manager closes the file even if formatting or writing raises.
    with open(path, 'w') as outfile:
        for index, feat in enumerate(features):
            outfile.write('{0}\t{1}\tq\n'.format(index, feat))

In [None]:
# Plotting Feature Importances
import operator
create_feature_map(data.columns)
# The sklearn wrapper's accessor for the underlying Booster is `get_booster()` in
# later xgboost releases and `booster()` in earlier ones; use whichever exists.
get_booster = getattr(classifier, 'get_booster', None) or classifier.booster
importance = get_booster().get_fscore(fmap='classifier.fmap')
# Ascending by score, so the horizontal bar chart lists the highest score at the top
importance = sorted(importance.items(), key=operator.itemgetter(1))

df = pd.DataFrame(importance, columns=['feature', 'fscore'])
# Normalise so the scores sum to 1
df['fscore'] = df['fscore'] / df['fscore'].sum()
# DataFrame.plot opens its own figure; a preceding plt.figure() would only leave
# an empty extra figure in the output, so the returned Axes is used directly.
ax = df.plot(kind='barh', x='feature', y='fscore', legend=False, figsize=(6, 10))
ax.set_title('XGBoost Feature Importance')
ax.set_xlabel('Relative Importance')
plt.show()

In [None]:
# Model 1 scores for the competition test set: column 1 of predict_proba
# (the probability of the positive class)
predict = classifier.predict_proba(test)[:, 1]

In [None]:
# Model 2 : XGBoost -- aliases (not copies) of the prepared frames and target
train2, test2, target2 = train, test, target

In [None]:
import xgboost as xgb

# Wrap the frames in XGBoost's native matrix type; NaN marks a missing value
dmatrix_options = {'missing': np.nan}
dtrain = xgb.DMatrix(train2, label=target2, **dmatrix_options)
dtest = xgb.DMatrix(test2, **dmatrix_options)

In [None]:
params = {'booster':'gbtree', 'objective':'multi:softprob', 'max_depth':4, 'num_class': 3, 'seed': 0,
          'eta':0.1, 'nthread':4, 'subsample':0.9, 'scale_pos_weight':weight}

In [None]:
num_rounds = 350
clf_xgb = xgb.train(params, dtrain, num_rounds)
xgb_preds = clf_xgb.predict(dtest)
xgb_preds = xgb_preds[:,1]

In [None]:
# Ensembling: weighted blend of the two models' positive-class scores
# (60% Model 1, 40% Model 2). The former `ens = predict.copy()` was a dead
# assignment, overwritten immediately, and has been removed.
ens = predict * 0.6 + xgb_preds * 0.4

In [None]:
# Final Submission: two-column CSV (ID, Approved) plus a download link
from IPython.display import FileLink
filename = 'final.csv'
sub_3 = pd.DataFrame({'ID': test_id, 'Approved': ens}, columns=['ID', 'Approved'])
sub_3.to_csv(filename, index=False)
FileLink(filename)