In [1]:
import pandas as pd, numpy as np, matplotlib.pyplot as plt,seaborn as sns, warnings, requests, datetime

In [2]:
# Silence every warning for the rest of the session. Note that this also hides
# deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
# Read every input table up front. The files are resolved relative to the
# notebook's working directory; the *.zip archives are read directly by pandas.
demo_train, perf_train, prev_train = map(
    pd.read_csv, ('traindemographics.csv', 'trainperf.csv', 'trainprevloans.zip')
)
demo_test, perf_test, prev_test = map(
    pd.read_csv, ('testdemographics.csv', 'testperf.csv', 'testprevloans.zip')
)
submission, geo_test, geo_train = map(
    pd.read_csv, ('SampleSubmission.csv', 'geo_test.csv', 'geo_train.csv')
)

In [4]:
# Remove the 'Unnamed: 0' column that came in with the geo CSVs
# (presumably a saved row index - TODO confirm).
# Reassigning with errors='ignore' instead of dropping in place keeps the cell
# re-runnable: executing it a second time is a no-op rather than a KeyError.
geo_train = geo_train.drop(columns='Unnamed: 0', errors='ignore')
geo_test = geo_test.drop(columns='Unnamed: 0', errors='ignore')

In [5]:
# Attach the geo columns to the demographics. A left join keeps every
# demographics row, including customers without a geo record.
demo_train = demo_train.merge(geo_train, how='left', on='customerid')
demo_test = demo_test.merge(geo_test, how='left', on='customerid')

In [6]:
# Reorder perf_test so its rows follow the customer order of the submission
# template (predictions are later written back positionally). The .loc lookup
# raises KeyError if a submission customerid is missing from perf_test.
perf_test = perf_test.set_index('customerid').loc[submission['customerid']].reset_index()

<h3>Treating Missing Values</h3>

In [7]:
# Number of distinct values per column of the test performance table.
perf_test.nunique()

customerid      1450
systemloanid    1450
loannumber        20
approveddate    1179
creationdate    1186
loanamount        10
totaldue          37
termdays           4
referredby       178
dtype: int64

In [8]:
# Columns available once the geo data has been joined onto the demographics.
demo_train.columns

Index(['customerid', 'birthdate', 'bank_account_type', 'longitude_gps',
       'latitude_gps', 'bank_name_clients', 'bank_branch_clients',
       'employment_status_clients', 'level_of_education_clients', 'Country',
       'State', 'LGA'],
      dtype='object')

In [9]:
# 'bank_branch_clients' is not used as a feature, so discard it from both splits.
# Reassigning with errors='ignore' instead of dropping in place keeps the cell
# re-runnable: executing it a second time is a no-op rather than a KeyError.
demo_train = demo_train.drop(columns='bank_branch_clients', errors='ignore')
demo_test = demo_test.drop(columns='bank_branch_clients', errors='ignore')

In [10]:
# Missing employment status becomes its own explicit 'Unknown' category in both splits.
for _demo in (demo_train, demo_test):
    _demo['employment_status_clients'] = _demo['employment_status_clients'].fillna('Unknown')

In [11]:
# Missing education level becomes its own explicit 'No Level' category in both splits.
for _demo in (demo_train, demo_test):
    _demo['level_of_education_clients'] = _demo['level_of_education_clients'].fillna('No Level')

<h3>The dataset contains no outliers, so let's move on to feature engineering</h3>

<h3>Feature Engineering</h3>

In [12]:
# Names of columns to hand to CatBoost as native categorical features (passed
# through cb_def). It stays empty in this notebook: the code that appended to
# it inside demo_feat_eng is commented out and one-hot encoding is used instead.
cat_features = []

In [13]:
# Module-level cache of the most recently computed value counts.
# Callers reset it to None before switching to a different column.
count_ser = None


def get_count(series, x):
    """Return how many times ``x`` occurs in ``series`` (count encoding helper).

    The value counts are cached in the module-level ``count_ser`` so that
    repeated look-ups (e.g. from ``Series.apply``) do not recount every time.
    The cache is rebuilt from ``series`` when it is empty or does not contain
    ``x``. It is NOT invalidated when a different series holding the same key
    is passed, so set ``count_ser = None`` before changing columns.

    Parameters
    ----------
    series : pandas.Series
        Column whose values are counted.
    x : hashable
        Value to look up.

    Returns
    -------
    int
        Number of occurrences of ``x``.

    Raises
    ------
    KeyError
        If ``x`` does not occur in ``series`` even after recounting.
    """
    global count_ser
    # Explicit cache check instead of a bare ``except:``, which also swallowed
    # unrelated errors (including KeyboardInterrupt) and hid real failures.
    if count_ser is None or x not in count_ser.index:
        count_ser = series.value_counts()
    return count_ser[x]
        

In [14]:
def demo_feat_eng(data, reference_date=None):
    """Turn the raw demographics table into numeric model features.

    Works on a copy; ``data`` itself is left untouched. The steps are:

    * ``birthdate`` is replaced by an integer ``Age`` column;
    * ``bank_account_type``, ``employment_status_clients``,
      ``level_of_education_clients``, ``bank_name_clients`` and ``Country`` are
      one-hot encoded (``drop_first=True``) and the dummy columns are appended
      in that order, each source column being removed;
    * ``State`` and ``LGA`` are dropped without being encoded.

    All other columns (e.g. ``customerid`` and the GPS coordinates) pass through.

    Parameters
    ----------
    data : pandas.DataFrame
        Demographics containing the columns named above.
    reference_date : datetime-like, optional
        Date the ages are measured from. Defaults to the current date/time,
        which is what the function always used before; pass a fixed date to
        make ``Age`` reproducible across runs.

    Returns
    -------
    pandas.DataFrame
        Feature frame with the same row order and index as ``data``.
    """
    if reference_date is None:
        reference_date = datetime.datetime.today()

    data = data.copy()

    elapsed = pd.Timestamp(reference_date) - pd.to_datetime(data['birthdate'])
    # Whole years (365-day approximation) minus a fixed offset of 2.
    # NOTE(review): the "- 2" is kept from the original code; presumably it
    # shifts ages back to the data-collection year - confirm.
    # The int64 cast raises if a birthdate is missing.
    data['Age'] = (elapsed.dt.days / 365).astype(np.int64) - 2
    data = data.drop(columns='birthdate')

    one_hot_columns = (
        'bank_account_type',
        'employment_status_clients',
        'level_of_education_clients',
        'bank_name_clients',
        'Country',
    )
    for column in one_hot_columns:
        dummies = pd.get_dummies(data[[column]], drop_first=True)
        data = pd.concat([data.drop(columns=column), dummies], axis=1)

    # High-cardinality location columns are discarded rather than encoded.
    return data.drop(columns=['State', 'LGA'])
    
    

In [15]:
# Each split is encoded on its own, so the two frames only share a dummy column
# when the corresponding category value occurs in both splits.
demo_train_feat = demo_feat_eng(demo_train)
demo_test_feat = demo_feat_eng(demo_test)

In [16]:
# Columns of the previous-loans table, for reference when engineering features.
prev_train.columns

Index(['customerid', 'systemloanid', 'loannumber', 'approveddate',
       'creationdate', 'loanamount', 'totaldue', 'termdays', 'closeddate',
       'referredby', 'firstduedate', 'firstrepaiddate'],
      dtype='object')

In [17]:
def prev_feat_eng(data):
    """Derive numeric features from the previous-loans table.

    Two columns are added to a copy of ``data``:

    * ``repayment_default_days`` - whole days of ``firstduedate`` minus
      ``firstrepaiddate`` (positive when the first repayment came before the
      due date, negative when it came after);
    * ``is_referred`` - 1 when ``referredby`` holds a non-empty value, else 0.

    The date columns, ``referredby`` and ``systemloanid`` are then removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Previous-loan records.

    Returns
    -------
    pandas.DataFrame
        New frame; ``data`` is not modified.
    """
    frame = data.copy()

    due_date = pd.to_datetime(frame['firstduedate'])
    repaid_date = pd.to_datetime(frame['firstrepaiddate'])
    frame['repayment_default_days'] = (due_date - repaid_date).dt.days.astype(np.int64)

    # Missing referrers become "", which is falsy and therefore maps to 0.
    frame['is_referred'] = frame['referredby'].fillna("").apply(lambda ref: int(bool(ref)))

    consumed = [
        'approveddate', 'creationdate', 'closeddate',
        'firstduedate', 'firstrepaiddate',
        'referredby', 'systemloanid',
    ]
    return frame.drop(columns=consumed)
    

In [18]:
# Engineer the previous-loan features for both splits.
prev_train_feat = prev_feat_eng(prev_train)
prev_test_feat = prev_feat_eng(prev_test)

In [19]:
def perf_feat_eng(data):
    """Derive features from the current-loan performance table.

    Adds ``is_referred`` (1 when ``referredby`` holds a non-empty value, else 0)
    to a copy of ``data`` and removes ``approveddate``, ``creationdate``,
    ``referredby``, ``systemloanid`` and ``loanamount``. Every other column
    passes through unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Current-loan records.

    Returns
    -------
    pandas.DataFrame
        New frame; ``data`` is not modified.
    """
    frame = data.copy()

    # Missing referrers become "", which is falsy and therefore maps to 0.
    frame['is_referred'] = frame['referredby'].fillna("").apply(lambda ref: int(bool(ref)))

    consumed = ['approveddate', 'creationdate', 'referredby', 'systemloanid', 'loanamount']
    return frame.drop(columns=consumed)

In [20]:
# Engineer the current-loan features for both splits.
perf_train_feat = perf_feat_eng(perf_train)
perf_test_feat = perf_feat_eng(perf_test)

<h3>Merging the datasets together</h3>

In [21]:
def merging(demo, prev, perf,mode):
    """Assemble one row per current loan from the three feature tables.

    * ``demo`` is de-duplicated on ``customerid`` (first occurrence wins);
    * ``prev`` is collapsed to the per-customer mean of every column, the
      resulting columns being named ``<column>_mean``;
    * both are right-joined onto ``perf``, so the result has exactly the rows
      of ``perf`` in ``perf``'s order; customers without history or
      demographics get NaN in those columns.

    The shape after each join is printed as a sanity check.

    Parameters
    ----------
    demo, prev, perf : pandas.DataFrame
        Demographic, previous-loan and current-loan feature frames, each with
        a ``customerid`` column.
    mode : str
        Not used by the function body.

    Returns
    -------
    pandas.DataFrame
    """
    demo_unique = demo.drop_duplicates(subset=['customerid']).reset_index(drop=True)

    history = prev.groupby('customerid').agg(['mean'])
    # Flatten the (column, statistic) pairs into single "column_statistic" names.
    history.columns = [f"{column}_{stat}".strip() for column, stat in history.columns]
    history = history.reset_index()

    with_history = history.merge(perf, how='right', on='customerid')
    print(with_history.shape)

    combined = demo_unique.merge(with_history, how='right', on='customerid')
    print(combined.shape)
    return combined

In [22]:
# Demographics and loan history are pooled across both splits before joining,
# so a customer is matched regardless of which split their records sit in.
demo_all_feat = pd.concat([demo_train_feat, demo_test_feat], axis=0)
prev_all_feat = pd.concat([prev_train_feat, prev_test_feat], axis=0)

train_df = merging(demo_all_feat, prev_all_feat, perf_train_feat, 'train')
test_df = merging(demo_all_feat, prev_all_feat, perf_test_feat, 'test')

(4368, 12)
(4368, 57)
(1450, 11)
(1450, 56)


<h3>Filling missing values by imputation</h3>

In [23]:
# Filling missing values by imputation
from sklearn.impute import SimpleImputer
#train
def imputation_(data):
    """Fill missing values with each column's most frequent value.

    Only columns that contain at least one missing AND at least one present
    value are imputed. The frame is returned unchanged (as a copy) when there
    is nothing to impute.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain NaN values.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with the imputable columns filled. Columns that are
        entirely NaN are left as they are.
    """
    data = data.copy()

    has_missing = data.isna().any()
    has_value = data.notna().any()
    # Skip all-NaN columns: they have no most-frequent value, and SimpleImputer
    # discards such columns by default, so the write-back below would no longer
    # line up with the selected column list.
    cols_to_fill = data.columns[has_missing & has_value]

    # Guard the empty case: fitting the imputer on a zero-column selection
    # raises instead of returning the data untouched.
    if len(cols_to_fill) == 0:
        return data

    imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    data[cols_to_fill] = imp.fit_transform(data[cols_to_fill])
    return data

In [24]:
# Each split is imputed separately: imputation_ fits a fresh imputer per call,
# so the test frame is filled with its own most-frequent values, not the
# training frame's.
train_df = imputation_(train_df)
test_df = imputation_(test_df)

In [25]:
def remove_customer_id(data):
    """Return a new frame without the ``customerid`` identifier column.

    Raises ``KeyError`` if ``data`` has no ``customerid`` column.
    """
    return data.drop(columns=['customerid'])

In [26]:
# The identifier carries no predictive signal; train_df / test_df keep it for reference.
working_train_df = remove_customer_id(train_df)
working_test_df = remove_customer_id(test_df)

<h3>Extracting the target feature from the training dataset</h3>

In [27]:
from sklearn.preprocessing import LabelEncoder
def processing_target(data):
    """Separate the ``good_bad_flag`` label from the features and encode it.

    Parameters
    ----------
    data : pandas.DataFrame
        Training frame that includes the ``good_bad_flag`` column.

    Returns
    -------
    tuple
        ``(features, target, label_encoder)``: the frame without the label
        column, the label-encoded target array, and the fitted ``LabelEncoder``
        (kept so predictions can be mapped back to the original labels).
        ``data`` itself is not modified.
    """
    features = data.drop(columns='good_bad_flag')
    encoder = LabelEncoder()
    encoded_target = encoder.fit_transform(data['good_bad_flag'])
    return features, encoded_target, encoder

In [28]:
# X: feature frame, y: integer-encoded target, lbe: the fitted LabelEncoder.
X , y, lbe = processing_target(working_train_df)

<h3>Feature Scaling</h3>

In [29]:
# from sklearn.preprocessing import StandardScaler, MinMaxScaler
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)
# working_test_df = scaler.transform(working_test_df)

<h1>Model Fitting</h1>

In [30]:
# Single seed reused for NumPy's global RNG, the train/test split and the models.
seed = 1
np.random.seed(seed)

In [31]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the labelled data. No `stratify` argument is passed, so the
# class balance of the two parts is left to the random shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

In [32]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [33]:
# Per-class weights for the imbalanced target, using scikit-learn's "balanced"
# heuristic (weights inversely proportional to class frequency).
# Two fixes over the previous version:
#   * the helper is imported by name, so binding the result to `class_weight`
#     no longer shadows the sklearn module and the cell can be re-run;
#   * arguments are passed by keyword, which current scikit-learn releases
#     require for compute_class_weight.
from sklearn.utils.class_weight import compute_class_weight

_classes = np.unique(y)
class_weight = dict(zip(
    _classes,
    compute_class_weight(class_weight='balanced', classes=_classes, y=y),
))

In [34]:
import optuna

In [35]:
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

In [38]:
# def objective(trial):
#     params_lgbm={
#       'n_estimators':trial.suggest_int('n_estimators', 0, 1000), 
#       'num_leaves':trial.suggest_int('num_leaves', 2, 512),
#       'max_depth':trial.suggest_int('max_depth', 2, 128),
#       'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
#       'min_split_gain': trial.suggest_loguniform('min_split_gain', 0.001, 0.1),
#       'feature_fraction':trial.suggest_uniform('feature_fraction',0.1, 1.0),
#       'bagging_freq':trial.suggest_int('bagging_freq',0.1,10),
# #         'early_stopping_rounds':100,
#       'random_state':seed
#             }
#     params_xgb = {
#         'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
#         'max_depth': trial.suggest_int('max_depth', 2, 20),
#         'min_child_weight': trial.suggest_int('max_depth', 1, 20),
#         'subsample': trial.suggest_uniform('subsample',0.1, 1.0),
#         'colsample_bytree': trial.suggest_uniform('colsample_bytree',0.1, 1.0),
#               'n_estimators':trial.suggest_int('n_estimators', 0, 1000), 
#                     'random_state':seed
# #         'objective': ['binary:logistic']
#     }
#     params_forest={
#       'n_estimators':trial.suggest_int('n_estimators', 200, 1000), 
#     'max_features':trial.suggest_categorical('max_features', ['auto', 'sqrt']),
#       'max_depth':trial.suggest_int('max_depth', 2, 10),
#         'min_samples_split': trial.suggest_int('min_samples_split', 2, 10),
#         'min_samples_leaf': trial.suggest_int('min_samples_leaf', 2, 10),
#         'bootstrap':trial.suggest_categorical('bootstrap', [True, False]),
#       'random_state':seed
#     }
#     params_knn={
#       'leaf_size':trial.suggest_int('leaf_size', 1, 200), 
#       'n_neighbors':trial.suggest_int('n_neighbors', 1, 200), 
#     'p':trial.suggest_categorical('p', [1, 2,3,4,5])
#     }
    
#     params_catboost={
#       'n_estimators':1000, 
#       'subsample':trial.suggest_uniform('subsample', 0.1, 1.0),
#       'max_depth':trial.suggest_int('max_depth', 1, 10),
#       'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.15),
#       'random_state':seed,
#         'use_best_model':True,
#     'od_type': "Iter",
#   'od_wait': 100,
#             }
#     params_svc = {'C': trial.suggest_float('C', 0.1, 100), 
#                  'gamma': trial.suggest_loguniform('gamma', 0.001, 1.0),
#                  'kernel': trial.suggest_categorical('kernel', ['rbf', 'poly', 'sigmoid']),
#                   'random_state':seed,
#                   'probability':True
#                 }
#     params_logistic = {
#         'C':trial.suggest_loguniform('C', 0.001, 100)
#     }
#     clf = LogisticRegression(**params_logistic)
#     clf.fit(X_train, y_train)
#     clf = CatBoostClassifier(**params_catboost, eval_metric='Accuracy',verbose=0, cat_features=cat_features)
#     clf = LGBMClassifier(**params_lgbm, objective='multiclass')
#     clf = LGBMClassifier(**params_lgbm)
#     clf = XGBClassifier(**params_xgb)
#     clf.fit(X_train, y_train)
#     clf.fit(X_train, y_train, eval_set=(X_test, y_test))
#     return cross_val_score(clf, X, y, n_jobs=-1,cv=3).mean()
#     return accuracy_score(y_test, clf.predict(X_test))

# clf = CatBoostClassifier(**params_catboost, cat_features=cat_feat, eval_metric='Accuracy')
# clf = LGBMClassifier(**params_lgbm, objective='multiclass')
# clf.fit(X_train, y_train, eval_set=(X_test, y_test))

In [39]:
# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials = 200)

In [40]:
#Getting the best parameters of the tuning process
logistic_params = {'C': 0.018520232368098567}
knn_params = {'leaf_size': 89, 'n_neighbors': 65, 'p': 4}
lgb_params = {'n_estimators': 308, 'num_leaves': 484, 'max_depth': 5, 'learning_rate': 0.03736581921793045, 'min_split_gain': 0.02161109975341182, 'feature_fraction': 0.6951873684079937, 'bagging_freq': 4}
# cat_params = {'max_depth': 4, 'learning_rate': 0.029606808629493046}
cat_params = {'max_depth': 3, 'learning_rate': 0.003881677395564221}  #topped, with country
# cat_params = {'max_depth': 6, 'learning_rate': 0.0036995806190383896} with counry
xgb_params = {'learning_rate': 0.005385078545080316, 'max_depth': 3, 'subsample': 0.3659056662933683, 'colsample_bytree': 0.6897493555092099, 'n_estimators': 236}
forest_params = {'n_estimators': 461, 'max_features': 'auto', 'max_depth': 4, 'min_samples_split': 3, 'min_samples_leaf': 9, 'bootstrap': False}
lgb_def = {'early_stopping_rounds':100,
      'random_state':seed,
              'objective':'binary'
            }
cb_def ={
      'n_estimators':1000, 
      'random_state':seed,
        'use_best_model':True,
    'od_type': "Iter",
  'od_wait': 100,
    'eval_metric':'Accuracy',
    'cat_features': cat_features
            }

In [41]:
# Hold-out split used as the default eval_set of the *Corrected estimator
# wrappers defined below; it is captured when those classes are defined.
eval_set = (X_test, y_test)

In [42]:
class CatBoostClassifierCorrected(CatBoostClassifier):
    """CatBoostClassifier whose ``fit`` evaluates on the hold-out split by default.

    The only difference from the parent class is the default of ``eval_set``:
    it is the notebook-level ``eval_set`` tuple as it existed when this class
    was defined. That lets callers which only invoke ``fit(X, y)`` (such as
    the commented-out voting/stacking ensembles) still train with
    ``use_best_model`` / the overfitting detector, both of which need an
    evaluation set.

    Rebinding the global ``eval_set`` later does not change the default; re-run
    this cell to pick up a new split.
    """

    def fit(self, X, y=None, cat_features=None, text_features=None, embedding_features=None, sample_weight=None, baseline=None, use_best_model=None,
            eval_set=eval_set, verbose=None, logging_level=None, plot=False, column_description=None,
            verbose_eval=None, metric_period=None, silent=None, early_stopping_rounds=None,
            save_snapshot=None, snapshot_file=None, snapshot_interval=None, init_model=None):
        """Fit the model; same arguments as ``CatBoostClassifier.fit``.

        Arguments are forwarded by keyword rather than by position, so they
        cannot silently land on the wrong parameter if the installed CatBoost
        version orders ``fit``'s parameters differently.
        """
        return super().fit(
            X, y,
            cat_features=cat_features,
            text_features=text_features,
            embedding_features=embedding_features,
            sample_weight=sample_weight,
            baseline=baseline,
            use_best_model=use_best_model,
            eval_set=eval_set,
            verbose=verbose,
            logging_level=logging_level,
            plot=plot,
            column_description=column_description,
            verbose_eval=verbose_eval,
            metric_period=metric_period,
            silent=silent,
            early_stopping_rounds=early_stopping_rounds,
            save_snapshot=save_snapshot,
            snapshot_file=snapshot_file,
            snapshot_interval=snapshot_interval,
            init_model=init_model,
        )
    
class LGBMClassifierCorrected(LGBMClassifier):
    """LGBMClassifier whose ``fit`` evaluates on the hold-out split by default.

    The only difference from the parent class is the default of ``eval_set``:
    it is the notebook-level ``eval_set`` tuple as it existed when this class
    was defined, so callers that only invoke ``fit(X, y)`` still get an
    evaluation set (needed for early stopping).

    Rebinding the global ``eval_set`` later does not change the default; re-run
    this cell to pick up a new split.
    """

    def fit(self, X, y,
            sample_weight=None, init_score=None,
            eval_set=eval_set, eval_names=None, eval_sample_weight=None,
            eval_class_weight=None, eval_init_score=None, eval_metric=None,
            early_stopping_rounds=None, verbose=True,
            feature_name='auto', categorical_feature='auto',
            callbacks=None, init_model=None):
        """Fit the model; arguments mirror ``LGBMClassifier.fit``.

        Arguments are forwarded by keyword rather than by position, so they
        cannot silently land on the wrong parameter if the installed LightGBM
        version orders ``fit``'s parameters differently.
        """
        return super().fit(
            X, y,
            sample_weight=sample_weight,
            init_score=init_score,
            eval_set=eval_set,
            eval_names=eval_names,
            eval_sample_weight=eval_sample_weight,
            eval_class_weight=eval_class_weight,
            eval_init_score=eval_init_score,
            eval_metric=eval_metric,
            early_stopping_rounds=early_stopping_rounds,
            verbose=verbose,
            feature_name=feature_name,
            categorical_feature=categorical_feature,
            callbacks=callbacks,
            init_model=init_model,
        )


In [43]:
# print(inspect.getsource(LGBMClassifier))

In [44]:
#Instantiating and fitting model with the gotten parameters
# Fit the CatBoost model with the tuned + fixed parameters. The remaining
# estimators are kept below, commented out, from earlier experiments.
# Caveat: the hold-out split (X_test, y_test) is used here as the eval set for
# best-iteration selection / early stopping AND later for the reported metrics,
# so those metrics are optimistic.
cb_model = CatBoostClassifierCorrected(**cat_params,**cb_def)
# lgb_model = LGBMClassifierCorrected(**lgb_params,**lgb_def)
# model = DecisionTreeClassifier(random_state=seed)
# logistic_model = LogisticRegression(**logistic_params)
# knn_model = KNeighborsClassifier(**knn_params)
# model = GaussianNB()
# xgb_model = XGBClassifier(**xgb_params,random_state=seed)
# forest_model = RandomForestClassifier(**forest_params)
cb_model.fit(X_train, y_train, eval_set=(X_test, y_test))
# lgb_model.fit(X_train, y_train, eval_set=(X_test, y_test))
# model = KNeighborsClassifier(**best_params)
# forest_model.fit(X_train, y_train)
# model.fit(X_train, y_train)

0:	learn: 0.7857143	test: 0.8141026	best: 0.8141026 (0)	total: 93.9ms	remaining: 1m 33s
1:	learn: 0.7857143	test: 0.8141026	best: 0.8141026 (0)	total: 103ms	remaining: 51.3s
2:	learn: 0.7890720	test: 0.8214286	best: 0.8214286 (2)	total: 111ms	remaining: 37s
3:	learn: 0.7887668	test: 0.8214286	best: 0.8214286 (2)	total: 120ms	remaining: 29.9s
4:	learn: 0.7899878	test: 0.8214286	best: 0.8214286 (2)	total: 129ms	remaining: 25.7s
5:	learn: 0.7948718	test: 0.8223443	best: 0.8223443 (5)	total: 138ms	remaining: 22.8s
6:	learn: 0.7942613	test: 0.8195971	best: 0.8223443 (5)	total: 147ms	remaining: 20.8s
7:	learn: 0.7948718	test: 0.8232601	best: 0.8232601 (7)	total: 155ms	remaining: 19.3s
8:	learn: 0.7948718	test: 0.8232601	best: 0.8232601 (7)	total: 164ms	remaining: 18.1s
9:	learn: 0.7939560	test: 0.8214286	best: 0.8232601 (7)	total: 173ms	remaining: 17.1s
10:	learn: 0.7939560	test: 0.8241758	best: 0.8241758 (10)	total: 182ms	remaining: 16.4s
11:	learn: 0.7939560	test: 0.8241758	best: 0.8241758

103:	learn: 0.7936508	test: 0.8232601	best: 0.8278388 (12)	total: 1.11s	remaining: 9.57s
104:	learn: 0.7933455	test: 0.8250916	best: 0.8278388 (12)	total: 1.12s	remaining: 9.55s
105:	learn: 0.7936508	test: 0.8241758	best: 0.8278388 (12)	total: 1.13s	remaining: 9.53s
106:	learn: 0.7942613	test: 0.8241758	best: 0.8278388 (12)	total: 1.14s	remaining: 9.51s
107:	learn: 0.7939560	test: 0.8241758	best: 0.8278388 (12)	total: 1.15s	remaining: 9.49s
108:	learn: 0.7936508	test: 0.8250916	best: 0.8278388 (12)	total: 1.16s	remaining: 9.47s
109:	learn: 0.7933455	test: 0.8241758	best: 0.8278388 (12)	total: 1.17s	remaining: 9.45s
110:	learn: 0.7933455	test: 0.8232601	best: 0.8278388 (12)	total: 1.18s	remaining: 9.44s
111:	learn: 0.7936508	test: 0.8250916	best: 0.8278388 (12)	total: 1.19s	remaining: 9.42s
112:	learn: 0.7933455	test: 0.8250916	best: 0.8278388 (12)	total: 1.21s	remaining: 9.5s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8278388278
bestIteration = 12

Shrink mode

<__main__.CatBoostClassifierCorrected at 0x27ae5800550>

In [45]:
# estimators = [cb_model,lgb_model,xgb_model,forest_model]
# estimators_label = ["cb", "lgbm", "xgb", "rf"]
# estm = list(zip(estimators_label,estimators))

In [46]:
# from sklearn.ensemble import VotingClassifier, StackingClassifier

In [47]:
# voting = VotingClassifier(estimators=estm, voting='soft')

In [48]:
# voting.fit(X_train, y_train)

In [49]:
# estm.append(('vclf', voting))
# stacking = StackingClassifier(estimators=estm)

In [50]:
# stacking.fit(X_train, y_train)

In [51]:
# Hold-out evaluation: confusion matrix, per-class report and overall accuracy.
# The same split served as CatBoost's eval set during fitting, so treat these
# numbers as an upper bound rather than an unbiased estimate.
predictions = cb_model.predict(X_test)
print(confusion_matrix(y_test, predictions))
print(classification_report(y_test, predictions))
print(accuracy_score(y_test, predictions))

[[ 43 160]
 [ 28 861]]
              precision    recall  f1-score   support

           0       0.61      0.21      0.31       203
           1       0.84      0.97      0.90       889

    accuracy                           0.83      1092
   macro avg       0.72      0.59      0.61      1092
weighted avg       0.80      0.83      0.79      1092

0.8278388278388278


In [52]:
# Predict the competition test set and write the submission file.
# Despite its name, `test_prob` holds the output of predict(), not predict_proba().
# Predictions are assigned positionally, which relies on perf_test having been
# reordered earlier to match the row order of SampleSubmission.csv.
test_prob = cb_model.predict(working_test_df)
ss = pd.read_csv('SampleSubmission.csv')
ss['Good_Bad_flag'] = test_prob
ss.to_csv('Submission_1.csv',index=False)