#Pre-learning assignment

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
app_train = pd.read_csv('/content/drive/MyDrive/application_train.csv')
app_test = pd.read_csv('/content/drive/MyDrive/application_test.csv')
print(app_train.shape)
print(app_test.shape)
missing = app_train.isnull().sum()
train = app_train.loc[:,missing.index[missing<10000]]
test = app_test.loc[:,(missing.index[missing<10000]).drop('TARGET')]
print(train.shape)
print(test.shape)
test_id=test['SK_ID_CURR']

(307511, 122)
(48744, 121)
(307511, 65)
(48744, 64)


In [3]:
y_train = train.TARGET
x_train = train.drop(['TARGET'],axis=1)
print(y_train.shape)
print(x_train.shape)
fdata = pd.concat([x_train,test])
wdata = pd.concat([x_train,test])
wdata.shape

(307511,)
(307511, 64)


(356255, 64)

In [4]:
col_obj = wdata.columns[wdata.dtypes == 'object']
for i in col_obj:
    wdata.loc[:,i],_ = pd.factorize(wdata.loc[:,i])
data = wdata.fillna(0)
data = data.drop(['SK_ID_CURR'],axis=1)
xtrain = data.iloc[0:len(x_train),:]
xtest = data.iloc[len(x_train):len(data),:] 
print(xtrain.shape)
print(xtest.shape)

(307511, 63)
(48744, 63)


In [5]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler().fit(xtrain)
x_scaled = scaler.transform(xtrain)
x_scaled = pd.DataFrame(x_scaled)

In [6]:
from sklearn.model_selection import train_test_split
X_train, X_val, Y_train, Y_val = train_test_split(x_scaled, y_train, test_size=0.2, random_state=18)
from sklearn.linear_model import LogisticRegression
lreg = LogisticRegression(max_iter=1000).fit(X_train, Y_train)
y_pred = lreg.predict_proba(X_val)
from sklearn.metrics import roc_auc_score
print("Validation ROC AUC:",roc_auc_score(Y_val, y_pred[:,1]))

Validation ROC AUC: 0.700318584229964


#Problem 1 
Cross validation

In [7]:
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
kf.get_n_splits(x_scaled)
roc_kfold=[]
for train_index, test_index in kf.split(x_scaled):
  x,x_test = x_scaled.loc[train_index,:],x_scaled.loc[test_index,:]
  y,y_test = y_train[train_index],y_train[test_index]
  lreg = LogisticRegression(max_iter=1000).fit(x, y)
  y_pred = lreg.predict_proba(x_test)
  roc_kfold.append(roc_auc_score(y_test, y_pred[:,1]))

In [8]:
roc_kfold

[0.7023513387297634,
 0.70435465386954,
 0.7026891080689418,
 0.7012215060652581,
 0.705965358050384]

In [9]:
from statistics import mean
mean(roc_kfold)

0.7033163929567775

#Problem 2
Grid search


In [26]:
from sklearn.model_selection import GridSearchCV 
param = {'penalty':('l2','none'),'solver':('newton-cg', 'lbfgs')}
lreg = LogisticRegression(max_iter=1000) 
# with cv parameter to default means 5-fold cross validation
clf = GridSearchCV(lreg, param)

In [27]:
clf.fit(x_scaled,y_train)
clf.cv_results_

{'mean_fit_time': array([29.57994628, 12.59579463, 38.06015458, 11.1342371 ]),
 'mean_score_time': array([0.02045107, 0.02023072, 0.0196332 , 0.01997566]),
 'mean_test_score': array([0.91924191, 0.91923541, 0.91924517, 0.91924517]),
 'param_penalty': masked_array(data=['l2', 'l2', 'none', 'none'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'param_solver': masked_array(data=['newton-cg', 'lbfgs', 'newton-cg', 'lbfgs'],
              mask=[False, False, False, False],
        fill_value='?',
             dtype=object),
 'params': [{'penalty': 'l2', 'solver': 'newton-cg'},
  {'penalty': 'l2', 'solver': 'lbfgs'},
  {'penalty': 'none', 'solver': 'newton-cg'},
  {'penalty': 'none', 'solver': 'lbfgs'}],
 'rank_test_score': array([3, 4, 1, 1], dtype=int32),
 'split0_test_score': array([0.91923971, 0.91922345, 0.91922345, 0.91922345]),
 'split1_test_score': array([0.91918962, 0.91918962, 0.91920588, 0.91920588]),
 'split2_test_score': a

# Problem 3
Survey from Kaggle Notebooks

# problem 4
Creating a model with high generalization performance

In [28]:
# use of LightGBM
import lightgbm as lgb
lgb_train = lgb.Dataset(data=X_train, label=Y_train)
lgb_eval = lgb.Dataset(data=X_val, label=Y_val)
params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 
          'learning_rate': 0.01, 'num_leaves': 48, 'num_iteration': 5000, 'verbose': 0 ,
          'colsample_bytree':.8, 'subsample':.9, 'max_depth':7, 'reg_alpha':.1, 'reg_lambda':.1, 
          'min_split_gain':.01, 'min_child_weight':1}
model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=150, verbose_eval=200)



Training until validation scores don't improve for 150 rounds.
[200]	valid_0's auc: 0.707307
[400]	valid_0's auc: 0.714927
[600]	valid_0's auc: 0.718358
[800]	valid_0's auc: 0.720562
[1000]	valid_0's auc: 0.721764
[1200]	valid_0's auc: 0.722494
[1400]	valid_0's auc: 0.72297
[1600]	valid_0's auc: 0.723322
[1800]	valid_0's auc: 0.723511
[2000]	valid_0's auc: 0.723586
Early stopping, best iteration is:
[1862]	valid_0's auc: 0.723636


In [34]:
# LightGBM with given parameters with 5-fold
kf = KFold(n_splits=5)
kf.get_n_splits(x_scaled)
roc_kfold=[]
for train_index, test_index in kf.split(x_scaled):
  x,x_test = x_scaled.loc[train_index,:],x_scaled.loc[test_index,:]
  y,y_test = y_train[train_index],y_train[test_index]
  lgb_train = lgb.Dataset(data=x, label=y)
  lgb_eval = lgb.Dataset(data=x_test, label=y_test)
  params = {
            'objective': 'binary',
            'boosting_type': 'gbdt',
            'nthread': 4,
            'learning_rate': 0.02, 
            'num_leaves': 20,
            'colsample_bytree': 0.9497036,
            'subsample': 0.8715623,
            'subsample_freq': 1,
            'max_depth': 8,
            'reg_alpha': 0.041545473,
            'reg_lambda': 0.0735294,
            'min_split_gain': 0.0222415,
            'min_child_weight': 60, # 39.3259775,
            'seed': 0,
            'verbose': -1,
            'metric': 'auc',
        }
  model = lgb.train(params=params,train_set=lgb_train, valid_sets=lgb_eval, early_stopping_rounds=200,num_boost_round=10000, verbose_eval=False)
  y_pred = model.predict(x_test)
  print('AUC:', roc_auc_score(y_test, y_pred))

AUC: 0.7260836343837338
AUC: 0.7280164039692631
AUC: 0.7236870807948773
AUC: 0.7229348287494292
AUC: 0.7274011111262515


#Problem 5
Final model selection

In [37]:
xs_test = scaler.transform(xtest)
y_pred = model.predict(xs_test)
y=pd.DataFrame(y_pred)
y.columns=['TARGET']
outcome = pd.concat([test_id,y],axis=1)
outcome.head(3)

Unnamed: 0,SK_ID_CURR,TARGET
0,100001,0.026793
1,100005,0.105781
2,100013,0.035728


In [38]:
outcome.to_csv (r'/content/drive/MyDrive/pred_test.csv', index = False, header=True)

#Kaggle submission
I only used features from application_train file. <br/>
Previous score was 0.68725 <br/>
This time score was 0.70733 <br/>
