In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. The goal of the project is to create a bank scoring model predicting whether or not the client will default

In [None]:
from pandas import Series
import pandas as pd
import numpy as np
from scipy import stats


import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.feature_selection import f_classif, mutual_info_classif
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler, OrdinalEncoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc, roc_auc_score, roc_curve, f1_score

from datetime import datetime
from datetime import timedelta

In [None]:
# Fix the random seed passed to the train/validation split and to the models,
# so that the experiment can be replicated
RANDOM_SEED = 42
# Record the installed package versions in requirements.txt so that the
# environment can be reproduced
!pip freeze > requirements.txt

# **DATA**

In [None]:
# Directory holding the competition files. Paths are built with os.path.join so
# that the trailing slash of DATA_DIR never produces a doubled separator and all
# three files are located the same way.
DATA_DIR = '/kaggle/input/sf-dst-scoring/'
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
sample_submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))

In [None]:
# Join train and test so that both go through exactly the same preprocessing
train['sample'] = 1  # mark train rows
test['sample'] = 0  # mark test rows
test['default'] = 0  # placeholder only: "default" is the value to be predicted for these rows

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent.
# Test rows come first and the index is rebuilt as 0..n-1, as before.
data = pd.concat([test, train], sort=False, ignore_index=True)

client_id 

education - by levels

sex -

age - 

car - if owns or does not own a car

car_type - whether the car is of a foreign brand

decline_app_cnt - quantity of previously rejected applications

good_work - if work is considered to be good

bki_request_cnt - number of requests to credit bureau

home_address - home address classifier

work_address - work address classifier

income - 

foreign_passport - if has or does not have a passport

sna - borrower's connection with bank's clients

first_time - how long ago information about the borrower was entered into the system

score_bki - score according to the credit bureau

region_rating - 

app_date - when application was submitted (date)

default - if the borrower has defaulted

In [None]:
# Preview the first five rows of the combined frame
data.head(5)

In [None]:
# Summary of the combined frame: column dtypes and non-null counts
data.info()

# Cleaning and Prepping Data


In [None]:
# `education` is the only column with missing values, so list the distinct values it takes
data['education'].unique()

In [None]:
# Frequency of each education level
data['education'].value_counts()

In [None]:
# It is reasonable to suppose that every client has at least school-level education,
# and it is also the most common level, so missing values are filled with 'SCH'.
# The result is assigned back explicitly: calling fillna(inplace=True) on the
# attribute-accessed column is a chained in-place update, which does not modify
# `data` when pandas copy-on-write is active.
data['education'] = data['education'].fillna('SCH')

In [None]:
# Count missing values per column to confirm that none remain
data.isna().sum()

In [None]:
# Replace the string values of sex, car, car_type and foreign_passport with integer codes
bin_cols = ['sex', 'car', 'car_type', 'foreign_passport']
label_encoder = LabelEncoder()

for binary_col in bin_cols:
    # The encoder is re-fitted on each column, so the codes are per-column
    encoded = label_encoder.fit_transform(data[binary_col])
    data[binary_col] = encoded

data.head()

In [None]:
# Next column to preprocess: app_date.
# It is stored as strings such as '01FEB2014', so parse every value into a datetime first.
date_string = data['app_date'].to_list()
dt_list = [datetime.strptime(raw_date, '%d%b%Y') for raw_date in date_string]


In [None]:
# Swap the string column for the parsed datetimes
parsed_dates = pd.Series(dt_list)
data = data.assign(app_date=parsed_dates)

In [None]:
# Check that the conversion worked; a preview is enough, there is no need to render the whole frame
data.head()

In [None]:
# The model only works with numbers, so app_date is turned into a
# "how many days ago" feature.
# The reference point is the latest application date in the data rather than the
# current wall-clock time: with pd.to_datetime("now") the feature (and therefore
# the fitted model) changed depending on the day the notebook was run.
reference_date = data['app_date'].max()
data['now'] = reference_date  # kept as a column; it is dropped together with app_date later
data['days_ago'] = (data['now'] - data['app_date']).dt.days
data.head()

In [None]:
# Distinct education levels after filling the missing values
data['education'].unique()

In [None]:
# `education` is the last string column. Its values are ordered levels (school,
# undergraduate, graduate, post-graduate, academic career), so they are encoded
# as ordinal numbers.
mapping = dict(SCH=0, UGR=1, GRD=2, PGR=3, ACD=4)
data['education'] = data['education'].apply(mapping.__getitem__)
data

# **EDA**

In [None]:
# Group the features by type to make the exploratory analysis easier
bin_cols = [
    'sex',
    'car',
    'car_type',
    'foreign_passport',
    'good_work',
]
num_cols = [
    'age',
    'decline_app_cnt',
    'income',
    'bki_request_cnt',
    'score_bki',
    'days_ago',
]
# first_time describes how long ago the client's data was first entered, but judging
# by its values it is treated as a categorical variable
cat_cols = [
    'education',
    'home_address',
    'work_address',
    'sna',
    'first_time',
    'region_rating',
]
                                                                    

In [None]:
# Point-biserial correlation of every binary feature with the target.
# Only the labelled train rows (sample == 1) are used: in the test rows `default`
# is a placeholder 0 written when train and test were joined, so including them
# would distort every correlation with the target.
labelled = data[data['sample'] == 1]
default = labelled['default'].to_numpy()
sex = labelled['sex'].to_numpy()
car = labelled['car'].to_numpy()
car_type = labelled['car_type'].to_numpy()
foreign_passport = labelled['foreign_passport'].to_numpy()
good_work = labelled['good_work'].to_numpy()

# (label shown in the report, feature values)
binary_features = [
    ('sex', sex),
    ('car', car),
    ('car_type', car_type),
    ('passport', foreign_passport),
    ('work', good_work),
]
print(', '.join(f'{label} = {stats.pointbiserialr(default, values)}'
                for label, values in binary_features))


Based on this, car_type and passport correlate with default the most. Let's see correlation of categorical variables


In [None]:
# Spearman rank correlation between the categorical features, shown as a heatmap
cat_spearman = data[cat_cols].corr(method='spearman')
sns.heatmap(cat_spearman)

We can see that home_address and work_address are quite highly correlated, so only one of them should be used in the model.

In [None]:
# Dependency between the target and the categorical and binary variables
# (mutual information). Only the labelled train rows (sample == 1) are used: the
# test rows carry a placeholder default = 0, which would bias the estimate.
labelled = data[data['sample'] == 1]
imp_cat = Series(
    mutual_info_classif(labelled[bin_cols + cat_cols], labelled['default'],
                        discrete_features=True),
    index=bin_cols + cat_cols,
)
imp_cat = imp_cat.sort_values()
imp_cat.plot(kind='barh')

As we can see, the target depends most strongly on sna, first_time, home_address, region_rating and education.
I assume sna is related to having a possible co-signer in the same bank, first_time to having a longer/shorter credit history, education can help predict the level of income, and as far as home address/region_rating goes it may be connected with residing in more expensive/cheaper areas which signals a higher/lower income as well.

In [None]:
# Check whether the numerical features are normally distributed.
# sns.distplot is deprecated; histplot draws the same plain histogram (no KDE, no rug).
for col in num_cols:
    plt.figure()
    sns.histplot(data[col])
    plt.title(col)
    plt.show()

In [None]:
# let's check if there are outliers:
def iqr_outliers(s, k=1.5):
    """Return a boolean mask of the rows that contain NO IQR outliers.

    A value is an outlier when it lies below ``Q1 - k * IQR`` or above
    ``Q3 + k * IQR`` of its own column. Despite the function name, the mask is
    True for rows to keep, so it can be used directly as ``frame[iqr_outliers(frame)]``.

    Parameters
    ----------
    s : pandas.DataFrame or pandas.Series
        Numerical data; bounds are computed per column. A Series is treated as
        a single-column frame.
    k : float, default 1.5
        Multiplier of the interquartile range that defines the fences.

    Returns
    -------
    pandas.Series
        Boolean Series indexed like ``s``; True where no column of the row is
        outside its fences.
    """
    # `.any(axis=1)` only exists for frames, so promote a Series to one column
    frame = s.to_frame() if isinstance(s, pd.Series) else s
    q75, q25 = np.percentile(frame, [75, 25], axis=0)
    iqr = q75 - q25
    lower_bound = q25 - k * iqr
    upper_bound = q75 + k * iqr
    return ~((frame < lower_bound) | (frame > upper_bound)).any(axis=1)
# data[iqr_outliers(data[['age', 'decline_app_cnt', 'income', 'bki_request_cnt', 'score_bki', \
#                        'region_rating','sna', 'first_time', 'days_ago']])]
# there turned to be a lot of them and eliminating outliers led to deleting about a half of the dataset, so it doesn't look
# like a good solution to me

I will apply a logarithmic transform to bring the distributions closer to normal.

In [None]:
# Log-transform every numerical feature (log1p of the absolute value) and plot
# the result.
# NOTE: the columns of `data` are overwritten, so this cell is not idempotent;
# re-running it without re-running the cells above applies the transform twice.
# sns.distplot is deprecated; histplot draws the same plain histogram.
for col in num_cols:
    data[col] = np.log1p(data[col].abs())
    plt.figure(figsize=(10, 5))
    # Only strictly positive values are plotted
    sns.histplot(data[col][data[col] > 0].dropna())
    plt.title(col)
    plt.show()

In [None]:
# Correlation between the numerical variables, shown as a heatmap
num_corr = data[num_cols].corr()
sns.heatmap(num_corr)

In [None]:
# The same correlation matrix as a table, to read the exact values
data.loc[:, num_cols].corr()


Score_bki is somewhat correlated with decline_app_cnt and bki_request_cnt, but the numbers are pretty low

In [None]:
# Dependency between the numerical features and the target (ANOVA F-value).
# Only the labelled train rows (sample == 1) are used: the test rows carry a
# placeholder default = 0, which would distort the statistic.
labelled = data[data['sample'] == 1]
f_values = f_classif(labelled[num_cols], labelled['default'])[0]
imp_num = Series(f_values, index=num_cols)
imp_num = imp_num.sort_values()
imp_num.plot(kind='barh')

There is no question as to why the greatest dependency exists between the score of the credit bureau and the number of declined applications. 

# **Preparing data for ML**

In [None]:
# The datetime columns are no longer needed once days_ago has been derived
data = data.drop(columns=['app_date', 'now'])


In [None]:
# Separate the combined frame back into train and test, discarding helper columns
train_rows = data[data['sample'] == 1]
test_rows = data[data['sample'] == 0]

train_data = train_rows.drop(columns=['sample', 'client_id'])  # train
test_data = test_rows.drop(columns=['sample', 'default'])      # test

# Keep the test client ids for the submission; pop() also removes the column from test_data
id_test = test_data.pop('client_id')


In [None]:
# Standardize the numerical features.
# The scaler is fitted on the train data only and the same fitted transform is
# applied to the test data. Fitting a second scaler on the test set would put
# the two sets on different scales, so the model's coefficients would not apply
# to the test features consistently.
num_scaler = StandardScaler()
X_num_train = num_scaler.fit_transform(train_data[num_cols].values)
X_num_test = num_scaler.transform(test_data[num_cols].values)


In [None]:
# Keep everything that is not numerical (i.e. the binary and categorical features)

# train: the target is removed as well
df_cat_train = train_data.drop(columns=['default'] + num_cols)

# test
df_cat_test = test_data.drop(columns=num_cols)


In [None]:
# Assemble the final matrices: scaled numerical columns first, then binary/categorical ones

# train
X_1 = np.concatenate((X_num_train, df_cat_train.to_numpy()), axis=1)

# test
X_t_1 = np.concatenate((X_num_test, df_cat_test.to_numpy()), axis=1)

# target
y_1 = train_data['default'].to_numpy()

In [None]:
# Hold out 20% of the labelled data for validation
X_train, X_test, y_train, y_test = train_test_split(
    X_1,
    y_1,
    test_size=0.20,
    shuffle=True,
    random_state=RANDOM_SEED,
)

# **Model**

In [None]:
# First model: logistic regression with C = 10
model = LogisticRegression(C=10, random_state=RANDOM_SEED)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]  # second column of predict_proba

# ROC curve and ROC AUC on the validation part
fpr, tpr, threshold = roc_curve(y_test, probs)
roc_auc = roc_auc_score(y_test, probs)

fig, ax = plt.subplots()
ax.plot([0, 1], label='Baseline', linestyle='--')
ax.plot(fpr, tpr, label='Regression')
ax.set_title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
ax.legend(loc='lower right')
plt.show()

In [None]:
# Confusion matrix of the first model on the validation part
confusion_matrix(y_true=y_test, y_pred=y_pred)

eliminating less relevant features didn't do anything for the model
confusion matrix classifies many defaulting clients as non-defaulting

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid. Every penalty is paired with a solver that supports it:
# the default 'lbfgs' solver cannot fit an L1 penalty, so without an explicit
# solver the L1 candidates fail to fit and never compete in the search.
param_grid = [
    {'penalty': ['l1'], 'solver': ['liblinear'], 'C': [0.1, 1, 10], 'max_iter': [1000], 'tol': [1e-5]},
    {'penalty': ['l2'], 'C': [0.1, 1, 10], 'max_iter': [1000], 'tol': [1e-5]},
    # NOTE(review): recent scikit-learn releases spell the unpenalised option as
    # None instead of 'none' - confirm against the installed version.
    {'penalty': ['none'], 'max_iter': [1000], 'tol': [1e-5]},
]

# A dedicated estimator is used for the search so that `model` (the first fitted
# model, which the submission cell relies on) is not overwritten. GridSearchCV
# clones and fits the estimator itself, so no separate fit() call is needed.
grid_estimator = LogisticRegression(random_state=RANDOM_SEED)

# Grid search with 5-fold cross-validation, scored by ROC AUC - the metric used
# to evaluate the models in this notebook (plain accuracy is uninformative when
# one class dominates).
clf = GridSearchCV(grid_estimator, param_grid, scoring='roc_auc', cv=5, verbose=0)

best_model = clf.fit(X_train, y_train)
# View best hyperparameters
print('Лучшее Penalty:', best_model.best_estimator_.get_params()['penalty'])
print('Лучшее C:', best_model.best_estimator_.get_params()['C'])

In [None]:
# Second model: same C, but with balanced class weights and the liblinear solver
model_2 = LogisticRegression(C=10, class_weight='balanced', solver='liblinear', random_state=RANDOM_SEED)
model_2.fit(X_train, y_train)

# Predictions and probabilities must come from model_2 itself; previously the
# labels were taken from `model`, and the ROC curve / AUC were computed from the
# first model's `probs`, so this cell did not evaluate model_2 at all.
y_pred_2 = model_2.predict(X_test)
probs_2 = model_2.predict_proba(X_test)
probs_2 = probs_2[:, 1]

fpr, tpr, threshold = roc_curve(y_test, probs_2)
roc_auc = roc_auc_score(y_test, probs_2)

plt.figure()
plt.plot([0, 1], label='Baseline', linestyle='--')
plt.plot(fpr, tpr, label='Regression')
plt.title('Logistic Regression ROC AUC = %0.3f' % roc_auc)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Confusion matrix for the y_pred_2 predictions on the validation part
confusion_matrix(y_true=y_test, y_pred=y_pred_2)

Even though ROC AUC has grown, the number of correctly identified clients has decreased compared with the first one, so I will go with the first model 

Выводы для себя: словарь с гиперпараметрами я скопировала из слака, и уже одно только значение С сильно повлияло на результат.
Остальные параметры я настроила после чтения документации вручную: class_weight='balanced' так как недефолтных клиентов непропорционально много, solver='liblinear' так как он используется не для мультиклассов
Я видела, что другие студенты использовали хитрые функции, но я не совсем поняла как они работают, и пошла по методу чтения документации.


# Submission

In [None]:
# Predicted probabilities for the test set; the second column is kept as the score
test_probabilities = model.predict_proba(X_t_1)
predict_submission = test_probabilities[:, 1]


In [None]:
# Pair every test client id with its predicted score
submission_columns = {
    'client_id': id_test,
    'default': predict_submission,
}
submission = pd.DataFrame(submission_columns)

In [None]:
# Preview the submission; a few rows are enough to check its layout
submission.head()

In [None]:
# Write the submission file without the index column
submission.to_csv(path_or_buf='submission.csv', index=False)


In [None]:
# Sanity check: show the type of `submission` (built above with pd.DataFrame)
type(submission)