In [1]:
from datetime import datetime
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

In [2]:
# Load the loan dataset.
# Forward slashes are used in the paths: "\X" inside a normal string literal is an
# invalid escape sequence, and a backslash separator only works on Windows.
X_train = pd.read_csv("raw_data/X_train.csv")
X_test = pd.read_csv("raw_data/X_test.csv")

#### Question 1

In [3]:
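# (1) What is the percentage of default ("bad") loans for the Connecticut ZIP codes in 
# the test sample (X_train.csv)*?
# NOTE(review): "test sample" and "X_train.csv" disagree - confirm which data set the question refers to.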
# 0
# 17.5
# 0.17
# 0.18

In [4]:
# Connecticut loans that have a ZIP code recorded.
# The second operand used to be the raw `zip_code` string column, which only worked
# through pandas' implicit string -> bool coercion; `.notna()` states the intended
# "ZIP code is present" test explicitly.
condition = (X_train['addr_state'] == 'CT') & X_train['zip_code'].notna()
connecticut = X_train.loc[condition, ['addr_state', 'zip_code']]
connecticut

# TODO: percentage of "bad" loans - needs the default label, which is not part of this selection

Unnamed: 0,addr_state,zip_code
98,CT,064xx
193,CT,060xx
218,CT,064xx
277,CT,067xx
324,CT,060xx
...,...,...
1199284,CT,064xx
1199364,CT,063xx
1199390,CT,060xx
1199451,CT,060xx


#### Question 2

In [5]:
# (2) Choose the correct statements:* --> MCQ
# [.] 1. In the test sample, there are 3 times more loans issued for 3 years than for 5 years
# [.] 2. Median interest rate in train (X_train.csv) 16.29%
# X 3. The last loan was issued in August 2020 --> Sep-2020
# ? 4. Default for the top 10% is higher than the rest ?

In [6]:
# extract the month number from issue_d (strings of the form 'Sep-2020')
# Parsed in one vectorised call instead of a per-row strptime through .apply;
# cast to int64 so the column dtype matches what the row-wise version produced.
X_train['issue_month'] = (
    pd.to_datetime(X_train['issue_d'], format='%b-%Y').dt.month.astype('int64')
)

In [7]:
# extract the year from issue_d (strings of the form 'Sep-2020')
# Parsed in one vectorised call instead of a per-row strptime through .apply;
# cast to int64 so the column dtype matches what the row-wise version produced.
X_train['issue_year'] = (
    pd.to_datetime(X_train['issue_d'], format='%b-%Y').dt.year.astype('int64')
)

In [14]:
# Distinct values present in the loan term column
X_train['term'].unique()

array([' 36 months', ' 60 months'], dtype=object)

In [9]:
# Count of loans per term: 3 years (' 36 months') versus 5 years (' 60 months')
term_column = X_train.loc[:, 'term']
loan_term = term_column.value_counts()
loan_term

 36 months    903699
 60 months    296162
Name: term, dtype: int64

In [10]:
# 1. In the test sample, there are 3 times more loans issued for 3 years than for 5 years
# The ratio is taken from the `loan_term` counts instead of literals copied from a
# previous output, so it stays correct if the data changes.
# NOTE(review): the statement mentions the test sample, but `loan_term` is built from X_train - confirm.
print(loan_term[' 36 months'] / loan_term[' 60 months'])

3.0513671571639844


In [19]:
# convert int_rate to float by removing spaces and '%' signs
# Guarded on dtype so the cell can be re-run: once the column is numeric the `.str`
# accessor would raise AttributeError. regex=False makes both replacements literal.
if not pd.api.types.is_numeric_dtype(X_train['int_rate']):
    X_train['int_rate'] = (
        X_train['int_rate']
        .str.replace(" ", "", regex=False)
        .str.replace("%", "", regex=False)
        .astype('float')
    )
X_train['int_rate'].value_counts()

10.99    28586
11.99    27545
5.32     27417
13.99    25416
11.49    19940
         ...  
15.13        1
17.50        1
19.16        1
17.44        1
13.75        1
Name: int_rate, Length: 669, dtype: int64

In [21]:
# 2. Median interest rate in train (X_train.csv) 16.29%
# Median of the numeric int_rate column, to compare against the 16.29% in the statement
X_train.loc[:, 'int_rate'].median()

12.69

In [27]:
# 3. The last loan was issued in August 2020
# issue_d holds 'Mon-YYYY' strings, so a plain .max() compares them alphabetically
# (any 'Sep-...' value beats every other month regardless of year). Parse to real
# dates first, then format the latest one back to the same 'Mon-YYYY' shape.
pd.to_datetime(X_train['issue_d'], format='%b-%Y').max().strftime('%b-%Y')

'Sep-2020'

#### Question 3

In [6]:
# (3) For defaulted loans in train versus non-defaulted, tick the correct answers 
# (use t-test if necessary):*
# 1. The share of problem accounts (i.e. acc_now_delinq>0) is approximately the same (for defaulted and non-defaulted loans)
# 2. The lower threshold from the credit bureau (fico_range_low) is not significantly different for bad and good
# 3. Bad and good do not differ in the difference between the average amount requested and approved
# 4. Shares of missing values in mths_since_recent_bc_dlq for bad and good differ significantly
# There is no such number here.

In [26]:
# Frequency of each acc_now_delinq value (Question 3, statement 1 looks at acc_now_delinq > 0)
X_train.loc[:, 'acc_now_delinq'].value_counts()

0.0    1194659
1.0       4910
2.0        248
3.0         36
4.0          5
5.0          2
6.0          1
Name: acc_now_delinq, dtype: int64

#### Question 4

In [7]:
# (4) Check the correct statements:*
# 1. It is necessary to carry out income verification (see verification_status values). 
#     it affects the probability of default
# 2. Those who have had their income verified have more income
# 3. Those who have had their income verified have less default
# 4. Income verification and (loan_amnt - funded_amnt) have a significant (>0.5) 
#     positive correlation

#### Question 5

In [None]:
# (5) By how much will the default rate decrease if it is issued only to those who 
# have at least one mortgage (mort_acc > 0) and an acceptable debt-to-income ratio 
# (dti < 0.3). What are the correct answers?*
# 0.049
# 0.5%
# 5%
# 25%

#### Question 6

In [8]:
# (6) For further questions, you need to split the training sample into two, so that 
# you learn on one and check the quality on the other. Use X_train2, X_test2, y_train2, 
# y_test2 = train_test_split(X_train, y_train, test_size=0.33, random_state=100).

# Using sklearn and one hot encoding transformation of categorical features where the 
# number of categories is less than 15, build a decision tree with default parameters 
# (if necessary, fill in the gaps with -999) on X_train2.

# Correct statements:
# *
# 1. The first feature in the tree changes when you set max_depth=2
# 2. Second feature = funded_amnt in the tree with max_depth=2
# 3. Tree with unlimited max_depth and random_state=0 gives quality on y_test2 0.28 Gini
# 4. Tree with limited max_depth = 10 and random_state=0 gives quality on y_train2 0.285 Gini

#### Question 7

In [None]:
# (7) Build a random forest from sklearn on the features prepared for the tree (X_train2). 
# Use Gini (and y_test2) as a measure of quality. 

# Choose the correct answers:*
# 1. Tree with default max_depth and random_state=0 gives better quality than forest with 
#     random_state=10 by 0.002
# 2. A forest of three trees is better than a forest at default values
# 3. Forest at max_depth=10 and other default values does not predict anything
# 4. Forest at max_depth=10 and min_samples_leaf=100 works best

#### Question 8

In [None]:
# (8) What feature should be made from emp_length and another categorical feature 
# with 15 or fewer categories to have the highest default rate?
# Consider that we make a decision on at least 100 observations.*
# if emp_length=='8 years' & home_ownership=='RENT'
# if emp_length=='4 years' & purpose=='small_business'
# if emp_length=='1 year' & term=='60 months'
# if emp_length=='3 years' & verification_status=='Verified'

#### Question 9

In [None]:
# (9) In further questions, we will use the original X_train, y_train. Let's divide them into 
# training and test sets: X_train2, X_test2, y_train2, y_test2 = 
#     train_test_split(X_train, y_train, test_size=0.33, random_state=100). 

# Choose the correct answers:*
# 1. catboost with iterations=80 and other default parameters, 
#     works worse than a forest, but better than trees, if we compare models from previous questions
# 2. catboost with iterations=80 and other default parameters is better than lightgbm with 
#     default parameters
# 3. xgboost with parameters n_estimators=250, learning_rate=0.10, colsample_bytree=0.70, 
#     max_depth=3 is better than lightgbm with default parameters
# 4. xgboost with n_estimators=250, learning_rate=0.10, colsample_bytree=0.70, max_depth=3 
#     is better than catboost with iterations=80 and other default parameters

#### Question 10

In [None]:
# (10) Using the data for catboost from the previous question, compare a model trained 
# with default parameters and iterations=80 on all data and models trained on a particular 
# year's data.

# Do I need to build a separate catboost model for each year? Mark the correct answer.*
# Yes, because a model built for a particular year always gives more accurate predictions
# No, because a model built on all the data always beats a model built only on a part
# No, because the model for a specific year always predicts a specific year worse than 
#   the general model
# Depends, because some years are special and you need to take this into account when 
#   choosing a model. In our data, there are about 1 out of 5 such years.