In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the credit training data from 'credit_train.csv' (resolved relative to
# the notebook's working directory) into a pandas DataFrame.
loan_dataset = pd.read_csv('credit_train.csv')

In [3]:
# Sanity check: confirm that read_csv returned a DataFrame.
type(loan_dataset)

pandas.core.frame.DataFrame

In [4]:
# Preview the first 5 rows to see the column names and typical values.
loan_dataset.head()

Unnamed: 0,Loan ID,Customer ID,Loan Status,Current Loan Amount,Term,Credit Score,Annual Income,Years in current job,Home Ownership,Purpose,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
0,14dd8831-6af5-400b-83ec-68e61888a048,981165ec-3274-42f5-a3b4-d104041a9ca9,Fully Paid,445412.0,Short Term,709.0,1167493.0,8 years,Home Mortgage,Home Improvements,5214.74,17.2,,6.0,1.0,228190.0,416746.0,1.0,0.0
1,4771cc26-131a-45db-b5aa-537ea4ba5342,2de017a3-2e01-49cb-a581-08169e83be29,Fully Paid,262328.0,Short Term,,,10+ years,Home Mortgage,Debt Consolidation,33295.98,21.1,8.0,35.0,0.0,229976.0,850784.0,0.0,0.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,5efb2b2b-bf11-4dfd-a572-3761a2694725,Fully Paid,99999999.0,Short Term,741.0,2231892.0,8 years,Own Home,Debt Consolidation,29200.53,14.9,29.0,18.0,1.0,297996.0,750090.0,0.0,0.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,e777faab-98ae-45af-9a86-7ce5b33b1011,Fully Paid,347666.0,Long Term,721.0,806949.0,3 years,Own Home,Debt Consolidation,8741.9,12.0,,9.0,0.0,256329.0,386958.0,0.0,0.0
4,d4062e70-befa-4995-8643-a0de73938182,81536ad9-5ccf-4eb8-befb-47a4d608658e,Fully Paid,176220.0,Short Term,,,5 years,Rent,Debt Consolidation,20639.7,6.1,,15.0,0.0,253460.0,427174.0,0.0,0.0


In [5]:
# Dimensions of the raw frame as (number of rows, number of columns).
loan_dataset.shape

(100514, 19)

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
loan_dataset.describe()

Unnamed: 0,Current Loan Amount,Credit Score,Annual Income,Monthly Debt,Years of Credit History,Months since last delinquent,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit,Bankruptcies,Tax Liens
count,100000.0,80846.0,80846.0,100000.0,100000.0,46859.0,100000.0,100000.0,100000.0,99998.0,99796.0,99990.0
mean,11760450.0,1076.456089,1378277.0,18472.412336,18.199141,34.901321,11.12853,0.16831,294637.4,760798.4,0.11774,0.029313
std,31783940.0,1475.403791,1081360.0,12174.992609,7.015324,21.997829,5.00987,0.482705,376170.9,8384503.0,0.351424,0.258182
min,10802.0,585.0,76627.0,0.0,3.6,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,179652.0,705.0,848844.0,10214.1625,13.5,16.0,8.0,0.0,112670.0,273438.0,0.0,0.0
50%,312246.0,724.0,1174162.0,16220.3,16.9,32.0,10.0,0.0,209817.0,467874.0,0.0,0.0
75%,524942.0,741.0,1650663.0,24012.0575,21.7,51.0,14.0,0.0,367958.8,782958.0,0.0,0.0
max,100000000.0,7510.0,165557400.0,435843.28,70.5,176.0,76.0,15.0,32878970.0,1539738000.0,7.0,15.0


In [7]:
# Number of missing (NaN) values in each column.
loan_dataset.isnull().sum()

Loan ID                           514
Customer ID                       514
Loan Status                       514
Current Loan Amount               514
Term                              514
Credit Score                    19668
Annual Income                   19668
Years in current job             4736
Home Ownership                    514
Purpose                           514
Monthly Debt                      514
Years of Credit History           514
Months since last delinquent    53655
Number of Open Accounts           514
Number of Credit Problems         514
Current Credit Balance            514
Maximum Open Credit               516
Bankruptcies                      718
Tax Liens                         524
dtype: int64

In [8]:
# Drop seven columns so that they are excluded from the feature set built later.
loan_dataset = loan_dataset.drop(
    columns=[
        'Credit Score',
        'Customer ID',
        'Years in current job',
        'Home Ownership',
        'Months since last delinquent',
        'Tax Liens',
        'Bankruptcies',
    ]
)

In [9]:
# Dimensions after dropping columns (drop(columns=...) does not remove rows).
loan_dataset.shape

(100514, 12)

In [10]:
# Remove every row that still contains at least one missing value.
loan_dataset = loan_dataset.dropna(axis=0, how='any')

In [11]:
# Verify that no missing values remain in any column after dropna().
loan_dataset.isnull().sum()

Loan ID                      0
Loan Status                  0
Current Loan Amount          0
Term                         0
Annual Income                0
Purpose                      0
Monthly Debt                 0
Years of Credit History      0
Number of Open Accounts      0
Number of Credit Problems    0
Current Credit Balance       0
Maximum Open Credit          0
dtype: int64

In [12]:
# Dimensions after removing the rows that had missing values.
loan_dataset.shape

(80845, 12)

In [13]:
# Encode the target label as integers: 'Charged Off' -> 0, 'Fully Paid' -> 1.
loan_status_codes = {'Charged Off': 0, 'Fully Paid': 1}
loan_dataset = loan_dataset.replace({"Loan Status": loan_status_codes})

In [14]:
# Preview the first 5 rows after 'Loan Status' has been encoded as 0/1.
loan_dataset.head()

Unnamed: 0,Loan ID,Loan Status,Current Loan Amount,Term,Annual Income,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit
0,14dd8831-6af5-400b-83ec-68e61888a048,1,445412.0,Short Term,1167493.0,Home Improvements,5214.74,17.2,6.0,1.0,228190.0,416746.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,1,99999999.0,Short Term,2231892.0,Debt Consolidation,29200.53,14.9,18.0,1.0,297996.0,750090.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,1,347666.0,Long Term,806949.0,Debt Consolidation,8741.9,12.0,9.0,0.0,256329.0,386958.0
5,89d8cb0c-e5c2-4f54-b056-48a645c543dd,0,206602.0,Short Term,896857.0,Debt Consolidation,16367.74,17.3,6.0,0.0,215308.0,272448.0
6,273581de-85d8-4332-81a5-19b04ce68666,1,217646.0,Short Term,1184194.0,Debt Consolidation,10855.08,19.6,13.0,1.0,122170.0,272052.0


In [15]:
# Frequency of each distinct 'Monthly Debt' value.
# Note: this is a feature column; the label used later is 'Loan Status'.
loan_dataset['Monthly Debt'].value_counts()

0.00        54
10647.98     8
16279.20     8
11162.88     8
13033.43     8
            ..
22374.78     1
6425.61      1
2619.72      1
9955.62      1
11080.04     1
Name: Monthly Debt, Length: 55869, dtype: int64

In [16]:
# Frequency of each distinct 'Annual Income' value.
# Note: this is a feature column; the label used later is 'Loan Status'.
loan_dataset['Annual Income'].value_counts()

1162572.0    22
973370.0     19
969475.0     18
1140000.0    18
1112640.0    17
             ..
1846781.0     1
1363763.0     1
2041949.0     1
2598060.0     1
1538240.0     1
Name: Annual Income, Length: 36174, dtype: int64

In [17]:
# Annual Income & Loan Status
#sns.countplot(x='Annual Income',hue='Loan Status',data=loan_dataset)

In [18]:
# Purpose & Loan Status
#sns.countplot(x='Purpose',hue='Loan Status',data=loan_dataset)

In [19]:
#Term & Loan Status
#sns.countplot(x='Term',hue = 'Loan Status',data=loan_dataset)

In [20]:
# Encode the loan term as an integer: 'Short Term' -> 1, 'Long Term' -> 2.
term_codes = {'Short Term': 1, 'Long Term': 2}
loan_dataset = loan_dataset.replace({'Term': term_codes})

In [21]:
# Preview the first 5 rows after 'Term' has been encoded as 1/2.
loan_dataset.head()

Unnamed: 0,Loan ID,Loan Status,Current Loan Amount,Term,Annual Income,Purpose,Monthly Debt,Years of Credit History,Number of Open Accounts,Number of Credit Problems,Current Credit Balance,Maximum Open Credit
0,14dd8831-6af5-400b-83ec-68e61888a048,1,445412.0,1,1167493.0,Home Improvements,5214.74,17.2,6.0,1.0,228190.0,416746.0
2,4eed4e6a-aa2f-4c91-8651-ce984ee8fb26,1,99999999.0,1,2231892.0,Debt Consolidation,29200.53,14.9,18.0,1.0,297996.0,750090.0
3,77598f7b-32e7-4e3b-a6e5-06ba0d98fe8a,1,347666.0,2,806949.0,Debt Consolidation,8741.9,12.0,9.0,0.0,256329.0,386958.0
5,89d8cb0c-e5c2-4f54-b056-48a645c543dd,0,206602.0,1,896857.0,Debt Consolidation,16367.74,17.3,6.0,0.0,215308.0,272448.0
6,273581de-85d8-4332-81a5-19b04ce68666,1,217646.0,1,1184194.0,Debt Consolidation,10855.08,19.6,13.0,1.0,122170.0,272052.0


In [22]:
# Number of loans per encoded term (1 = 'Short Term', 2 = 'Long Term').
loan_dataset['Term'].value_counts()

1    57739
2    23106
Name: Term, dtype: int64

In [23]:
# Number of loans per 'Purpose' label, most frequent first.
loan_dataset['Purpose'].value_counts()

Debt Consolidation      63313
other                    5017
Home Improvements        4720
Other                    2642
Business Loan            1275
Buy a Car                1024
Medical Bills             921
Buy House                 565
Take a Trip               476
major_purchase            282
small_business            234
moving                    125
wedding                    88
Educational Expenses       82
vacation                   73
renewable_energy            8
Name: Purpose, dtype: int64

In [24]:
# Encode each loan purpose as its own integer code.
# 'Buy a Car' uses code 15 so that it is not conflated with 'Business Loan'
# (code 4); every other code is kept as it was.
# NOTE(review): 'other' and 'Other' look like the same category written with
# different capitalisation but are kept as separate codes here -- confirm
# whether they should be merged.
purpose_codes = {
    'Debt Consolidation': 1,
    'other': 2,
    'Home Improvements': 3,
    'Other': 0,
    'Business Loan': 4,
    'Buy a Car': 15,
    'Medical Bills': 5,
    'Buy House': 6,
    'Take a Trip': 7,
    'major_purchase': 8,
    'small_business': 9,
    'moving': 10,
    'wedding': 11,
    'Educational Expenses': 12,
    'vacation': 13,
    'renewable_energy': 14,
}
# Guard against two purposes accidentally sharing one code.
assert len(set(purpose_codes.values())) == len(purpose_codes), "duplicate purpose code"
loan_dataset.replace({'Purpose': purpose_codes}, inplace=True)

In [25]:
# Class balance of the target (0 = 'Charged Off', 1 = 'Fully Paid').
loan_dataset['Loan Status'].value_counts()

1    63636
0    17209
Name: Loan Status, dtype: int64

In [26]:
# Frequency of each distinct 'Years of Credit History' value.
loan_dataset['Years of Credit History'].value_counts()

16.0    1085
15.0    1021
17.0     992
16.5     937
14.0     928
        ... 
51.4       1
50.8       1
53.4       1
50.5       1
58.0       1
Name: Years of Credit History, Length: 501, dtype: int64

In [27]:
# Frequency of each distinct 'Number of Open Accounts' value.
loan_dataset['Number of Open Accounts'].value_counts()

9.0     7580
10.0    7282
8.0     7020
11.0    6990
7.0     6532
12.0    6042
6.0     5443
13.0    5033
14.0    4177
5.0     3808
15.0    3485
16.0    2819
4.0     2322
17.0    2318
18.0    1819
19.0    1520
20.0    1171
3.0     1103
21.0     886
22.0     656
23.0     548
24.0     460
2.0      372
25.0     355
27.0     227
26.0     227
28.0     132
29.0     101
30.0      91
31.0      65
32.0      52
33.0      46
34.0      34
36.0      19
1.0       18
35.0      15
37.0      15
38.0      10
43.0       9
41.0       6
40.0       6
45.0       6
39.0       5
42.0       4
44.0       4
48.0       4
56.0       2
76.0       2
47.0       2
52.0       2
Name: Number of Open Accounts, dtype: int64

In [28]:
# Frequency of each distinct 'Number of Credit Problems' value.
loan_dataset['Number of Credit Problems'].value_counts()

0.0     69465
1.0      9835
2.0      1060
3.0       307
4.0       101
5.0        43
6.0        16
7.0         7
8.0         4
11.0        2
9.0         2
15.0        1
12.0        1
10.0        1
Name: Number of Credit Problems, dtype: int64

In [29]:
# NOTE(review): 'Customer ID' was already removed by the earlier drop(), so this
# rename matches no column and leaves the frame unchanged (DataFrame.rename
# ignores labels that are not present by default). Candidate for removal.
loan_dataset.rename(columns = {'Customer ID':'Customer_ID'}, inplace = True)

In [30]:
def append_Customer_ID(customer_ids: dict, Customer_ID: str):
    """Register ``Customer_ID`` in ``customer_ids`` if it is not there yet.

    A new ID is stored with the current size of the dict as its value, so IDs
    receive consecutive integers (0, 1, 2, ...) in order of first appearance.
    An ID that is already present keeps its existing value.

    Args:
        customer_ids: Mapping of customer ID to integer index; mutated in place.
        Customer_ID: The ID to register.

    Returns:
        None. The result is the side effect on ``customer_ids``.
    """
    # setdefault only inserts when the key is absent, leaving known IDs untouched.
    customer_ids.setdefault(Customer_ID, len(customer_ids))
        
# Try the helper on a placeholder ID: start from an empty registry and
# register a single made-up customer ID in it.
Customer_ID = "random id"
customer_ids = dict()
append_Customer_ID(customer_ids, Customer_ID)

In [31]:
# Look up the integer index stored for the placeholder ID.
# NOTE(review): `number` is not referenced again in the visible cells.
number = customer_ids[Customer_ID]

In [32]:
# Split the frame into the label (Y) and the feature matrix (X).
# 'Loan ID' is left out of X together with the label column itself.
Y = loan_dataset['Loan Status']
X = loan_dataset.drop(columns=['Loan ID', 'Loan Status'])

In [33]:
# Print the feature matrix and the label series for a quick visual check.
print(X)
print(Y)

       Current Loan Amount  Term  Annual Income  Purpose  Monthly Debt  \
0                 445412.0     1      1167493.0        3       5214.74   
2               99999999.0     1      2231892.0        1      29200.53   
3                 347666.0     2       806949.0        1       8741.90   
5                 206602.0     1       896857.0        1      16367.74   
6                 217646.0     1      1184194.0        1      10855.08   
...                    ...   ...            ...      ...           ...   
99995             147070.0     1       475437.0        2       2202.86   
99996           99999999.0     1      1289416.0        1      13109.05   
99997             103136.0     1      1150545.0        1       7315.57   
99998             530332.0     1      1717524.0        1       9890.07   
99999           99999999.0     1       935180.0        1       9118.10   

       Years of Credit History  Number of Open Accounts  \
0                         17.2                      

In [34]:
# Hold out 10% of the rows for testing. stratify=Y keeps the label's class
# proportions the same in both splits; random_state=2 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.1,
    stratify=Y,
    random_state=2,
)

In [35]:
# Shapes of the full feature matrix, the training split and the test split.
print(X.shape, X_train.shape, X_test.shape)

(80845, 10) (72760, 10) (8085, 10)


In [36]:
# Linear support vector classifier wrapped in a scaling pipeline.
#
# * SVMs are not scale-invariant, and the feature columns span very different
#   ranges, so the features are standardised first. Keeping the scaler inside
#   the pipeline means it is fitted on whatever is passed to fit() (the
#   training split) and then re-applied automatically inside predict().
# * svm.SVC(kernel='linear') has a fit time that grows at least quadratically
#   with the number of samples, which is impractical for tens of thousands of
#   rows; LinearSVC is the linear SVM implementation intended for that size.
#   dual=False selects the primal solver, preferred when there are far more
#   samples than features.
#
# The pipeline exposes the same fit()/predict() interface as a bare estimator.
classifier = make_pipeline(
    StandardScaler(),
    svm.LinearSVC(dual=False),
)

In [None]:
# Train the support vector machine classifier on the training split.
classifier.fit(X_train,Y_train)

In [None]:
# Accuracy on the training split.
# accuracy_score is declared as (y_true, y_pred), so the true labels go first.
# The variable name's spelling is kept because a later cell prints it.
X_train_prediction = classifier.predict(X_train)
training_data_accuray = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the fraction of training rows that were classified correctly.
print('Accuracy on training data : ', training_data_accuray)

In [None]:
# Accuracy on the held-out test split (rows the classifier was not fitted on).
# accuracy_score is declared as (y_true, y_pred), so the true labels go first.
# The variable name's spelling is kept because a later cell prints it.
X_test_prediction = classifier.predict(X_test)
test_data_accuray = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the fraction of held-out test rows that were classified correctly.
print('Accuracy on test data : ', test_data_accuray)