<a href="https://colab.research.google.com/github/mzignis/loan_prediction/blob/master/svm_linear_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
HOME = '/content/drive/My Drive/Colab Notebooks/projects/loan_prediction/loan_prediction'
%cd $HOME

/content/drive/My Drive/Colab Notebooks/projects/loan_prediction/loan_prediction


In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.svm import LinearSVC
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import mean_squared_error, confusion_matrix
from sklearn.model_selection import GridSearchCV

### Load data

In [22]:
train_data = pd.read_pickle('data/train.p')
X_train_df, y_train_df = train_data['X_train'], train_data['y_train']
X_train_df.shape, y_train_df.shape

((614, 21), (614,))

In [23]:
test_data = pd.read_pickle('data/test.p')
X_test_df = test_data['X_test']
X_test_df.shape

(367, 21)

### Preapare data

In [0]:
X_train, y_train = X_train_df.values, y_train_df.values

### Creating model

In [0]:
model = LinearSVC()
model.fit(X_train, y_train)
y_train_pred = model.predict(X_train)

In [13]:
def score_model(y_true, y_pred):
    print(f'Mean Square Error: {mean_squared_error(y_true, y_pred):.3f}')
    print(f'Accuracy:          {accuracy_score(y_true, y_pred):.3f}')
    print(f'Precission:        {precision_score(y_true, y_pred):.3f}')
    print(f'Recall:            {recall_score(y_true, y_pred):.3f}')
    print(f'F1:                {f1_score(y_true, y_pred):.3f}')
    print()

    df = pd.DataFrame(
        confusion_matrix(y_true, y_pred), 
        index=['Actual False', 'Actual True'], 
        columns=['Detected False', 'Detected True']
    )

    return df

score_model(y_train, y_train_pred)

Mean Square Error: 0.187
Accuracy:          0.813
Precission:        0.793
Recall:            0.983
F1:                0.878



Unnamed: 0,Detected False,Detected True
Actual False,84,108
Actual True,7,415


### Select best model

In [15]:
param = {
    'random_state': [42],
    'penalty': ['l1', 'l2'],
    'dual': [False],
    'tol': np.linspace(1e-2, 0)[:-1], 
    'C': np.linspace(1, 10),
    'max_iter': [1e6]
}

model = LinearSVC()
grid_searcher = GridSearchCV(model, param, cv=20)
grid_searcher.fit(X_train, y_train)
grid_searcher.best_estimator_

LinearSVC(C=1.183673469387755, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000000.0, multi_class='ovr', penalty='l2', random_state=42,
          tol=0.00020408163265306194, verbose=0)

In [16]:
model = grid_searcher.best_estimator_
model

LinearSVC(C=1.183673469387755, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, loss='squared_hinge',
          max_iter=1000000.0, multi_class='ovr', penalty='l2', random_state=42,
          tol=0.00020408163265306194, verbose=0)

In [17]:
y_pred = model.predict(X_train)
score_model(y_train, y_train_pred)

Mean Square Error: 0.187
Accuracy:          0.813
Precission:        0.793
Recall:            0.983
F1:                0.878



Unnamed: 0,Detected False,Detected True
Actual False,84,108
Actual True,7,415


### Feature relevance

In [25]:
features_df = pd.DataFrame(zip(X_train_df.columns, model.coef_[0]), columns=['Feature', 'Coef_'])
features_df['Relevance'] = features_df['Coef_'] / np.abs(model.coef_).sum()
features_df['Relevance (ABS)'] = features_df['Coef_'].abs() / np.abs(model.coef_).sum()
features_df.sort_values(by='Relevance (ABS)', ascending=False)

Unnamed: 0,Feature,Coef_,Relevance,Relevance (ABS)
18,CoapplicantIncome,-0.76483,-0.20185,0.20185
13,Credit_History_1.0,0.706377,0.186424,0.186424
12,Credit_History_0.0,-0.705002,-0.186061,0.186061
19,LoanAmount,-0.373019,-0.098446,0.098446
17,ApplicantIncome,0.197193,0.052042,0.052042
20,Loan_Amount_Term,-0.173979,-0.045916,0.045916
15,Property_Area_Semiurban,0.15035,0.03968,0.03968
5,Dependents_1,-0.124353,-0.032819,0.032819
14,Property_Area_Rural,-0.112989,-0.029819,0.029819
3,Married_Yes,0.093988,0.024805,0.024805


In [26]:
features_df.sort_values(by='Relevance', ascending=False)

Unnamed: 0,Feature,Coef_,Relevance,Relevance (ABS)
13,Credit_History_1.0,0.706377,0.186424,0.186424
17,ApplicantIncome,0.197193,0.052042,0.052042
15,Property_Area_Semiurban,0.15035,0.03968,0.03968
3,Married_Yes,0.093988,0.024805,0.024805
6,Dependents_2,0.089259,0.023557,0.023557
8,Education_Graduate,0.062976,0.01662,0.01662
4,Dependents_0,0.019726,0.005206,0.005206
7,Dependents_3+,0.016742,0.004419,0.004419
0,Gender_Female,0.004054,0.00107,0.00107
11,Self_Employed_Yes,0.001013,0.000267,0.000267
