In [15]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session (deprecation and convergence warnings included); consider narrowing
# the filter to the specific categories that are known to be noisy.
warnings.filterwarnings('ignore')


## Retrieve the Data

The data is located in the `Resources` folder of the Challenge Files:

* `lending_data.csv`

Import the data using Pandas. Display the resulting dataframe to confirm the import was successful.


In [16]:
# Read the lending CSV from the Resources folder into a DataFrame,
# then preview the first rows to confirm the import worked
file_path = Path('Resources') / 'lending_data.csv'
credit_risk = pd.read_csv(file_path)
credit_risk.head()


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


### **PART 1: Prepare the Data**

In [17]:
# List the column labels of the loaded frame to confirm the expected schema
credit_risk.columns


Index(['loan_size', 'interest_rate', 'borrower_income', 'debt_to_income',
       'num_of_accounts', 'derogatory_marks', 'total_debt', 'loan_status'],
      dtype='object')

In [18]:
# Number of missing (null) entries in each column
credit_risk.isna().sum()


loan_size           0
interest_rate       0
borrower_income     0
debt_to_income      0
num_of_accounts     0
derogatory_marks    0
total_debt          0
loan_status         0
dtype: int64

In [26]:
# Separate the data set into features (X) and target (y).
#
# These assignments must be live: the scaling and PCA cells further down read
# `X`, so leaving them commented out raises NameError on Restart & Run All.
# NOTE(review): `loan_status` looks like the class label of this dataset, yet
# it remains a feature column in X here - confirm whether it (rather than
# `loan_size`) should be the target that is split off.
y = credit_risk["loan_size"].values
X = credit_risk.drop('loan_size', axis=1)

# Preview the untouched source frame
credit_risk.head()


Unnamed: 0,loan_size,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,10700.0,7.672,52800,0.431818,5,1,22800,0
1,8400.0,6.692,43600,0.311927,3,0,13600,0
2,9000.0,6.963,46100,0.349241,3,0,16100,0
3,10700.0,7.664,52700,0.43074,5,1,22700,0
4,10800.0,7.698,53000,0.433962,5,1,23000,0


In [27]:
# Rescale every feature column with a StandardScaler fitted on X
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Show the first five standardized rows
print(X_scaled[:5])


[[ 0.4268375   0.42740435  0.66857141  0.61614258  1.04399575  0.42740435
  -0.18253038]
 [-0.67491817 -0.67155173 -0.80216566 -0.43404935 -0.67397306 -0.67155173
  -0.18253038]
 [-0.370249   -0.37292236 -0.34442423 -0.43404935 -0.67397306 -0.37292236
  -0.18253038]
 [ 0.41784357  0.41545918  0.65534557  0.61614258  1.04399575  0.41545918
  -0.18253038]
 [ 0.45606775  0.4512947   0.69487335  0.61614258  1.04399575  0.4512947
  -0.18253038]]


### **PART 2: Apply Dimensionality Reduction**

In [28]:
# Build a PCA model; a fractional n_components asks for as many components
# as are needed to explain that share (90%) of the variance
pca = PCA(n_components=0.90)

# Project the standardized features onto the retained components
X_pca = pca.fit_transform(X_scaled)

# Compare the feature count of the original and reduced datasets
n_features_original = X.shape[1]
n_features_reduced = X_pca.shape[1]
print('Number of features in original dataset:', n_features_original)
print('Number of features in reduced dataset:', n_features_reduced)

Number of features in original dataset: 7
Number of features in reduced dataset: 2


In [32]:
# Build a copy of the lending data without the loan_size column.
# NOTE(review): the t-SNE step visible in this notebook is fit on X_pca, not
# on this frame - confirm where credit_2 is actually consumed.
credit_2 = credit_risk.drop(columns=['loan_size'])
credit_2.head()

Unnamed: 0,interest_rate,borrower_income,debt_to_income,num_of_accounts,derogatory_marks,total_debt,loan_status
0,7.672,52800,0.431818,5,1,22800,0
1,6.692,43600,0.311927,3,0,13600,0
2,6.963,46100,0.349241,3,0,16100,0
3,7.664,52700,0.43074,5,1,22700,0
4,7.698,53000,0.433962,5,1,23000,0


In [33]:
# Initialize t-SNE model.
# random_state is pinned because the t-SNE embedding is stochastic: without a
# seed every run yields a different layout, so any plot or clustering built on
# these features could not be reproduced after a kernel restart.
tsne = TSNE(learning_rate=50, random_state=42)

# Reduce dimensions (embeds the PCA-compressed features)
tsne_features = tsne.fit_transform(X_pca)

# Print number of features
print('Number of features in reduced dataset:', tsne_features.shape[1])

Number of features in reduced dataset: 2


In [None]:
# Prepare the plot for the dataset

# Create a new dataframe with the transformed features
