# Student Loan Risk with Deep Learning

In [243]:
# Imports
import pandas as pd
from pathlib import Path
import tensorflow as tf
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from keras.utils import to_categorical
from keras.optimizers import SGD
from keras.optimizers import Adam

---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student_loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [244]:
# Read the csv into a Pandas DataFrame
file_path = "https://static.bc-edx.com/mbc/ai/m6/datasets/student_loans.csv"


# Review the DataFrame
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [245]:
# Review the data types associated with the columns
df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [246]:
# Define the target set y using the credit_ranking column
y = df['credit_ranking']

# Display a sample of y
y

0       5
1       5
2       5
3       6
4       5
       ..
1594    5
1595    6
1596    6
1597    5
1598    6
Name: credit_ranking, Length: 1599, dtype: int64

In [247]:
y.value_counts()

credit_ranking
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

In [248]:
# Define features set X by selecting all columns but credit_ranking
X = df.copy().drop('credit_ranking', axis= 1)

# Review the features DataFrame
X

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4
...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2


### Step 3: Split the features and target sets into training and testing datasets.


In [249]:
# Split the preprocessed data into a training and testing dataset
# Assign the function a random_state equal to 1
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,stratify=y)

In [250]:
# Import SMOTE from imblearn
from imblearn.over_sampling import SMOTE

# Instantiate the SMOTE instance 
# Set the sampling_strategy parameter equal to auto
smote_sampler = SMOTE(random_state=1, sampling_strategy='auto')

# Fit the training data to the smote_sampler model
X_resampled, y_resampled = smote_sampler.fit_resample(X_train, y_train)

In [251]:
X_resampled

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.900000,0.180000,0.400000,1.800000,0.062000,7.000000,20.000000,0.994100,3.280000,0.700000,11.100000
1,7.000000,0.780000,0.080000,2.000000,0.093000,10.000000,19.000000,0.995600,3.400000,0.470000,10.000000
2,7.500000,0.770000,0.200000,8.100000,0.098000,30.000000,92.000000,0.998920,3.200000,0.580000,9.200000
3,5.900000,0.395000,0.130000,2.400000,0.056000,14.000000,28.000000,0.993620,3.620000,0.670000,12.400000
4,6.400000,0.370000,0.250000,1.900000,0.074000,21.000000,49.000000,0.997400,3.570000,0.620000,9.800000
...,...,...,...,...,...,...,...,...,...,...,...
3061,10.362621,0.337024,0.537786,2.651904,0.072595,5.259522,16.259522,0.996992,3.150000,0.720071,11.181665
3062,8.923103,0.376303,0.490055,2.704621,0.078728,6.635862,17.000000,0.995112,3.165897,0.856414,12.145103
3063,7.546888,0.365133,0.384333,2.792886,0.066902,15.000000,32.964432,0.995999,3.289466,0.809555,12.043331
3064,7.885545,0.381801,0.406517,3.412087,0.076121,17.746416,38.156386,0.996848,3.365900,0.842654,12.785545


In [252]:
y_resampled

0       5
1       5
2       5
3       6
4       6
       ..
3061    8
3062    8
3063    8
3064    8
3065    8
Name: credit_ranking, Length: 3066, dtype: int64

In [253]:
y_resampled.value_counts()

credit_ranking
5    511
6    511
7    511
4    511
3    511
8    511
Name: count, dtype: int64

### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [254]:
# Create a StandardScaler instance
scaler = StandardScaler()
X_scaler = scaler.fit(X_resampled)

# Fit the scaler to the features training dataset
X_train_scaled = X_scaler.transform(X_resampled)

# Fit the scaler to the features training dataset
X_test_scaled = X_scaler.transform(X_test)

---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [255]:
# Define the number of inputs (features) to the model
number_input_features = 11

# Review the number of features
number_input_features

11

In [256]:
# Define the number of neurons in the output layer
output_classes =1

In [257]:
# Define the number of hidden nodes for the first hidden layer
hidden_nodes_layer1 = 8

# Review the number hidden nodes in the first layer
hidden_nodes_layer1

8

In [258]:
# Define the number of hidden nodes for the second hidden layer
hidden_nodes_layer2 = 4

# Review the number hidden nodes in the second layer
hidden_nodes_layer2

4

In [259]:
# Create the Sequential model instance
nn = Sequential()

In [260]:
# Add the first hidden layer
nn.add(Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu"))


In [261]:
# Add the second hidden layer
nn.add(Dense(units=hidden_nodes_layer2, activation="relu"))

In [262]:
# Add the output layer to the model specifying the number of output neurons and activation function
nn.add(Dense(units=output_classes, activation="linear"))

In [263]:
# Display the Sequential model summary
nn.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_31 (Dense)            (None, 8)                 96        
                                                                 
 dense_32 (Dense)            (None, 4)                 36        
                                                                 
 dense_33 (Dense)            (None, 1)                 5         
                                                                 
Total params: 137
Trainable params: 137
Non-trainable params: 0
_________________________________________________________________


### Step 2: Compile and fit the model using the `mse` loss function, the `adam` optimizer, and the `mse` evaluation metric.


In [264]:
#opt = SGD(lr=0.001, momentum=0.9)
opt  = Adam(learning_rate=0.00001)
# Compile the Sequential model
nn.compile(loss="mean_squared_error", optimizer=opt, metrics=["mse"])

In [265]:
# Fit the model using 50 epochs and the training data
model = nn.fit(X_train_scaled, y_resampled, epochs=500, validation_data=(X_test_scaled, y_test))

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [266]:
# Evaluate the model loss and accuracy metrics using the evaluate method and the test data
model_loss, model_mse = nn.evaluate(X_test_scaled, y_test, verbose=2)

# Display the model loss and accuracy results
print(f"Loss: {model_loss}, Accuracy: {model_mse}")

13/13 - 0s - loss: 3.8054 - mse: 3.8054 - 21ms/epoch - 2ms/step
Loss: 3.8054003715515137, Accuracy: 3.8054003715515137


### Step 4: Save and export your model to an HDF5 file, and name the file `student_loans.h5`.


In [267]:
# Set the model's file path
file_path = Path("saved_models/student_loans.h5")

# Export your model to a HDF5 file
nn.save(file_path)

---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [268]:
# Set the model's file path
file_path = Path("saved_models/student_loans.h5")

# Load the model to a new object
nn_imported = tf.keras.models.load_model(file_path)

### Step 2: Make predictions on the testing data.

In [269]:
# Make predictions on the testing data
predictions = nn_imported.predict(X_test_scaled).round().astype("int32")



### Step 3: Create a DataFrame to compare the predictions with the actual values.

In [270]:
# Create a DataFrame to compare the predictions with the actual values
results = pd.DataFrame({"predictions": predictions.ravel(), "actual": y_test})
results

Unnamed: 0,predictions,actual
551,3,6
1413,6,5
1090,7,8
1369,5,4
536,4,5
...,...,...
624,5,5
1532,3,6
1073,2,6
839,4,5


### Step 4: Display a sample of the DataFrame you created in step 3.

In [271]:
# Display sample data
results.head(100)

Unnamed: 0,predictions,actual
551,3,6
1413,6,5
1090,7,8
1369,5,4
536,4,5
...,...,...
996,7,7
148,3,6
618,5,5
135,3,5
