# Student Loan Risk with Deep Learning

In [None]:
# !pip install plotly.express

In [None]:
# NOTE: pathlib is part of the Python standard library (3.4+); no install is needed.
# !pip install pathlib

In [1]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import sklearn.model_selection
import sklearn.preprocessing
import tensorflow as tf
from matplotlib import pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow import keras
from tensorflow.keras.layers import Dense, LeakyReLU
from tensorflow.keras.models import Sequential


---

## Prepare the data to be used on a neural network model

### Step 1: Read the `student-loans.csv` file into a Pandas DataFrame. Review the DataFrame, looking for columns that could eventually define your features and target variables.   

In [2]:
# Remote location of the student-loan dataset
file_path = "https://static.bc-edx.com/ai/ail-v-1-0/m18/lms/datasets/student-loans.csv"

# Load the CSV into a DataFrame and preview the first five rows
loans_df = pd.read_csv(filepath_or_buffer=file_path)
loans_df.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,0


In [None]:
# @title payment_history vs location_parameter

# Scatter two features against each other to eyeball their relationship.
# An explicit Figure/Axes pair is used instead of pyplot's implicit "current
# axes", so the styling below always targets this plot.
fig, ax = plt.subplots()
loans_df.plot(
    kind="scatter",
    x="payment_history",
    y="location_parameter",
    s=32,
    alpha=0.8,
    ax=ax,
)
ax.set_title("payment_history vs location_parameter")
# Hide the top and right frame lines for a cleaner look.
ax.spines[["top", "right"]].set_visible(False)

In [3]:
# Reminder - explore plotly express

In [4]:
# Summary statistics for each column of loans_df; handy for spotting
# differences in scale between features before modeling.
loans_df.describe()


Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
count,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0,1599.0
mean,8.319637,0.527821,0.270976,2.538806,0.087467,15.874922,46.467792,0.996747,3.311113,0.658149,10.422983,0.534709
std,1.741096,0.17906,0.194801,1.409928,0.047065,10.460157,32.895324,0.001887,0.154386,0.169507,1.065668,0.49895
min,4.6,0.12,0.0,0.9,0.012,1.0,6.0,0.99007,2.74,0.33,8.4,0.0
25%,7.1,0.39,0.09,1.9,0.07,7.0,22.0,0.9956,3.21,0.55,9.5,0.0
50%,7.9,0.52,0.26,2.2,0.079,14.0,38.0,0.99675,3.31,0.62,10.2,1.0
75%,9.2,0.64,0.42,2.6,0.09,21.0,62.0,0.997835,3.4,0.73,11.1,1.0
max,15.9,1.58,1.0,15.5,0.611,72.0,289.0,1.00369,4.01,2.0,14.9,1.0


In [5]:
# Review the dtype of every column to confirm the data is numeric
# before it is scaled and fed to the network.
loans_df.dtypes

payment_history           float64
location_parameter        float64
stem_degree_score         float64
gpa_ranking               float64
alumni_success            float64
study_major_code          float64
time_to_completion        float64
finance_workshop_score    float64
cohort_ranking            float64
total_loan_score          float64
financial_aid_score       float64
credit_ranking              int64
dtype: object

In [6]:
# Class balance of the target: number of rows per credit_ranking value
print("")
loans_df.credit_ranking.value_counts()




1    855
0    744
Name: credit_ranking, dtype: int64

In [7]:
# (rows, columns) of the loaded DataFrame
loans_df.shape

(1599, 12)

In [8]:
# Display the column names, one per line
print("Column Names:")
print(*loans_df.columns, sep="\n")

Column Names:
payment_history
location_parameter
stem_degree_score
gpa_ranking
alumni_success
study_major_code
time_to_completion
finance_workshop_score
cohort_ranking
total_loan_score
financial_aid_score
credit_ranking


### Step 1.1: Remove features to see if model performance improves.

In [None]:
# OPTIONAL:REMOVE FEATURES FROM ORIGINAL DATAFRAME
# loans_df = loans_df.drop(columns=["location_parameter", "finance_workshop_score", "financial_aid_score"], axis=1)
# loans_df.head()

### Step 2: Using the preprocessed data, create the features (`X`) and target (`y`) datasets. The target dataset should be defined by the preprocessed DataFrame column “credit_ranking”. The remaining columns should define the features dataset.

In [9]:
# Target vector: the credit_ranking column of the loans DataFrame
y = loans_df.loc[:, "credit_ranking"]

# Independent NumPy copy of the target, used only for a compact printout
array_y = y.to_numpy(copy=True)

# Display y as an array
print(array_y)

[0 0 0 ... 1 0 1]


In [10]:
# Feature matrix: every column except the credit_ranking target
X = loans_df.drop(columns=["credit_ranking"])

# Preview the first rows of the features DataFrame
X.head()

Unnamed: 0,payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


### Step 3: Split the features and target sets into training and testing datasets.


In [11]:
# Hold out part of the data for testing.
# random_state=1 fixes the shuffle so the split is identical on every run;
# test_size=0.25 spells out scikit-learn's default 75% / 25% train/test ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

# Resulting objects:
#   X_train / X_test -> feature rows used for training / testing
#   y_train / y_test -> the matching credit_ranking target values

In [None]:
# Peek at the first three rows of the training features (not yet scaled here)
X_train.head(3)

### Step 4: Use scikit-learn's `StandardScaler` to scale the features data.

In [12]:
# Create a StandardScaler instance.
# StandardScaler standardizes each feature by removing its mean and scaling
# it to unit variance.
scaler = StandardScaler()

# Fit the scaler to the features training dataset.
# Only X_train is used, so the mean and standard deviation are computed from
# training rows alone; fit() hands back the fitted scaler instance.
X_scaler = scaler.fit(X_train)

# Scale (transform) both feature sets with the fitted scaler.
# Training and test data are standardized with the same statistics, which
# were computed from X_train in the fitting step.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

# X_train_scaled and X_test_scaled are now ready for model development.

In [13]:
# Inspect the scaled training features (values are now centered around zero)
X_train_scaled

array([[-0.56794375, -1.18088385, -0.19241296, ...,  0.73249574,
        -0.22553894, -0.31915735],
       [-0.00940983,  0.80065564, -0.04123135, ..., -0.03016065,
        -0.75844691, -1.15783431],
       [ 2.33643265, -0.84597577,  1.67216021, ..., -1.36480934,
         0.42579302,  0.05358797],
       ...,
       [-0.9589175 ,  1.07974571, -0.89792713, ...,  0.22405814,
        -0.9952949 ,  0.33314696],
       [-0.62379715,  0.54947458, -1.35147195, ...,  0.35116754,
        -0.46238693, -1.34420697],
       [ 0.43741731, -0.73433974,  1.16822152, ..., -0.66570765,
         0.18894504,  1.91731456]])

In [None]:
# OPTIONAL
# import matplotlib.pyplot as plt
# plt.imshow(X_train_scaled, aspect='auto')
# plt.colorbar()
# plt.show()

In [None]:
# OPTIONAL
# plt.imshow(X_test_scaled, aspect='auto')
# plt.colorbar()
# plt.show()

In [14]:
# Tabulate the shape of every dataset so the split sizes can be checked at a glance
data = {
    'Dataset': ['X', 'X_train', 'X_test', 'y', 'y_train', 'y_test'],
    'Shape': [part.shape for part in (X, X_train, X_test, y, y_train, y_test)],
}

# Turn the name/shape pairs into a small DataFrame and print it
df = pd.DataFrame(data=data)
print(df)

   Dataset       Shape
0        X  (1599, 11)
1  X_train  (1199, 11)
2   X_test   (400, 11)
3        y     (1599,)
4  y_train     (1199,)
5   y_test      (400,)


---

## Compile and Evaluate a Model Using a Neural Network

### Step 1: Create a deep neural network by assigning the number of input features, the number of layers, and the number of neurons on each layer using Tensorflow’s Keras.

> **Hint** You can start with a two-layer deep neural network model that uses the `relu` activation function for both layers.


In [16]:
# Define the number of inputs (features) to the model: one per column of the
# scaled training data (equivalently: len(X_train_scaled[0]))
rows, columns = X_train_scaled.shape
number_input_features = columns

# Width of each hidden layer
hidden_nodes_layer1 = hidden_nodes_layer2 = 6
# hidden_nodes_layer3 = 6  # only needed for the optional third hidden layer

# Review the number of features and the layer sizes.
# The empty first item reproduces the leading blank line of the report.
print(
    "",
    f"Features Count: {number_input_features}",
    f"Layer1 Neurons: {hidden_nodes_layer1}",
    f"Layer2 Neurons: {hidden_nodes_layer2}",
    # f"Layer3 Neurons: {hidden_nodes_layer3}",
    sep="\n",
)


Features Count: 11
Layer1 Neurons: 6
Layer2 Neurons: 6


In [17]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# (and the batch shuffling done later by fit) is repeatable on a fresh
# "Restart & Run All". The value 1 mirrors random_state=1 used for the split.
tf.keras.utils.set_random_seed(1)

# Create a sequential neural network model.
# keras.Input declares the feature count explicitly instead of passing the
# legacy input_dim argument to the first Dense layer.
model = Sequential(
    [
        keras.Input(shape=(number_input_features,)),
        # Hidden layer 1
        Dense(units=hidden_nodes_layer1, activation="relu"),
        # Hidden layer 2
        Dense(units=hidden_nodes_layer2, activation="relu"),
        # Optional hidden layer 3 (LeakyReLU added as its own layer):
        # Dense(units=hidden_nodes_layer3),
        # LeakyReLU(alpha=0.01),
        # Output layer: a single sigmoid unit for the binary credit_ranking target
        Dense(units=1, activation="sigmoid"),
    ]
)

In [18]:
# Print the layer-by-layer summary (output shapes and parameter counts)
# to confirm the architecture before training.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 6)                 72        
                                                                 
 dense_1 (Dense)             (None, 6)                 42        
                                                                 
 dense_2 (Dense)             (None, 1)                 7         
                                                                 
Total params: 121 (484.00 Byte)
Trainable params: 121 (484.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


## Why does 'leaky_re_lu' row show zero in Param #?

In neural network models, particularly when summarized in libraries like TensorFlow/Keras, the "Param #" column indicates the number of parameters within each layer (the summary footer then splits the total into trainable and non-trainable). Trainable parameters are those that are adjusted via backpropagation during the training process, such as the weights and biases in dense (fully connected) layers.

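The LeakyReLU layer, like most standard activation functions (ReLU, sigmoid, tanh), does not have trainable parameters in the same sense that dense layers do. (Parametric variants such as PReLU, which learn their slope during training, are the exception.)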

Activation functions like LeakyReLU are mathematical operations applied to the input they receive. In the case of LeakyReLU, it applies the function f(x) = x for x > 0 and f(x) = αx for x ≤ 0, where α is a small constant.

While α is a parameter, it is not a trainable parameter; it's a hyperparameter that you set before training the model. Because it does not change during training, it does not count towards the model's trainable parameters.

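That's why you would see "0" under "Param #" for the leaky_re_lu (LeakyReLU) row in the model summary when the optional third hidden layer (Dense followed by LeakyReLU) is enabled; the two-layer model summarized above does not include that row. A zero simply means there are no weights or biases to be learned within that layer; it only performs a fixed mathematical operation on its inputs.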

### Step 2: Compile and fit the model using the `binary_crossentropy` loss function, the `adam` optimizer, and the `accuracy` evaluation metric.


In [19]:
# Configure training for the Sequential model: Adam optimizer, binary
# cross-entropy loss, and accuracy as the reported metric.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [20]:
# Fit the model on the scaled training data for 100 epochs;
# the returned History object is kept in fit_model.
fit_model = model.fit(
    x=X_train_scaled,
    y=y_train,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Step 3: Evaluate the model using the test data to determine the model’s loss and accuracy.


In [21]:
# Score the trained model on the held-out (scaled) test data
model_loss, model_accuracy = model.evaluate(x=X_test_scaled, y=y_test, verbose=2)

# Display evaluation results; the leading "\n" reproduces the blank separator line
print(f"\nLoss: {model_loss:.2f}, Accuracy: {model_accuracy:.2f}")

13/13 - 0s - loss: 0.5384 - accuracy: 0.7550 - 274ms/epoch - 21ms/step

Loss: 0.54, Accuracy: 0.75


### Step 4: Save and export your model to a keras file, and name the file `student_loans.keras`.


In [22]:
# Destination of the exported model (".keras" file in the working directory)
file_path = Path("student_loans.keras")

# Persist the trained model to that file
model.save(filepath=file_path)

---
## Predict Loan Repayment Success by Using your Neural Network Model

### Step 1: Reload your saved model.

In [23]:
# Path of the previously exported model file
file_path = Path("student_loans.keras")

# Rebuild the model from disk into a separate object
new_model = keras.models.load_model(filepath=file_path)

### Step 2: Make predictions on the testing data and save the predictions to a DataFrame.

In [24]:
# Run the reloaded model on the scaled test features; each row yields one
# sigmoid output between 0 and 1
predictions = new_model.predict(x=X_test_scaled)

# Display the first 5 predictions
print(predictions[:5])

[[0.12596737]
 [0.33481395]
 [0.9152258 ]
 [0.73194474]
 [0.98359704]]


In [25]:
# Build the predictions DataFrame in one expression:
#   'Prediction'       -> raw model output
#   'BinaryPrediction' -> 1 when the output is above 0.5, otherwise 0
predictions_df = pd.DataFrame(predictions, columns=['Prediction']).assign(
    BinaryPrediction=lambda frame: (frame['Prediction'] > 0.5).astype(int)
)

predictions_df.head()

Unnamed: 0,Prediction,BinaryPrediction
0,0.125967,0
1,0.334814,0
2,0.915226,1
3,0.731945,1
4,0.983597,1


In [26]:
# Keep only the 0/1 labels held in 'BinaryPrediction'.
# The 'Prediction' column holds continuous scores, which the classification
# metrics cannot compare against the binary y_test labels.
predictions_rounded = predictions_df.drop(columns=["Prediction"])
predictions_rounded.head()

Unnamed: 0,BinaryPrediction
0,0
1,0
2,1
3,1
4,1


### Step 3: Display a classification report with the y test data and predictions

In [27]:
# Compare the true test labels with the binary predictions and print
# per-class precision, recall and f1-score
print(classification_report(y_true=y_test, y_pred=predictions_rounded))

              precision    recall  f1-score   support

           0       0.73      0.76      0.74       188
           1       0.78      0.75      0.76       212

    accuracy                           0.76       400
   macro avg       0.75      0.76      0.75       400
weighted avg       0.76      0.76      0.76       400



---
## Discuss creating a recommendation system for student loans

Briefly answer the following questions in the space provided:

1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.

2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.

3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.

**1. Describe the data that you would need to collect to build a recommendation system to recommend student loan options for students. Explain why this data would be relevant and appropriate.**

To build a recommendation system for student loan options, the following data is needed:
- y = prior student loan performance.  y can take one of three integer values: 0 (defaulted on loan), 1 (late on payments), 2 (current = fewer than 1 payment missed per 12 months)

Defining y in this way enables a more nuanced approach to defining loan status, which should help the model be more accurate in its predictions.  The downside is more 'rows' of data are needed than when y is just two binary choices (0,1).

- features:
*  financial need as family income, funds earmarked for the loan, expenses as % of income, liquid assets as % of net worth, student income history and projection
*  academic performance: gpa, athlete, band, clubs
*  education credentials: college ranking & % hired from college, major ranking & % hired from college, minor as binary 0/1, student clubs 0/1
*  education level: undergraduate, graduate, doctoral, certificate
*  degree sought: Associates, BA, BS, MA, MS, PhD
*  institution type: community, 4-year, vocational school
*  interest rates
*  repayment terms
*  loan amt
*  fees
*  loan forgiveness programs
*  lender reputation
*  external economic indicators
*  legislation effects

These features are relevant because they contextualize the main drivers of loan performance: the borrower's ability to pay, the lender's willingness to foreclose, and key external factors.


**2. Based on the data you chose to use in this recommendation system, would your model be using collaborative filtering, content-based filtering, or context-based filtering? Justify why the data you selected would be suitable for your choice of filtering method.**

I would choose context-based filtering for features like education level and degree sought. I would also choose content-based filtering for features like interest rates, loan terms, and eligibility criteria.


**3. Describe two real-world challenges that you would take into consideration while building a recommendation system for student loans. Explain why these challenges would be of concern for a student loan recommendation system.**

Bias.  Bias inherent in data may cause false negatives.

Data Privacy and Security.  Handling sensitive personal and financial information requires adherence to regulations and laws.

Cold Start Problem.  How can we address users with a lot of missing data?

Regulatory Compliance.  Loans must comply with federal, state, local laws.

Explainability.  It's important the user understand what decisions are being made and how they are being made.  This improves confidence and trust, and in some way, validates the recommendation.