In [2]:
#*******************************************************************************************
 #
 #  File Name:  student_loans.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, student_loans.ipynb, reads a csv file,
 #      student_loans.csv, and uses deep learning methods to process the features 
 #      in the provided dataset and create a binary classifier that can predict 
 #      whether student loan applicants will default or not.
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/15/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

import logx
import pandas_processx
import student_loans_constants

import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from silence_tensorflow import silence_tensorflow
silence_tensorflow()

In [3]:
CONSTANT_LOCAL_FILE_NAME = 'student_loans.ipynb'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('student_loans')

# <br> **Section 1: Extraction**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [4]:
student_loan_dataframe = pd.read_csv(student_loans_constants.CONSTANT_INPUT_FILE_PATH)

logx.log_write_object(student_loan_dataframe)

## **1.2: Display Student Loan DataFrame**

In [5]:
pandas_processx.return_formatted_table(student_loan_dataframe, 'Table 1.2: Student Loan Table')

payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.07,17.0,60.0,1.0,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5
7.4,0.66,0.0,1.8,0.07,13.0,40.0,1.0,3.51,0.56,9.4,5
7.9,0.6,0.06,1.6,0.07,15.0,59.0,1.0,3.3,0.46,9.4,5
7.3,0.65,0.0,1.2,0.07,15.0,21.0,0.99,3.39,0.47,10.0,7
7.8,0.58,0.02,2.0,0.07,9.0,18.0,1.0,3.36,0.57,9.5,7
7.5,0.5,0.36,6.1,0.07,17.0,102.0,1.0,3.35,0.8,10.5,5


# <br> **Section 2: Preprocessing**

## **2.1: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, the Labels**

In [6]:
y_series = student_loan_dataframe['credit_ranking']

logx.log_write_object(y_series)

In [7]:
y_series.value_counts()

credit_ranking
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [8]:
x_dataframe = student_loan_dataframe.drop(columns = 'credit_ranking', axis = 1)

logx.log_write_object(x_dataframe)

In [9]:
pandas_processx.return_formatted_table(x_dataframe, 'Table 2.1: Student Loan Features DataFrame')

payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4
7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8
7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8
11.2,0.28,0.56,1.9,0.07,17.0,60.0,1.0,3.16,0.58,9.8
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4
7.4,0.66,0.0,1.8,0.07,13.0,40.0,1.0,3.51,0.56,9.4
7.9,0.6,0.06,1.6,0.07,15.0,59.0,1.0,3.3,0.46,9.4
7.3,0.65,0.0,1.2,0.07,15.0,21.0,0.99,3.39,0.47,10.0
7.8,0.58,0.02,2.0,0.07,9.0,18.0,1.0,3.36,0.57,9.5
7.5,0.5,0.36,6.1,0.07,17.0,102.0,1.0,3.35,0.8,10.5


## **2.2: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [10]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series, 
         random_state = student_loans_constants.CONSTANT_DL_RANDOM_STATE_1)

In [11]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **2.3: Use the StandardScaler to Scale the X Variables**

### **Create a StandardScaler Instance**

In [12]:
current_standard_scalar = StandardScaler()

### **Fit the StandardScaler**

In [13]:
x_standard_scalar = current_standard_scalar.fit(x_train_dataframe)

### **Scale the Data**

In [14]:
x_train_scaled_nparray = x_standard_scalar.transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [15]:
x_test_scaled_nparray = x_standard_scalar.transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

# <br> **Section 3: Compile, Train, Evaluate, and Export the Model**

## **3.1: Compile Model**

### **Model Definition**

In [16]:
number_input_features_integer = len(x_train_scaled_nparray[0])

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of inputs (features) in the model is {:,}.' \
         .format(number_input_features_integer) 
     + '\033[0m')

[1mThe number of inputs (features) in the model is 11.[0m


In [17]:
activation_layer1_string = 'relu'

hidden_nodes_layer1_integer = 8

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the first hidden layer is {:,}.' \
         .format(hidden_nodes_layer1_integer) 
     + '\033[0m')

[1mThe number of nodes in the first hidden layer is 8.[0m


In [18]:
activation_layer2_string = 'relu'

hidden_nodes_layer2_integer = 4

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the second hidden layer is {:,}.' \
         .format(hidden_nodes_layer2_integer) 
     + '\033[0m')

[1mThe number of nodes in the second hidden layer is 4.[0m


In [19]:
activation_output_layer_string = 'linear'

output_layer_integer = 1

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the output layer is {:,}.' \
         .format(output_layer_integer) 
     + '\033[0m')

[1mThe number of nodes in the output layer is 1.[0m


### **Instantiate the Model**

In [20]:
neural_network_sequential_model = tf.keras.models.Sequential()

### **Layers**

In [21]:
neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = hidden_nodes_layer1_integer, 
          activation = activation_layer1_string, 
          input_dim = number_input_features_integer))

neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = hidden_nodes_layer2_integer, 
          activation = activation_layer2_string))

neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = output_layer_integer, 
          activation = activation_output_layer_string))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### **Model Summary**

In [22]:
neural_network_sequential_model.summary()

### **Compile**

In [23]:
neural_network_sequential_model.compile(loss = 'mean_squared_error', optimizer = 'adam', metrics = ['mse'])

## **3.2: Fit and Train Model**

In [24]:
earlystopping_callback \
    = tf.keras.callbacks.EarlyStopping  \
        (monitor = 'val_mse', mode = 'min', patience = 42, restore_best_weights = True)

neural_network_sequential_model \
    .fit \
        (x_train_scaled_nparray,
         y_train_series.values,
         epochs = 1000,
         validation_data = (x_test_scaled_nparray, y_test_series.values),
         callbacks = [earlystopping_callback])

Epoch 1/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 28.8864 - mse: 28.8864 - val_loss: 26.4748 - val_mse: 26.4748
Epoch 2/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 25.0582 - mse: 25.0582 - val_loss: 21.7818 - val_mse: 21.7818
Epoch 3/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 21.1613 - mse: 21.1613 - val_loss: 16.4217 - val_mse: 16.4217
Epoch 4/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 15.5242 - mse: 15.5242 - val_loss: 11.2381 - val_mse: 11.2381
Epoch 5/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 10.0926 - mse: 10.0926 - val_loss: 7.5534 - val_mse: 7.5534
Epoch 6/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 6.9507 - mse: 6.9507 - val_loss: 5.5020 - val_mse: 5.5020
Epoch 7/1000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x16a012850>

## **3.3: Evaluate Model**

In [25]:
model_loss_float, model_mse_float \
    = neural_network_sequential_model.evaluate(x_test_scaled_nparray, y_test_series.values, verbose = 2)

logx.print_and_log_text \
    (f'\nModel Loss: {round(model_loss_float * 100, 2)}%, '
     + f'Model Accuracy: {round(model_mse_float * 100, 2)}%')

13/13 - 0s - 1ms/step - loss: 0.4435 - mse: 0.4435

Model Loss: 44.35%, Model Accuracy: 44.35%


## **3.4: Save and Export Model**

In [26]:
neural_network_sequential_model.save(student_loans_constants.CONSTANT_NN_MODEL_FILE_PATH)

# <br> **Section 4: Predict Loan Repayment Success**

## **4.1: Reload Model**

In [27]:
reloaded_neural_network_sequential_model \
    = tf.keras.models.load_model \
        (student_loans_constants.CONSTANT_NN_MODEL_FILE_PATH)

## **4.2: Predictions**

In [28]:
predictions_nparray \
    = reloaded_neural_network_sequential_model.predict \
        (x_test_scaled_nparray).round().astype('int32')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


## **4.3: Compare Predictions and Actual Values**

In [29]:
comparison_dataframe \
    = pd.DataFrame({'predictions': predictions_nparray.ravel(), 'actual': y_test_series.values})

In [30]:
pandas_processx.return_formatted_table \
    (comparison_dataframe, 'Table 4.3: Model Predictions vs. Actual Values')

predictions,actual
6,6
6,6
5,5
5,6
6,6
5,7
5,4
6,5
5,5
5,5


In [31]:
# logx.end_program()