In [1]:
#*******************************************************************************************
 #
 #  File Name:  student_loans.ipynb
 #
 #  File Description:
 #      This interactive Python notebook, student_loans.ipynb, reads a csv file,
 #      student_loans.csv, and uses deep learning methods to process the features 
 #      in the provided dataset and create a binary classifier that can predict 
 #      whether student loan applicants will default or not.
 #
 #
 #  Date            Description                             Programmer
 #  ----------      ------------------------------------    ------------------
 #  04/15/2024      Initial Development                     Nicholas J. George
 #
 #******************************************************************************************/

import logx
import pandas_processx
import student_loans_constants

import pandas as pd
import tensorflow as tf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from silence_tensorflow import silence_tensorflow
silence_tensorflow()

2024-04-17 09:02:03.739456: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
CONSTANT_LOCAL_FILE_NAME = 'student_loans.ipynb'


logx.set_log_mode(False)

logx.set_image_mode(False)


logx.begin_program('student_loans')

# <br> **Section 1: Extraction**

## **1.1: Read the CSV data into a Pandas DataFrame**

In [3]:
student_loan_dataframe = pd.read_csv(student_loans_constants.CONSTANT_INPUT_FILE_PATH)

logx.log_write_object(student_loan_dataframe)

## **1.2: Display Student Loan DataFrame**

In [4]:
pandas_processx.return_formatted_table(student_loan_dataframe, 'Table 1.2: Student Loan Table')

payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score,credit_ranking
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5
7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8,5
7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8,5
11.2,0.28,0.56,1.9,0.07,17.0,60.0,1.0,3.16,0.58,9.8,6
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4,5
7.4,0.66,0.0,1.8,0.07,13.0,40.0,1.0,3.51,0.56,9.4,5
7.9,0.6,0.06,1.6,0.07,15.0,59.0,1.0,3.3,0.46,9.4,5
7.3,0.65,0.0,1.2,0.07,15.0,21.0,0.99,3.39,0.47,10.0,7
7.8,0.58,0.02,2.0,0.07,9.0,18.0,1.0,3.36,0.57,9.5,7
7.5,0.5,0.36,6.1,0.07,17.0,102.0,1.0,3.35,0.8,10.5,5


# <br> **Section 2: Preprocessing**

## **2.1: Create the labels series (`y`)  from the “spam” column, and then create the features (`X`) DataFrame from the remaining columns.**

### **Separate the Y Variable, the Labels**

In [5]:
y_series = student_loan_dataframe['credit_ranking']

logx.log_write_object(y_series)

In [6]:
y_series.value_counts()

credit_ranking
5    681
6    638
7    199
4     53
8     18
3     10
Name: count, dtype: int64

### **Separate the X Variable, the Features**

In [7]:
x_dataframe = student_loan_dataframe.drop(columns = 'credit_ranking', axis = 1)

logx.log_write_object(x_dataframe)

In [8]:
pandas_processx.return_formatted_table(x_dataframe, 'Table 2.1: Student Loan Features DataFrame')

payment_history,location_parameter,stem_degree_score,gpa_ranking,alumni_success,study_major_code,time_to_completion,finance_workshop_score,cohort_ranking,total_loan_score,financial_aid_score
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4
7.8,0.88,0.0,2.6,0.1,25.0,67.0,1.0,3.2,0.68,9.8
7.8,0.76,0.04,2.3,0.09,15.0,54.0,1.0,3.26,0.65,9.8
11.2,0.28,0.56,1.9,0.07,17.0,60.0,1.0,3.16,0.58,9.8
7.4,0.7,0.0,1.9,0.08,11.0,34.0,1.0,3.51,0.56,9.4
7.4,0.66,0.0,1.8,0.07,13.0,40.0,1.0,3.51,0.56,9.4
7.9,0.6,0.06,1.6,0.07,15.0,59.0,1.0,3.3,0.46,9.4
7.3,0.65,0.0,1.2,0.07,15.0,21.0,0.99,3.39,0.47,10.0
7.8,0.58,0.02,2.0,0.07,9.0,18.0,1.0,3.36,0.57,9.5
7.5,0.5,0.36,6.1,0.07,17.0,102.0,1.0,3.35,0.8,10.5


## **2.2: Split the Data into Training and Testing Datasets by Using `train_test_split`.**

In [9]:
x_train_dataframe, x_test_dataframe, \
y_train_series, y_test_series \
    = train_test_split \
        (x_dataframe, y_series, 
         random_state = student_loans_constants.CONSTANT_DL_RANDOM_STATE_1)

In [10]:
logx.log_write_object(x_train_dataframe)

logx.log_write_object(x_test_dataframe)

logx.log_write_object(y_train_series)

logx.log_write_object(y_test_series)

## **2.3: Use the StandardScaler to Scale the X Variables**

### **Create a StandardScaler Instance**

In [11]:
current_standard_scalar = StandardScaler()

### **Fit the StandardScaler**

In [12]:
x_standard_scalar = current_standard_scalar.fit(x_train_dataframe)

### **Scale the Data**

In [13]:
x_train_scaled_nparray = x_standard_scalar.transform(x_train_dataframe)

logx.log_write_object(x_train_scaled_nparray)

In [14]:
x_test_scaled_nparray = x_standard_scalar.transform(x_test_dataframe)

logx.log_write_object(x_test_scaled_nparray)

# <br> **Section 3: Compile, Train, Evaluate, and Export the Model**

## **3.1: Compile Model**

### **Model Definition**

In [15]:
number_input_features_integer = len(x_train_scaled_nparray[0])

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of inputs (features) in the model is {:,}.' \
         .format(number_input_features_integer) 
     + '\033[0m')

[1mThe number of inputs (features) in the model is 11.[0m


In [16]:
input_layer_units_integer = 98

input_layer_activation_string = 'tanh'

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the input layer is {:,}.' \
         .format(input_layer_units_integer) 
     + '\033[0m')

[1mThe number of nodes in the input layer is 98.[0m


In [17]:
hidden_layer_units_integer = 45

hidden_layer_activation_string = 'tanh'

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the hidden layer is {:,}.' \
         .format(hidden_layer_units_integer) 
     + '\033[0m')

[1mThe number of nodes in the hidden layer is 45.[0m


In [18]:
output_layer_units_integer = 1

output_layer_activation_string = 'linear'

logx.print_and_log_text \
    ('\033[1m'
     + 'The number of nodes in the output layer is {:,}.' \
         .format(output_layer_units_integer) 
     + '\033[0m')

[1mThe number of nodes in the output layer is 1.[0m


### **Instantiate the Model**

In [19]:
neural_network_sequential_model = tf.keras.models.Sequential()

### **Layers**

In [20]:
neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = input_layer_units_integer, 
          activation = input_layer_activation_string, 
          input_dim = number_input_features_integer))

neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = hidden_layer_units_integer, 
          activation = hidden_layer_activation_string))

neural_network_sequential_model.add \
    (tf.keras.layers.Dense \
         (units = output_layer_units_integer,
          activation = output_layer_activation_string))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


### **Model Summary**

In [21]:
neural_network_sequential_model.summary()

### **Compile**

In [22]:
neural_network_sequential_model.compile \
    (loss = 'mean_squared_error', 
     optimizer = tf.keras.optimizers.Adam(learning_rate = 0.0024876),
     metrics = ['mse'])

## **3.2: Fit and Train Model**

In [23]:
earlystopping_callback \
    = tf.keras.callbacks.EarlyStopping  \
        (monitor = 'val_mse', mode = 'min', patience = 200, restore_best_weights = True)

neural_network_sequential_model \
    .fit \
        (x_train_scaled_nparray,
         y_train_series.values,
         epochs = 10000,
         validation_data = (x_test_scaled_nparray, y_test_series.values),
         callbacks = [earlystopping_callback])

Epoch 1/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 26.7575 - mse: 26.7575 - val_loss: 2.9345 - val_mse: 2.9345
Epoch 2/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 1.4549 - mse: 1.4549 - val_loss: 0.4558 - val_mse: 0.4558
Epoch 3/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5514 - mse: 0.5514 - val_loss: 0.4875 - val_mse: 0.4875
Epoch 4/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.5127 - mse: 0.5127 - val_loss: 0.4122 - val_mse: 0.4122
Epoch 5/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4729 - mse: 0.4729 - val_loss: 0.4022 - val_mse: 0.4022
Epoch 6/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 0.4640 - mse: 0.4640 - val_loss: 0.3948 - val_mse: 0.3948
Epoch 7/10000
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1m

<keras.src.callbacks.history.History at 0x160b35250>

## **3.3: Evaluate Model**

In [24]:
model_loss_float, model_mse_float \
    = neural_network_sequential_model.evaluate(x_test_scaled_nparray, y_test_series.values, verbose = 2)

logx.print_and_log_text \
    (f'\nModel Loss: {round(model_loss_float * 100, 2)}%, '
     + f'Model MSE: {round(model_mse_float * 100, 2)}%')

13/13 - 0s - 1ms/step - loss: 0.3397 - mse: 0.3397

Model Loss: 33.97%, Model MSE: 33.97%


## **3.4: Save and Export Model**

In [25]:
neural_network_sequential_model.save(student_loans_constants.CONSTANT_NN_MODEL_FILE_PATH)

# <br> **Section 4: Predict Loan Repayment Success**

## **4.1: Reload Model**

In [26]:
reloaded_neural_network_sequential_model \
    = tf.keras.models.load_model \
        (student_loans_constants.CONSTANT_NN_MODEL_FILE_PATH)

## **4.2: Predictions**

In [27]:
predictions_nparray \
    = reloaded_neural_network_sequential_model.predict \
        (x_test_scaled_nparray).round().astype('int32')

[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


## **4.3: Compare Predictions and Actual Values**

In [28]:
comparison_dataframe \
    = pd.DataFrame({'predictions': predictions_nparray.ravel(), 'actual': y_test_series.values})

In [29]:
pandas_processx.return_formatted_table \
    (comparison_dataframe, 'Table 4.3: Model Predictions vs. Actual Values')

predictions,actual
6,6
6,6
5,5
5,6
6,6
6,7
5,4
6,5
5,5
5,5


In [30]:
# logx.end_program()