In [1]:
# Import dependencies.
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the cleaned dataset from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer="cleaned_data.csv")

In [3]:
# Recheck data types: display the dtype of every column in df. The model cells below
# pass these columns straight to scikit-learn, so all of them should be numeric.
df.dtypes

id                           int64
program                      int64
previous_college             int64
ethnic_description           int64
gender                       int64
hs_ged                       int64
hours_attended             float64
hours_scheduled            float64
attendance_percentage      float64
gpa                        float64
default_status               int64
years_between_education    float64
age_at_grad                float64
dtype: object

In [6]:
# Separate the feature matrix from the label to be predicted.

# Features: every column except the target and the "id" column.
X = df.drop(["default_status", "id"], axis="columns")
# Target: the "default_status" column.
y = df["default_status"]

In [7]:
# Partition the data into training and testing sets.

# stratify=y asks for the split to follow the class labels in y; random_state=1 makes
# the shuffle repeatable. No test_size is passed, so the library default is used.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

# Show (rows, columns) of the training features.
X_train.shape

(308, 11)

In [8]:
# Standardize the features with statistics learned from the training set only.
scaler = StandardScaler()

# fit() is called on X_train alone, so the test rows do not influence the scaling.
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to both partitions (training first, then testing).
X_train_scaled, X_test_scaled = (
    X_scaler.transform(features) for features in (X_train, X_test)
)

In [9]:
# Configure the logistic regression estimator (it is not fitted in this cell).

classifier = LogisticRegression(
    random_state=1,
    max_iter=2000,
    solver="lbfgs",
)
# Echo the estimator so its configuration shows in the cell output.
classifier

LogisticRegression(max_iter=2000, random_state=1)

In [10]:
# Train the logistic regression model.
classifier.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=1)

In [11]:
# Validate the logistic regression model.

predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
363,0,0
177,0,0
316,0,0
45,0,0
163,0,0
...,...,...
235,0,0
241,0,0
211,0,1
151,0,0


In [12]:
# Assess the performance.

# Create a DataFrame from the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


accuracy_score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

In [13]:
print("Confusion Matrix")
display (matrix_df)

print(f"Accuracy Score : {accuracy_score}")

print("Classification Report")
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,82,3
Actual 1,17,1


Accuracy Score : 0.8058252427184466
Classification Report
              precision    recall  f1-score   support

           0       0.83      0.96      0.89        85
           1       0.25      0.06      0.09        18

    accuracy                           0.81       103
   macro avg       0.54      0.51      0.49       103
weighted avg       0.73      0.81      0.75       103

