In [1]:
# Import dependencies.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
# Read in CSV file.
df = pd.read_csv("../resources/cleaned_data.csv")

In [4]:
# Recheck data types.
df.dtypes

id                           int64
program                      int64
previous_college             int64
ethnic_description           int64
gender                       int64
hs_ged                       int64
attendance_percentage      float64
gpa                        float64
default_status               int64
years_between_education    float64
age_at_grad                float64
dtype: object

In [5]:
# Determine the independent and dependent variables.

y = df["default_status"]
X = df.drop(columns=["default_status", "id"])

In [6]:
# Split into a training and testing set.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_train.shape

(305, 9)

In [7]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scale the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [8]:
# Create a random forest classifier.

rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [9]:
# Fit the model.

rf_model = rf_model.fit(X_train, y_train)

In [10]:
# Make predictions using the testing data.

predictions = rf_model.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
201,1,0
323,1,0
178,1,0
54,1,0
341,1,0
...,...,...
24,1,0
184,1,0
316,1,0
128,1,0


In [11]:
# Assess the performance.

# Create a DataFrame from the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


accuracy_score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

In [12]:
print("Confusion Matrix")
display (matrix_df)

print(f"Accuracy Score : {accuracy_score}")

print("Classification Report")
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,13,71
Actual 1,2,16


Accuracy Score : 0.28431372549019607
Classification Report
              precision    recall  f1-score   support

           0       0.87      0.15      0.26        84
           1       0.18      0.89      0.30        18

    accuracy                           0.28       102
   macro avg       0.53      0.52      0.28       102
weighted avg       0.75      0.28      0.27       102



In [13]:
# Calculate feature importance.

importances = rf_model.feature_importances_
importances

array([0.04149024, 0.02743513, 0.08976192, 0.03028136, 0.02343883,
       0.19479941, 0.20930338, 0.19244011, 0.19104963])

In [14]:
# Sort by feature importance.

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.2093033798770197, 'gpa'),
 (0.19479941420421917, 'attendance_percentage'),
 (0.19244010832004316, 'years_between_education'),
 (0.19104962843609255, 'age_at_grad'),
 (0.08976191745292456, 'ethnic_description'),
 (0.041490235513844294, 'program'),
 (0.030281358352284907, 'gender'),
 (0.027435129092093873, 'previous_college'),
 (0.023438828751477844, 'hs_ged')]

In [None]:
# Rerun the model taking out ethnic_description, program, gender, previous_college, hs_ged.