In [1]:
# Import dependencies.
import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read in CSV file.
df = pd.read_csv("cleaned_data.csv")

In [3]:
# Recheck data types.
df.dtypes

id                           int64
program                      int64
previous_college             int64
ethnic_description           int64
gender                       int64
hs_ged                       int64
hours_attended             float64
hours_scheduled            float64
attendance_percentage      float64
gpa                        float64
default_status               int64
years_between_education    float64
age_at_grad                float64
dtype: object

In [4]:
# Determine the independent and dependent variables.

y = df["default_status"]
X = df.drop(columns=["default_status", "id"])

In [5]:
# Split into a training and testing set.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_train.shape

(308, 11)

In [6]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scale the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a random forest classifier.

rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [8]:
# Fit the model.

rf_model = rf_model.fit(X_train, y_train)

In [9]:
# Make predictions using the testing data.

predictions = rf_model.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
363,1,0
177,1,0
316,1,0
45,1,0
163,1,0
...,...,...
235,1,0
241,1,0
211,1,1
151,1,0


In [10]:
# Assess the performance.

# Create a DataFrame from the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


accuracy_score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
print("Confusion Matrix")
display (matrix_df)

print(f"Accuracy Score : {accuracy_score}")

print("Classification Report")
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,0,85
Actual 1,0,18


Accuracy Score : 0.17475728155339806
Classification Report
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        85
           1       0.17      1.00      0.30        18

    accuracy                           0.17       103
   macro avg       0.09      0.50      0.15       103
weighted avg       0.03      0.17      0.05       103



In [12]:
# Calculate feature importance.

importances = rf_model.feature_importances_
importances

array([0.01116018, 0.02669706, 0.0835315 , 0.03530128, 0.01521203,
       0.15956751, 0.01936901, 0.14638987, 0.16826798, 0.17117788,
       0.16332569])

In [13]:
# Sort by feature importance.

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.17117787583782748, 'years_between_education'),
 (0.1682679835340459, 'gpa'),
 (0.1633256873125593, 'age_at_grad'),
 (0.159567514102367, 'hours_attended'),
 (0.1463898694651249, 'attendance_percentage'),
 (0.08353150404928472, 'ethnic_description'),
 (0.03530128400808956, 'gender'),
 (0.026697062202820046, 'previous_college'),
 (0.019369013485421, 'hours_scheduled'),
 (0.015212028001765767, 'hs_ged'),
 (0.011160178000694276, 'program')]

In [None]:
# Rerun the model taking out hours_attended, ethnic_description, gender, previous_college, hours_scheduled,
## hs_ged, program.