In [1]:
# Import dependencies.
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
# Read in CSV file.
df = pd.read_csv("../Resources/cleaned_data.csv")

In [3]:
# Recheck data types.
df.dtypes

id                           int64
program                      int64
previous_college             int64
ethnic_description           int64
gender                       int64
hs_ged                       int64
attendance_percentage      float64
gpa                        float64
default_status               int64
years_between_education    float64
age_at_grad                float64
dtype: object

In [4]:
# Determine the independent and dependent variables.

y = df["default_status"]
X = df.drop(columns=["default_status", "id", "ethnic_description", "program", "gender",
                     "previous_college", "hs_ged", "age_at_grad", "years_between_education",])

In [5]:
# Split into a training and testing set.

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

X_train.shape

(305, 2)

In [6]:
# Create a StandardScaler instance.
scaler = StandardScaler()

# Fit the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scale the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [7]:
# Create a random forest classifier.

rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [8]:
# Fit the model.

rf_model = rf_model.fit(X_train, y_train)

In [9]:
# Make predictions using the testing data.

predictions = rf_model.predict(X_test_scaled)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
201,1,0
323,1,0
178,1,0
54,1,0
341,1,0
...,...,...
24,1,0
184,1,0
316,1,0
128,1,0


In [10]:
# Assess the performance.

# Create a DataFrame from the confusion matrix.
matrix = confusion_matrix(y_test, predictions)
matrix_df = pd.DataFrame(
    matrix, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


accuracy_score = accuracy_score(y_test, predictions)
report = classification_report(y_test, predictions)

In [11]:
print("Confusion Matrix")
display (matrix_df)

print(f"Accuracy Score : {accuracy_score}")

print("Classification Report")
print(report)

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,18,66
Actual 1,2,16


Accuracy Score : 0.3333333333333333
Classification Report
              precision    recall  f1-score   support

           0       0.90      0.21      0.35        84
           1       0.20      0.89      0.32        18

    accuracy                           0.33       102
   macro avg       0.55      0.55      0.33       102
weighted avg       0.78      0.33      0.34       102



In [12]:
# Calculate feature importance.

importances = rf_model.feature_importances_
importances

array([0.59020007, 0.40979993])

In [13]:
# Sort by feature importance.

sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5902000701496927, 'attendance_percentage'), (0.40979992985030733, 'gpa')]

## Next steps
Review confusion matrix, accurracy score, and classification report to determine appropriate next stpes.