In [1]:
import pickle
import pandas as pd

In [None]:
MODEL_PATH                       = 'model/cat_base_optuna.pkl'
INPUT_CSV                        = 'data/data_student.csv'
OUTPUT_FULL_PRED_CSV             = 'data/data_student_predict.csv'
OUTPUT_ENROLLED_STATUS_CSV       = 'data/data_student_predict_enrolled_status.csv'
DELIMITER                        = ';'

In [3]:
with open(MODEL_PATH, 'rb') as f:
    model = pickle.load(f)

In [4]:
df = pd.read_csv(INPUT_CSV, sep=DELIMITER)

In [5]:
X = df.drop(columns=['Status'], errors='ignore')
probas = model.predict_proba(X)

In [6]:

label_order = ['Dropout', 'Graduate', 'Enrolled']
mapping     = {code: label for code, label in zip(model.classes_, label_order)}

In [7]:
for idx, code in enumerate(model.classes_):
    df[f'Prob_{mapping[code]}'] = probas[:, idx]

In [8]:
prob_cols = [f'Prob_{mapping[c]}' for c in model.classes_]
df['Predicted_Status'] = df[prob_cols].idxmax(axis=1).str.replace('Prob_', '')

In [9]:
mask = df['Status'] == 'Enrolled'
sum_prob = df.loc[mask, 'Prob_Dropout'] + df.loc[mask, 'Prob_Graduate']

df.loc[mask, 'Future_Prob_Dropout']  = df.loc[mask, 'Prob_Dropout']  / sum_prob
df.loc[mask, 'Future_Prob_Graduate'] = df.loc[mask, 'Prob_Graduate'] / sum_prob

df.loc[mask, 'Future_Prediction'] = (
    df.loc[mask, ['Future_Prob_Dropout','Future_Prob_Graduate']]
      .apply(lambda x: 'Graduate' if x['Future_Prob_Graduate'] > x['Future_Prob_Dropout'] else 'Dropout', axis=1)
)

In [10]:
df.loc[~mask, ['Future_Prob_Dropout','Future_Prob_Graduate','Future_Prediction']] = pd.NA

In [11]:
df.to_csv(OUTPUT_FULL_PRED_CSV, sep=DELIMITER, index=False)

In [12]:
df_enrolled = df[df['Status'] == 'Enrolled'].copy()
df_enrolled.to_csv(OUTPUT_ENROLLED_STATUS_CSV, sep=DELIMITER, index=False)

In [13]:
df.shape, df_enrolled.shape

((4424, 44), (794, 44))

In [15]:
df['Status'].value_counts()

Status
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64

In [16]:
df_enrolled['Status'].value_counts()

Status
Enrolled    794
Name: count, dtype: int64

In [17]:
df['Predicted_Status'].value_counts()

Predicted_Status
Enrolled    3379
Dropout     1045
Name: count, dtype: int64

In [19]:
df_enrolled['Future_Prediction'].value_counts()

Future_Prediction
Dropout     438
Graduate    356
Name: count, dtype: int64