In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.decomposition import PCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, roc_curve, roc_auc_score, confusion_matrix

# Imbalance
from imblearn.over_sampling import RandomOverSampler, SMOTEN

### 1. Loading dataset.

In [2]:
data = pd.read_parquet("./cleaned_data/grade_summary.parquet")

In [3]:
df = data.copy()

In [4]:
df.band.unique()

['GOOD', 'EXCELLENT', 'LOW', 'VERY LOW', 'MEDIUM']
Categories (5, object): ['VERY LOW' < 'LOW' < 'MEDIUM' < 'GOOD' < 'EXCELLENT']

In [5]:
df.loc[df['band'] == 'LOW', 'band'] = 'MEDIUM'

In [6]:
df.loc[df['band'] == 'VERY LOW', 'band'] = 'LOW'

In [7]:
df['band'] = df['band'].cat.add_categories('HIGH')

In [8]:
df.loc[df['band'].isin(['GOOD', 'EXCELLENT']), 'band'] = 'HIGH'

In [9]:
df['band'] = df['band'].cat.remove_unused_categories()

In [10]:
df.isna().sum().sum()

np.int64(0)

In [11]:
df.band.value_counts()

band
MEDIUM    274
HIGH      170
LOW       122
Name: count, dtype: int64

In [12]:
# Remove very low
remove_very_low = df.loc[df['band'] == 'VERY LOW']

In [13]:
df.drop(index=remove_very_low.index, inplace=True)

In [14]:
curriculars = ['lect', 'esp', 'ingl', 'mat', 'qui', 'econ', 'poli']

In [15]:
not_curriculars = ['edufi', 'ere', 'tecn', 'compo']

In [16]:
df = df[curriculars + not_curriculars + ['band']]

The classes in this dataset are imbalanced, being largest in `LOW`.

## 2. Baseline models.

In [17]:
# Splitting data
X = df.drop(columns=['band']).astype(int)
y = df['band'].to_numpy().ravel()

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [19]:
pd.DataFrame(np.unique(y_test, return_counts=True)).T

Unnamed: 0,0,1
0,HIGH,53
1,LOW,42
2,MEDIUM,75


In [20]:
labels = np.unique(y_test)[:]

In [21]:
params_lr = [
    # For `liblinear` solver, which handles both L1 and L2 penalties.
    {
        'penalty' : ['l1', 'l2'],
        'C' : [0.01, 0.1, 1, 10, 100],
        'solver' : ['liblinear']
    },
    # Solver that support Elastic-Net (saga).
    {
        'penalty': ['elasticnet'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['saga'],
        'l1_ratio': [0.25, 0.5, 0.75]       
    },
    # Solver that only support L2 penalty.
    {
        'penalty': ['l2'],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['lbfgs', 'newton-cg', 'sag']        
    }
]

params_rf = {
        'n_estimators' : [50, 100, 200, 300, 500],
        'criterion' : ['gini', 'log_loss', 'entropy'],
        'class_weight' : ['balanced', 'balanced_subsample']
    }

In [22]:
np.unique(y_train, return_counts=True)

(array(['HIGH', 'LOW', 'MEDIUM'], dtype=object), array([117,  80, 199]))

In [23]:
# Fundamental Logistic Regression.
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

In [24]:
print(classification_report(y_test, y_pred_lr, labels=labels))

              precision    recall  f1-score   support

        HIGH       0.92      0.89      0.90        53
         LOW       0.79      0.64      0.71        42
      MEDIUM       0.75      0.85      0.80        75

    accuracy                           0.81       170
   macro avg       0.82      0.79      0.80       170
weighted avg       0.82      0.81      0.81       170



In [25]:
cm_lr = confusion_matrix(y_test, y_pred_lr)
print(cm_lr)

[[47  0  6]
 [ 0 27 15]
 [ 4  7 64]]


In [26]:
# Fundamental Random Forest.
rf = RandomForestClassifier(class_weight='balanced')

In [27]:
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

In [28]:
print(f"Accuracy score: {accuracy_score(y_test, y_pred_rf):.2f}")
cm_rf = confusion_matrix(y_test, y_pred_rf)
print(cm_rf)

Accuracy score: 0.79
[[47  1  5]
 [ 0 23 19]
 [ 3  7 65]]


In [29]:
print(classification_report(y_test, y_pred_rf, labels=labels))

              precision    recall  f1-score   support

        HIGH       0.94      0.89      0.91        53
         LOW       0.74      0.55      0.63        42
      MEDIUM       0.73      0.87      0.79        75

    accuracy                           0.79       170
   macro avg       0.80      0.77      0.78       170
weighted avg       0.80      0.79      0.79       170



### 2.1 GridSearchCV

In [30]:
lr_gcv = GridSearchCV(
    estimator=lr,
    param_grid=params_lr
)

In [31]:
rf = RandomForestClassifier(class_weight='balanced')

In [32]:
rf_gcv = GridSearchCV(
    estimator=rf,
    param_grid=params_rf
)

In [33]:
lr_gcv.fit(X_train, y_train)

In [34]:
lr_gcv.best_estimator_

In [35]:
lr_gcv.best_params_

{'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}

In [36]:
lr_gcv.best_score_

np.float64(0.780253164556962)

In [37]:
rf_gcv.fit(X_train, y_train)

In [38]:
rf_gcv.best_estimator_

In [39]:
rf_gcv.best_params_

{'class_weight': 'balanced', 'criterion': 'gini', 'n_estimators': 100}

In [40]:
rf_gcv.best_score_

np.float64(0.7726898734177214)

#### 2.1.1 `LogisticRegression`

In [41]:
lr_tuned = GridSearchCV(
    estimator=LogisticRegression(max_iter=2000),
    param_grid=params_lr
)

In [42]:
X_train = X_train.astype(int)

In [43]:
smt = SMOTEN(random_state=42)
X_res, y_res = smt.fit_resample(X_train, y_train)

In [44]:
lr_tuned.fit(X_res, y_res)

In [45]:
lr_tuned.best_params_

{'C': 0.1, 'l1_ratio': 0.5, 'penalty': 'elasticnet', 'solver': 'saga'}

In [46]:
y_pred_lr_tuned = lr_tuned.predict(X_test)

In [47]:
print(classification_report(y_test, y_pred_lr_tuned, labels=labels))

              precision    recall  f1-score   support

        HIGH       0.81      0.89      0.85        53
         LOW       0.68      0.64      0.66        42
      MEDIUM       0.71      0.68      0.69        75

    accuracy                           0.74       170
   macro avg       0.73      0.74      0.73       170
weighted avg       0.73      0.74      0.73       170



#### 2.1.2 `RandomForest`

In [48]:
rf_gcv

In [49]:
rf_gcv.fit(X_res, y_res)

In [50]:
rf_gcv.best_params_

{'class_weight': 'balanced', 'criterion': 'entropy', 'n_estimators': 100}

In [51]:
y_pred_rf_smote = rf_gcv.predict(X_test)

In [52]:
np.unique(y_test, return_counts=True)

(array(['HIGH', 'LOW', 'MEDIUM'], dtype=object), array([53, 42, 75]))

In [53]:
np.unique(y_res, return_counts=True)

(array(['HIGH', 'LOW', 'MEDIUM'], dtype=object), array([199, 199, 199]))

In [54]:
print(classification_report(y_test, y_pred_rf, labels=labels))

              precision    recall  f1-score   support

        HIGH       0.94      0.89      0.91        53
         LOW       0.74      0.55      0.63        42
      MEDIUM       0.73      0.87      0.79        75

    accuracy                           0.79       170
   macro avg       0.80      0.77      0.78       170
weighted avg       0.80      0.79      0.79       170



In [55]:
print(classification_report(y_test, y_pred_rf_smote, labels=labels))

              precision    recall  f1-score   support

        HIGH       0.85      0.85      0.85        53
         LOW       0.74      0.62      0.68        42
      MEDIUM       0.72      0.79      0.75        75

    accuracy                           0.76       170
   macro avg       0.77      0.75      0.76       170
weighted avg       0.77      0.76      0.76       170



``` Options to consider ```

1. Robust evaluation using cross validation and balanced accuracy. --> DID NOT WORK
2. Change `performance` or every column and re-scale to normalized 0-1 values in order to apply SMOTE-techniques. --> DID NOT WORK
3. Use of ensemble methods in Logistic Regression. --> DID NOT WORK
4. Use of PCA to reduce dimensionality. --> DID NOT WORK