<a href="https://colab.research.google.com/github/lucasfreire01/student_performance/blob/main/student_performance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Summary
I used XGBoost as an upgrade over Random Forest / Decision Tree. The goal of the project is to predict the score each session will have, based on the session's past activity. The classification of students is split into 3 levels: level 3, level 12, and level 22. The final score obtained was 0.66.

In [None]:
# Importing libraries

In [None]:
!pip install xgboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold, GroupKFold

In [None]:
# Loading datasets

In [None]:
# Raw event log used for feature engineering; it is read from an absolute path
# under /content/drive/MyDrive (presumably a mounted Google Drive — confirm the
# drive is mounted before running this cell).
train = pd.read_csv('/content/drive/MyDrive/train.csv')

In [None]:
# NOTE(review): the recorded output of this cell is a FileNotFoundError, and
# `test` is not referenced by any later cell visible in this notebook.
test = pd.read_csv('/content/test.csv')

FileNotFoundError: ignored

In [None]:
# Load the labels and split each `session_id` string on '_':
#   - the first piece is the numeric session identifier,
#   - the last piece is the question tag; its leading character is dropped and
#     the remainder is the question number.
targets = pd.read_csv('/content/train_labels.csv')
_id_parts = targets['session_id'].str.split('_')
targets['session'] = _id_parts.str[0].astype('int64')
targets['q'] = _id_parts.str[-1].str[1:].astype('int64')

In [None]:
targets

In [None]:
# Creating features to improve algorithm effectiveness by adding a new EVENTS class
# This idea is from: https://www.kaggle.com/code/kimtaehun/lightgbm-baseline-with-aggregated-log-data

In [None]:
# Columns summarised by their number of distinct values per group.
CATS = [
    'event_name',
    'text',
    'fqid',
    'room_fqid',
]

# Columns summarised by their mean and standard deviation per group.
NUMS = [
    'elapsed_time',
    'level',
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
]

# `event_name` values that are counted per group.
EVENTS = [
    'navigate_click',
    'person_click',
    'cutscene_click',
    'object_click',
    'map_hover',
    'notification_click',
    'map_click',
    'observation_click',
    'checkpoint',
]

In [None]:
def feature_engineer(train, cats=None, nums=None, events=None):
  """Aggregate raw event rows into one feature row per (session_id, level_group).

  Args:
    train: DataFrame of event rows. It must contain 'session_id',
      'level_group', 'event_name', 'elapsed_time' and every column named in
      `cats` and `nums`. The frame is NOT modified by this function.
    cats: columns summarised with `nunique`. Defaults to the module-level CATS.
    nums: columns summarised with `mean` and `std`. Defaults to the
      module-level NUMS.
    events: `event_name` values whose occurrences are counted per group.
      Defaults to the module-level EVENTS.

  Returns:
    DataFrame indexed by 'session_id' whose columns are 'level_group' followed
    by '<col>_nunique', '<col>_mean', '<col>_std', '<event>_sum' and
    'elapsed_time_sum'. Missing feature values (for example the standard
    deviation of a single-row group) are replaced by -1.
  """
  cats = CATS if cats is None else cats
  nums = NUMS if nums is None else nums
  events = EVENTS if events is None else events

  keys = ['session_id', 'level_group']
  # Build the grouping once and reuse it for every aggregated column.
  grouped = train.groupby(keys)

  dfs = []
  for c in cats:
    dfs.append(grouped[c].agg('nunique').rename(c + '_nunique'))
  for c in nums:
    dfs.append(grouped[c].agg('mean').rename(c + '_mean'))
  for c in nums:
    dfs.append(grouped[c].agg('std').rename(c + '_std'))

  # Event indicators live in their own frame so the caller's DataFrame does not
  # gain extra columns as a side effect. Grouping by the key Series of `train`
  # (same index) yields the same (session_id, level_group) index as `grouped`.
  event_flags = pd.DataFrame(
      {c: (train.event_name == c).astype('int8') for c in events},
      index=train.index)
  flag_groups = event_flags.groupby([train[k] for k in keys])
  for c in events:
    dfs.append(flag_groups[c].agg('sum').rename(c + '_sum'))
  dfs.append(grouped['elapsed_time'].agg('sum').rename('elapsed_time_sum'))

  df = pd.concat(dfs,axis=1)
  # Fill before reset_index so only feature columns are touched, never the keys.
  df = df.fillna(-1)
  df = df.reset_index()
  df = df.set_index('session_id')
  return df

In [None]:
# Aggregate the raw event rows of `train` into the per-(session_id, level_group)
# feature table used for training below.
df = feature_engineer(train)

In [None]:
# Sanity check: size of the feature table and a preview of its first rows.
print(df.shape)
df.head()

In [None]:
# A separate model is trained for each of the 18 questions. This gives the following mapping:
#   level_group = 0-4   -> used to train the models for questions 1-3
#   level_group = 5-12  -> used to train the models for questions 4-13
#   level_group = 13-22 -> used to train the models for questions 14-18

# This mapping is based on how test inference works; I just imported the parameters to train the model.

In [None]:
# Model inputs: every column of the feature table except the level_group label.
FEATURES = [col for col in df.columns if col != 'level_group']
# Distinct session ids (the index of the feature table).
ALL_USERS = df.index.unique()

print('We will try with', len(FEATURES), 'features')
print('We will try with', len(ALL_USERS), 'users')

In [None]:
# Cross-validated training with 5-fold GroupKFold. The groups are the index of
# `df` (session_id), so every row of a session lands in the same fold.
gkf = GroupKFold(n_splits=5)
# Out-of-fold positive-class probabilities: one row per session, one column per
# question (column t-1 holds question t). Cells that never get a prediction
# keep their initial value of 0.
oof = pd.DataFrame(data=np.zeros((len(ALL_USERS),18)), index=ALL_USERS)
# Fitted classifiers keyed by '<level_group>_<question>'. The key does not
# contain the fold number, so each fold overwrites the previous entry and only
# the classifiers of the last fold remain once the loop finishes.
models = {}

# Loop-invariant: the same hyper-parameters are used for every fold and question.
xgb_params = {
    'objective' : 'binary:logistic',
    'eval_metric':'logloss',
    'learning_rate': 0.05,
    'max_depth': 4,
    'n_estimators': 1000,
    'early_stopping_rounds': 50,
    'tree_method':'hist',
    'subsample':0.8,
    'colsample_bytree': 0.4,
    'use_label_encoder' : False}

# Loop-invariant: labels of each question indexed by session. Built once here
# instead of twice for every (fold, question) pair.
labels_by_question = {
    t: targets.loc[targets.q == t].set_index('session') for t in range(1, 19)
}

for i, (train_index, test_index) in enumerate(gkf.split(X=df, groups=df.index)):
    print('#'*25)
    print('### Fold',i+1)
    print('#'*25)

    # Row selection depends only on the fold, not on the question.
    fold_train = df.iloc[train_index]
    fold_valid = df.iloc[test_index]

    for t in range(1,19):
        # Questions 1-3 use the '0-4' rows, 4-13 the '5-12' rows and the
        # remaining questions (14-18) the '13-22' rows.
        if t <= 3:
            grp = '0-4'
        elif t <= 13:
            grp = '5-12'
        else:
            grp = '13-22'

        train_x = fold_train.loc[fold_train.level_group == grp]
        train_users = train_x.index.values
        train_y = labels_by_question[t].loc[train_users]

        valid_x = fold_valid.loc[fold_valid.level_group == grp]
        valid_users = valid_x.index.values
        valid_y = labels_by_question[t].loc[valid_users]

        # NOTE(review): the validation fold is used both as the early-stopping
        # eval_set and for the out-of-fold prediction below, so the OOF score
        # may be somewhat optimistic.
        clf =  XGBClassifier(**xgb_params)
        clf.fit(train_x[FEATURES].astype('float32'), train_y['correct'],
                eval_set=[ (valid_x[FEATURES].astype('float32'), valid_y['correct']) ],
                verbose=0)

        models[f'{grp}_{t}'] = clf
        oof.loc[valid_users, t-1] = clf.predict_proba(valid_x[FEATURES].astype('float32'))[:,1]

    print()

In [None]:
# I needed to convert the predictions into a binary matrix of 1s and 0s. The metric used is the F1 score, which is the harmonic mean of precision and recall.
# We look for the ideal threshold: predict 1 when p > threshold and 0 otherwise, choosing the threshold that maximizes the F1 score.

In [None]:
# Ground-truth matrix with the same index and 18 columns as `oof`:
# column (question - 1) receives the `correct` labels of that question,
# ordered like ALL_USERS.
true = oof.copy()
for question in range(1, 19):
    labels = targets.loc[targets.q == question].set_index('session').loc[ALL_USERS]
    true[question - 1] = labels.correct.values

In [None]:
# Sweep candidate thresholds and keep the one with the highest macro F1
# computed over all questions and sessions at once.
score = []; thresholds = []
best_score = 0; best_thresholds = 0

# The flattened label/probability vectors do not change between thresholds.
flat_true = true.values.reshape((-1))
flat_oof = oof.values.reshape((-1))

for threshold in np.arange(0.4,0.81,0.01):
    print(f'{threshold:.02f}, ',end=' ')
    preds = (flat_oof > threshold).astype('int')
    m = f1_score(flat_true, preds, average='macro')
    thresholds.append(threshold)
    score.append(m)
    # Strict comparison: on ties the earliest (lowest) threshold is kept.
    if m > best_score:
        best_score, best_thresholds = m, threshold

In [None]:
# Plot the F1 score obtained at each candidate threshold and highlight the best one
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(20,5))
ax.plot(thresholds, score, '-o', color='blue')
# Large marker on the best (threshold, score) pair.
ax.scatter([best_thresholds], [best_score], color='blue', s=300, alpha=1)
ax.set_xlabel('Threshold', size=14)
ax.set_ylabel('Validation F1 Score', size=14)
ax.set_title(f'Threshold vs. F1_Score with Best F1_Score = {best_score:.3f} at Best Threshold = {best_thresholds:.3f}', size=18)
plt.show()

In [None]:
# Macro F1 of each question at the best global threshold.
# Column k of `true`/`oof` holds question k+1, so the label is printed 1-based
# to match the question numbering used by `targets.q` and the training loop.
print('When we using optimal threshold...')
for k in range(18):
  m = f1_score(true[k].values, (oof[k].values>best_thresholds).astype('int'), average='macro')
  print(f'Q{k+1}: F1 = ',m)

# Macro F1 over all questions and sessions flattened together
m = f1_score(true.values.reshape((-1)), (oof.values.reshape((-1)) >best_thresholds).astype('int'), average='macro')
print('Overall F1 ==> ', m)

In [None]:
# NOTE(review): this rebinds `df`, which until here held the engineered feature
# table returned by feature_engineer(train). Re-running the training cells
# after this one would operate on the sample submission instead.
df = pd.read_csv('/content/sample_submission.csv')
print(df.shape)
df.head()

In [None]:
# Mean of the `correct` column of the sample submission loaded into `df`.
print(df.correct.mean())