# Baseline Model

Objectives:
- Develop an initial baseline ML model to serve as a benchmark. 
- Use the competition-provided example notebook as a starting point. 

In [1]:
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

### Exploratory Data Analysis

In [2]:
# Reference: https://www.kaggle.com/competitions/predict-student-performance-from-game-play/discussion/384359
dtypes={
    'elapsed_time':np.int32,
    'event_name':'category',
    'name':'category',
    'level':np.uint8,
    'room_coor_x':np.float32,
    'room_coor_y':np.float32,
    'screen_coor_x':np.float32,
    'screen_coor_y':np.float32,
    'hover_duration':np.float32,
    'text':'category',
    'fqid':'category',
    'room_fqid':'category',
    'text_fqid':'category',
    'fullscreen':'category',
    'hq':'category',
    'music':'category',
    'level_group':'category'}

dataset_df = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train.csv', dtype=dtypes)
print("Full train dataset shape is {}".format(dataset_df.shape))

Full train dataset shape is (26296946, 20)


In [3]:
# What fraction are nulls?
dataset_df.isnull().mean()

session_id        0.000000
index             0.000000
elapsed_time      0.000000
event_name        0.000000
name              0.000000
level             0.000000
page              0.978532
room_coor_x       0.078841
room_coor_y       0.078841
screen_coor_x     0.078841
screen_coor_y     0.078841
hover_duration    0.923860
text              0.634287
fqid              0.314653
room_fqid         0.000000
text_fqid         0.634283
fullscreen        0.000000
hq                0.000000
music             0.000000
level_group       0.000000
dtype: float64

In [4]:
labels = pd.read_csv('/kaggle/input/predict-student-performance-from-game-play/train_labels.csv')
labels['session'] = labels.session_id.apply(lambda x: int(x.split('_')[0]))
labels['q'] = labels.session_id.apply(lambda x: int(x.split('_')[-1][1:]))

In [5]:
# Inspect
labels.head(5)

Unnamed: 0,session_id,correct,session,q
0,20090312431273200_q1,1,20090312431273200,1
1,20090312433251036_q1,0,20090312433251036,1
2,20090312455206810_q1,1,20090312455206810,1
3,20090313091715820_q1,0,20090313091715820,1
4,20090313571836404_q1,1,20090313571836404,1


In [6]:
# Total # of questions
labels['q'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18])

**Objective:** Train models (maybe one for each question?) to predict the `correct` label (1 or 0) for each session.

In [7]:
CATEGORICAL = ['event_name', 'name', 'fqid', 'room_fqid', 'text_fqid']
NUMERICAL = ['elapsed_time', 'level', 'page', 'room_coor_x', 'room_coor_y',
             'screen_coor_x', 'screen_coor_y', 'hover_duration']

In [8]:
# Reference: https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

def feature_engineer(dataset_df):
    dfs = []
    for c in CATEGORICAL:
        tmp = dataset_df.groupby(['session_id', 'level_group'])[
            c].agg('nunique')
        tmp.name = tmp.name + '_nunique'
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id', 'level_group'])[c].agg('mean')
        dfs.append(tmp)
    for c in NUMERICAL:
        tmp = dataset_df.groupby(['session_id', 'level_group'])[c].agg('std')
        tmp.name = tmp.name + '_std'
        dfs.append(tmp)
    dataset_df = pd.concat(dfs, axis=1)
    dataset_df = dataset_df.fillna(-1)
    dataset_df = dataset_df.reset_index()
    dataset_df = dataset_df.set_index('session_id')
    return dataset_df

In [9]:
df = feature_engineer(dataset_df)
print("Full prepared dataset shape: {}".format(df.shape))

Full prepared dataset shape: (70686, 22)


In [10]:
# Inspect
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70686 entries, 20090312431273200 to 22100221145014656
Data columns (total 22 columns):
 #   Column              Non-Null Count  Dtype   
---  ------              --------------  -----   
 0   level_group         70686 non-null  category
 1   event_name_nunique  70686 non-null  int64   
 2   name_nunique        70686 non-null  int64   
 3   fqid_nunique        70686 non-null  int64   
 4   room_fqid_nunique   70686 non-null  int64   
 5   text_fqid_nunique   70686 non-null  int64   
 6   elapsed_time        70686 non-null  float64 
 7   level               70686 non-null  float64 
 8   page                70686 non-null  float64 
 9   room_coor_x         70686 non-null  float32 
 10  room_coor_y         70686 non-null  float32 
 11  screen_coor_x       70686 non-null  float32 
 12  screen_coor_y       70686 non-null  float32 
 13  hover_duration      70686 non-null  float32 
 14  elapsed_time_std    70686 non-null  float64 
 15  level_st

In [11]:
def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]


train_x, valid_x = split_dataset(df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

56547 examples in training, 14139 examples in testing.


In [12]:
# Fetch the unique list of user sessions in the validation dataset. We assigned
# `session_id` as the index of our feature engineered dataset. Hence fetching
# the unique values in the index column will give us a list of users in the
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is:
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s.
prediction_df = pd.DataFrame(data=np.zeros(
    (len(VALID_USER_LIST), 18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict = {}

In [13]:
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1, 19):

    # Select level group for the question based on the q_no.
    if q_no <= 3:
        grp = '0-4'
    elif q_no <= 13:
        grp = '5-12'
    elif q_no <= 22:
        grp = '13-22'
    print("### q_no", q_no, "grp", grp)

    # Filter the rows in the datasets based on the selected level group.
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q == q_no].set_index(
        'session').loc[train_users]
    valid_labels = labels.loc[labels.q == q_no].set_index(
        'session').loc[valid_users]

    # Add the label to the filtered datasets.
    # train_df["correct"] = train_labels["correct"]
    # valid_df["correct"] = valid_labels["correct"]

    # Model pipeline
    features = list(train_df.columns)
    cat_features = ['level_group']
    num_features = [f for f in features if f not in cat_features]

    ohe = ColumnTransformer([
        ('ohe_features', OneHotEncoder(), cat_features),
        ('scaled_num', StandardScaler(), num_features)
    ])

    pipe = Pipeline([('ohe', ohe),
                    ('est', RandomForestClassifier())])

    # Train
    model = pipe.fit(train_df, train_labels['correct'])

    # Store the model
    models[f'{grp}_{q_no}'] = model

    # Evaluate the trained model on the validation dataset and store the
    # evaluation accuracy in the `evaluation_dict`.
    evaluation_dict[q_no] = accuracy_score(
        valid_labels['correct'], model.predict(valid_df))

    # Use the trained model to make predictions on the validation dataset and
    # store the predicted values in the `prediction_df` dataframe.
    predict = model.predict(valid_df)
    prediction_df.loc[valid_users, q_no-1] = predict.flatten()

    #

### q_no 1 grp 0-4
### q_no 2 grp 0-4
### q_no 3 grp 0-4
### q_no 4 grp 5-12
### q_no 5 grp 5-12
### q_no 6 grp 5-12
### q_no 7 grp 5-12
### q_no 8 grp 5-12
### q_no 9 grp 5-12
### q_no 10 grp 5-12
### q_no 11 grp 5-12
### q_no 12 grp 5-12
### q_no 13 grp 5-12
### q_no 14 grp 13-22
### q_no 15 grp 13-22
### q_no 16 grp 13-22
### q_no 17 grp 13-22
### q_no 18 grp 13-22


### Inspect Accuracy of Individual Models

In [14]:
for name, value in evaluation_dict.items():
    print(f"question {name}: accuracy {value:.4f}")

print("\nAverage accuracy", sum(evaluation_dict.values())/18)

question 1: accuracy 0.7305
question 2: accuracy 0.9756
question 3: accuracy 0.9353
question 4: accuracy 0.7938
question 5: accuracy 0.6132
question 6: accuracy 0.7885
question 7: accuracy 0.7481
question 8: accuracy 0.6274
question 9: accuracy 0.7660
question 10: accuracy 0.5916
question 11: accuracy 0.6510
question 12: accuracy 0.8699
question 13: accuracy 0.7182
question 14: accuracy 0.7316
question 15: accuracy 0.5952
question 16: accuracy 0.7450
question 17: accuracy 0.6964
question 18: accuracy 0.9514

Average accuracy 0.7515854492302614


### Generate Submission

In [15]:
import jo_wilder
env = jo_wilder.make_env()
iter_test = env.iter_test()

In [16]:
# Reference
# https://www.kaggle.com/code/philculliton/basic-submission-demo
# https://www.kaggle.com/code/cdeotte/random-forest-baseline-0-664/notebook

limits = {'0-4':(1,4), '5-12':(4,14), '13-22':(14,19)}

for (test, sample_submission) in iter_test:
    test_df = feature_engineer(test)
    grp = test_df.level_group.values[0]
    a,b = limits[grp]
    for t in range(a,b):
        model = models[f'{grp}_{t}']
        predictions = model.predict(test_df)
        mask = sample_submission.session_id.str.contains(f'q{t}')
        sample_submission.loc[mask,'correct'] = predictions.flatten()
    
    env.predict(sample_submission)

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [17]:
! head submission.csv

session_id,correct
20090109393214576_q1,1
20090109393214576_q2,1
20090109393214576_q3,1
20090109393214576_q4,1
20090109393214576_q5,1
20090109393214576_q6,1
20090109393214576_q7,1
20090109393214576_q8,1
20090109393214576_q9,1
