In [1]:
import pandas as pd
import numpy as np
from keras import backend as K

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
pd.set_option('display.max_column', 50)

In [2]:
# Directory prefix prepended to the CSV file names read by this notebook.
FOLDERPATH = "./data/"

# Columns treated as categorical.
CATEGORICAL_COLS = [
    "event_name",
    "name",
    "level",
    "page",
    "fullscreen",
    "hq",
    "music",
    "level_group",
]
# Identifier-like categorical columns.
ID_CATEGORICAL_COLS = ["fqid", "room_fqid", "text_fqid"]

# Numeric event columns.
NUMERICAL_COLS = [
    "elapsed_time",
    "room_coor_x",
    "room_coor_y",
    "screen_coor_x",
    "screen_coor_y",
    "hover_duration",
]

In [3]:
# Explicit per-column dtypes handed to read_csv: compact numeric types for the
# measurements and 'category' for the repeated string-valued columns.
dtypes = dict(
    elapsed_time=np.int32,
    event_name='category',
    name='category',
    level=np.uint8,
    room_coor_x=np.float32,
    room_coor_y=np.float32,
    screen_coor_x=np.float32,
    screen_coor_y=np.float32,
    hover_duration=np.float32,
    text='category',
    fqid='category',
    room_fqid='category',
    text_fqid='category',
    fullscreen='category',
    hq='category',
    music='category',
    level_group=str,
)

df_train = pd.read_csv(f"{FOLDERPATH}train.csv", dtype=dtypes)
print(f"The train dataset contains {df_train.shape[0]} rows and {df_train.shape[1]} columns")

# Last expression of the cell: show the first ten events.
df_train.head(10)

The train dataset contains 26296946 rows and 20 columns


Unnamed: 0,session_id,index,elapsed_time,event_name,name,level,page,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,fqid,room_fqid,text_fqid,fullscreen,hq,music,level_group
0,20090312431273200,0,0,cutscene_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,undefined,intro,tunic.historicalsociety.closet,tunic.historicalsociety.closet.intro,0,0,1,0-4
1,20090312431273200,1,1323,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,"Whatcha doing over there, Jo?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
2,20090312431273200,2,831,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,Just talking to Teddy.,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
3,20090312431273200,3,1147,person_click,basic,0,,-413.991394,-159.314682,380.0,494.0,,I gotta run to my meeting!,gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
4,20090312431273200,4,1863,person_click,basic,0,,-412.991394,-159.314682,381.0,494.0,,"Can I come, Gramps?",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
5,20090312431273200,5,3423,person_click,basic,0,,-412.991394,-157.314682,381.0,492.0,,"Sure thing, Jo. Grab your notebook and come up...",gramps,tunic.historicalsociety.closet,tunic.historicalsociety.closet.gramps.intro_0_...,0,0,1,0-4
6,20090312431273200,6,5197,person_click,basic,0,,478.485077,-199.97168,593.0,485.0,,"See you later, Teddy.",teddy,tunic.historicalsociety.closet,tunic.historicalsociety.closet.teddy.intro_0_cs_0,0,0,1,0-4
7,20090312431273200,7,6180,person_click,basic,0,,503.355133,-168.619919,609.0,453.0,,I get to go to Gramps's meeting!,teddy,tunic.historicalsociety.closet,tunic.historicalsociety.closet.teddy.intro_0_cs_0,0,0,1,0-4
8,20090312431273200,8,7014,person_click,basic,0,,510.733429,-157.720642,615.0,442.0,,Now where did I put my notebook?,teddy,tunic.historicalsociety.closet,tunic.historicalsociety.closet.teddy.intro_0_cs_0,0,0,1,0-4
9,20090312431273200,9,7946,person_click,basic,0,,512.048035,-153.743637,616.0,438.0,,\u00f0\u0178\u02dc\u00b4,teddy,tunic.historicalsociety.closet,tunic.historicalsociety.closet.teddy.intro_0_cs_0,0,0,1,0-4


In [4]:
# Keep only the events whose level_group is '0-4'.
# The explicit .copy() makes the result an independent frame, so the
# column assignments performed on df_train afterwards do not raise
# SettingWithCopyWarning (they would on a frame derived by boolean filtering).
df_train = df_train.loc[df_train.level_group == '0-4'].copy()
print(df_train.shape)

(3981005, 20)


In [5]:
# Min-max scale the coordinate and hover-duration columns to [0, 1];
# values that are missing after scaling are replaced by 0.
for col in ('room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration'):
    column = df_train[col]
    lowest = column.min()
    value_range = column.max() - lowest
    df_train[col] = ((column - lowest) / value_range).fillna(0)

In [6]:
# Categorical columns to one-hot encode; 'level_group' is no longer listed here.
CATEGORICAL_COLS = [
    "event_name",
    "name",
    "level",
    "page",
    "fullscreen",
    "hq",
    "music",
]
# Identifier-like categorical columns, encoded the same way.
ID_CATEGORICAL_COLS = ["fqid", "room_fqid", "text_fqid"]

In [7]:
# Work on an independent copy of the filtered, scaled events under the name `df`.
df = df_train.copy()

In [8]:
# Drop the name df_train; the data lives on in `df`.
del df_train

In [9]:
# One-hot encode every categorical column in a single pass.
# pd.get_dummies(frame, columns=...) keeps the untouched columns first (in
# their original order), appends the indicator columns in the order of the
# `columns` list, prefixes them with "<column>_" and drops the source columns.
# That is the same layout the previous per-column concat/drop loops produced,
# without re-copying the whole frame once per encoded column.
# dtype=int yields integer 0/1 indicators.
df = pd.get_dummies(df, columns=CATEGORICAL_COLS + ID_CATEGORICAL_COLS, dtype=int)
print(df.shape)

(3981005, 313)


In [10]:
# Preview the first five rows of the encoded frame.
df.head(5)

Unnamed: 0,session_id,index,elapsed_time,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,text,level_group,event_name_checkpoint,event_name_cutscene_click,event_name_map_click,event_name_map_hover,event_name_navigate_click,event_name_notebook_click,event_name_notification_click,event_name_object_click,event_name_object_hover,event_name_observation_click,event_name_person_click,name_basic,name_close,name_next,name_open,...,text_fqid_tunic.library.frontdesk.worker.preflag,text_fqid_tunic.library.frontdesk.worker.wells,text_fqid_tunic.library.frontdesk.worker.wells_recap,text_fqid_tunic.library.microfiche.reader.paper2.bingo,text_fqid_tunic.library.microfiche.reader_flag.paper2.bingo,text_fqid_tunic.wildlife.center.coffee,text_fqid_tunic.wildlife.center.crane_ranger.crane,text_fqid_tunic.wildlife.center.expert.recap,text_fqid_tunic.wildlife.center.expert.removed_cup,text_fqid_tunic.wildlife.center.remove_cup,text_fqid_tunic.wildlife.center.tracks.hub.deer,text_fqid_tunic.wildlife.center.wells.animals,text_fqid_tunic.wildlife.center.wells.animals2,text_fqid_tunic.wildlife.center.wells.nodeer,text_fqid_tunic.wildlife.center.wells.nodeer_recap,text_fqid_tunic.drycleaner.frontdesk.block_0,text_fqid_tunic.kohlcenter.halloffame.block_0,text_fqid_tunic.library.frontdesk.block_badge,text_fqid_tunic.historicalsociety.collection.gramps.look_0,text_fqid_tunic.library.microfiche.block_0,text_fqid_tunic.wildlife.center.fox.concern,text_fqid_tunic.library.frontdesk.block_badge_2,text_fqid_tunic.historicalsociety.entry.gramps.hub,text_fqid_tunic.humanecology.frontdesk.block_1,text_fqid_tunic.drycleaner.frontdesk.block_1
0,20090312431273200,0,0,0.33637,0.38087,0.19802,0.347887,0.0,undefined,0-4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,20090312431273200,1,1323,0.33637,0.38087,0.19802,0.347887,0.0,"Whatcha doing over there, Jo?",0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,20090312431273200,2,831,0.33637,0.38087,0.19802,0.347887,0.0,Just talking to Teddy.,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,20090312431273200,3,1147,0.33637,0.38087,0.19802,0.347887,0.0,I gotta run to my meeting!,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,20090312431273200,4,1863,0.336788,0.38087,0.198541,0.347887,0.0,"Can I come, Gramps?",0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [11]:
train_labels = pd.read_csv(FOLDERPATH + "train_labels.csv")
print(f"Loading {train_labels.shape[0]} labels.")

print("Preprocessing session IDs...")

# Split the raw id on "_": the first token is the numeric session id, the
# last token with its first character removed is the question number.
id_parts = train_labels['session_id'].str.split('_')
train_labels['session'] = id_parts.str[0].astype('int64')
train_labels['q'] = id_parts.str[-1].str[1:].astype('int64')

print(f"Labels contains {train_labels['session'].nunique()} sessions with answers to {train_labels['q'].nunique()} questions which is overall {train_labels.shape[0]} rows.")

# From here on session_id holds the numeric session id only.
train_labels["session_id"] = train_labels["session"]
train_labels = train_labels[["session_id", "q", "correct"]]

print("Creating pivot table...")
# One row per session, one column per question.
pivot_df = train_labels.pivot_table(index='session_id', columns='q', values='correct', aggfunc='first')
# Rename columns to 'q' followed by the question number
pivot_df.columns = [f"q{col}" for col in pivot_df.columns]
# A (session, question) pair without a label becomes 0, i.e. it is
# indistinguishable from an incorrect answer afterwards.
pivot_df = pivot_df.fillna(0).reset_index()
pivot_df.head()

Loading 424116 labels.
Preprocessing session IDs...
Labels contains 23562 sessions with answers to 18 questions which is overall 424116 rows.
Creating pivot table...


Unnamed: 0,session_id,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,q11,q12,q13,q14,q15,q16,q17,q18
0,20090312431273200,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
1,20090312433251036,0,1,1,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1
2,20090312455206810,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1
3,20090313091715820,0,1,1,1,1,0,1,1,1,0,0,1,0,1,0,1,1,1
4,20090313571836404,1,1,1,1,1,1,1,1,1,1,1,0,1,0,1,1,1,1


In [12]:
# Attach the per-session answer columns to every event row of that session.
# A left join keeps all event rows; 'text' and 'index' are not carried over.
trainset = (
    df.drop(columns=['text', 'index'])
      .merge(pivot_df, on="session_id", how='left')
)
print(trainset.shape)
trainset.head()

(3981005, 329)


Unnamed: 0,session_id,elapsed_time,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,level_group,event_name_checkpoint,event_name_cutscene_click,event_name_map_click,event_name_map_hover,event_name_navigate_click,event_name_notebook_click,event_name_notification_click,event_name_object_click,event_name_object_hover,event_name_observation_click,event_name_person_click,name_basic,name_close,name_next,name_open,name_prev,name_undefined,...,text_fqid_tunic.historicalsociety.collection.gramps.look_0,text_fqid_tunic.library.microfiche.block_0,text_fqid_tunic.wildlife.center.fox.concern,text_fqid_tunic.library.frontdesk.block_badge_2,text_fqid_tunic.historicalsociety.entry.gramps.hub,text_fqid_tunic.humanecology.frontdesk.block_1,text_fqid_tunic.drycleaner.frontdesk.block_1,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,q11,q12,q13,q14,q15,q16,q17,q18
0,20090312431273200,0,0.33637,0.38087,0.19802,0.347887,0.0,0-4,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
1,20090312431273200,1323,0.33637,0.38087,0.19802,0.347887,0.0,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
2,20090312431273200,831,0.33637,0.38087,0.19802,0.347887,0.0,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
3,20090312431273200,1147,0.33637,0.38087,0.19802,0.347887,0.0,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
4,20090312431273200,1863,0.336788,0.38087,0.198541,0.347887,0.0,0-4,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1


In [13]:
cols = list(trainset.columns)
# Skip the leading session_id column and the 18 trailing question columns.
features = cols[1:len(cols) - 18]

print("Features columns:", features)

Features columns: ['elapsed_time', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration', 'level_group', 'event_name_checkpoint', 'event_name_cutscene_click', 'event_name_map_click', 'event_name_map_hover', 'event_name_navigate_click', 'event_name_notebook_click', 'event_name_notification_click', 'event_name_object_click', 'event_name_object_hover', 'event_name_observation_click', 'event_name_person_click', 'name_basic', 'name_close', 'name_next', 'name_open', 'name_prev', 'name_undefined', 'level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'page_0.0', 'page_1.0', 'fullscreen_0', 'fullscreen_1', 'hq_0', 'hq_1', 'music_0', 'music_1', 'fqid_archivist', 'fqid_archivist_glasses', 'fqid_block', 'fqid_block_0', 'fqid_block_magnify', 'fqid_block_nelson', 'fqid_block_tocollection', 'fqid_block_tomap1', 'fqid_block_tomap2', 'fqid_boss', 'fqid_businesscards', 'fqid_businesscards.card_0.next', 'fqid_businesscards.card_1.next', 'fqid_businesscards.card_bingo.bingo', 

In [14]:
from keras import initializers
from keras import regularizers
from keras import constraints
from keras.layers import Dot, Softmax, Layer, Reshape
import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Input, Concatenate, Softmax, Multiply
from sklearn.preprocessing import StandardScaler
import tensorflow.keras.backend as K

from keras.layers import Layer, Dot, Softmax
import keras.backend as K
from tensorflow.keras.layers import Bidirectional

class Attention(Layer):
    """Attention pooling over the time axis of a 3-D input.

    For an input ``x`` of shape ``(batch, step_dim, features)`` the layer
    computes one score per time step as ``tanh(x . W + b)``, exponentiates the
    scores, normalises them over the time axis and returns the weighted sum
    of ``x`` along that axis, i.e. a tensor of shape ``(batch, features)``.

    Args:
        step_dim: Length of the time axis (``input_shape[1]``) of the input.
        W_regularizer: Optional regularizer for the scoring vector ``W``.
        b_regularizer: Optional regularizer for the per-step bias ``b``.
        W_constraint: Optional constraint for ``W``.
        b_constraint: Optional constraint for ``b``.
        bias: Whether to add a learned bias (one value per time step).
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        # Initialise the base Layer before assigning any attribute so that
        # the base constructor cannot overwrite what is set below.
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        # Set to the size of the last input axis in build().
        self.features_dim = 0

    def build(self, input_shape):
        """Create ``W`` (one weight per feature) and, if enabled, ``b`` (one per step).

        Raises:
            ValueError: If the input's time axis is known and differs from ``step_dim``.
        """
        assert len(input_shape) == 3
        # call() reshapes the scores to (-1, step_dim); a different time-axis
        # length would only fail later with an obscure shape error.
        if input_shape[1] is not None and input_shape[1] != self.step_dim:
            raise ValueError(
                "Attention was created with step_dim=%s but received an input "
                "with %s time steps" % (self.step_dim, input_shape[1]))

        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='W',
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zeros',
                                     name='b',
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is propagated to the layers after this one."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over its time axis."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch * steps, features), project onto W, then fold the
        # scores back to (batch, steps).
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Zero the weight of masked time steps before normalising.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # epsilon keeps the division defined when every weight is zero.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Output shape is ``(batch, features)``."""
        return input_shape[0],  self.features_dim

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised and re-created."""
        config = super(Attention, self).get_config()
        config.update({
            'step_dim': self.step_dim,
            'W_regularizer': regularizers.serialize(self.W_regularizer),
            'b_regularizer': regularizers.serialize(self.b_regularizer),
            'W_constraint': constraints.serialize(self.W_constraint),
            'b_constraint': constraints.serialize(self.b_constraint),
            'bias': self.bias,
        })
        return config


# Builds and compiles the bidirectional LSTM + attention network.
def model_lstm(input_shape):
    """Build and compile a bidirectional-LSTM binary classifier with attention pooling.

    Args:
        input_shape: Shape of the full input array; only ``input_shape[1]``
            (time steps) and ``input_shape[2]`` (channels per step) are used.

    Returns:
        A compiled Keras ``Model`` with a single sigmoid output, trained with
        binary cross-entropy and the Adam optimizer, reporting accuracy.
    """
    n_steps, n_channels = input_shape[1], input_shape[2]
    sequence_in = Input(shape=(n_steps, n_channels))

    # Two stacked bidirectional LSTMs; both return the full sequence because
    # the attention layer below needs one vector per time step.
    hidden = Bidirectional(LSTM(128, return_sequences=True))(sequence_in)
    hidden = Bidirectional(LSTM(64, return_sequences=True))(hidden)

    # Collapse the time axis into a single weighted-sum vector.
    pooled = Attention(n_steps)(hidden)

    # Intermediate fully connected layer, then the (1,)-shaped probability.
    dense_out = Dense(64, activation="relu")(pooled)
    probability = Dense(1, activation="sigmoid")(dense_out)

    network = Model(inputs=sequence_in, outputs=probability)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [15]:
print(trainset.shape)
# Drop rows that repeat an already-seen combination of features[2:] (every
# feature except the first two), keeping the last occurrence.
# NOTE(review): session_id is not part of the subset, so rows from different
# sessions can also be collapsed into one - confirm that this is intended.
trainset = trainset.drop_duplicates(subset=features[2:], keep='last')
print(trainset.shape)

# Drop columns with only one unique value (checked over features[1:]).
unique_counts = trainset[features[1:]].nunique()
columns_to_drop = unique_counts.index[unique_counts == 1]
trainset = trainset.drop(columns=columns_to_drop)

print(trainset.shape)

(3981005, 329)
(3101816, 329)
(3101816, 124)


In [16]:
# Preview the training frame after de-duplication and constant-column removal.
trainset.head()

Unnamed: 0,session_id,elapsed_time,room_coor_x,room_coor_y,screen_coor_x,screen_coor_y,hover_duration,event_name_checkpoint,event_name_cutscene_click,event_name_map_click,event_name_map_hover,event_name_navigate_click,event_name_notebook_click,event_name_notification_click,event_name_object_click,event_name_object_hover,event_name_observation_click,event_name_person_click,name_basic,name_close,name_next,name_open,name_prev,name_undefined,level_0,...,text_fqid_tunic.historicalsociety.entry.wells.talktogramps,text_fqid_tunic.historicalsociety.stacks.outtolunch,text_fqid_tunic.kohlcenter.halloffame.plaque.face.date,text_fqid_tunic.kohlcenter.halloffame.togrampa,text_fqid_tunic.kohlcenter.halloffame.block_0,text_fqid_tunic.historicalsociety.collection.gramps.look_0,text_fqid_tunic.historicalsociety.entry.gramps.hub,q1,q2,q3,q4,q5,q6,q7,q8,q9,q10,q11,q12,q13,q14,q15,q16,q17,q18
0,20090312431273200,0,0.33637,0.38087,0.19802,0.347887,0.0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
3,20090312431273200,1147,0.33637,0.38087,0.19802,0.347887,0.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
4,20090312431273200,1863,0.336788,0.38087,0.198541,0.347887,0.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
5,20090312431273200,3423,0.336788,0.382631,0.198541,0.346479,0.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1
6,20090312431273200,5197,0.709686,0.34506,0.309015,0.341549,0.0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1


In [17]:
# Recompute the feature list now that columns have been dropped from trainset.
cols = list(trainset.columns)
# Skip the leading session_id column and the 18 trailing question columns.
features = cols[1:len(cols) - 18]

print("Features columns:", features)

Features columns: ['elapsed_time', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration', 'event_name_checkpoint', 'event_name_cutscene_click', 'event_name_map_click', 'event_name_map_hover', 'event_name_navigate_click', 'event_name_notebook_click', 'event_name_notification_click', 'event_name_object_click', 'event_name_object_hover', 'event_name_observation_click', 'event_name_person_click', 'name_basic', 'name_close', 'name_next', 'name_open', 'name_prev', 'name_undefined', 'level_0', 'level_1', 'level_2', 'level_3', 'level_4', 'page_0.0', 'page_1.0', 'fullscreen_0', 'fullscreen_1', 'hq_0', 'hq_1', 'music_0', 'music_1', 'fqid_block_0', 'fqid_block_tocollection', 'fqid_block_tomap1', 'fqid_block_tomap2', 'fqid_boss', 'fqid_chap1_finale', 'fqid_chap1_finale_c', 'fqid_cs', 'fqid_directory', 'fqid_doorblock', 'fqid_gramps', 'fqid_groupconvo', 'fqid_intro', 'fqid_janitor', 'fqid_notebook', 'fqid_outtolunch', 'fqid_photo', 'fqid_plaque', 'fqid_plaque.face.date', 

In [19]:
from sklearn.linear_model import LogisticRegression

print("Using LogReg\nFiltering for level_qroups:")

# Per-question metric lists; the loop below appends one entry per question
# to the LogReg and LSTM lists.
logreg_accuracies = []
logreg_f1_scores = []

gbc_accuracies = []
gbc_f1_scores = []

lstm_accuracies = []
lstm_f1_scores = []

lstm_a_accuracies = []
lstm_a_f1_scores = []

for i in range(1, 4):
    question = f"q{i}"

    # Split on session ids (stratified on this question's answer) so that all
    # event rows of one session end up on the same side of the split.
    train_session_ids, test_session_ids = train_test_split(
        pivot_df['session_id'], test_size=0.2, random_state=42, stratify=pivot_df[question])

    X_train = trainset[trainset['session_id'].isin(train_session_ids)]
    X_test = trainset[trainset['session_id'].isin(test_session_ids)]

    print("X_train shape: ", X_train.shape)
    print("X_test shape: ", X_test.shape)
    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[features])
    X_test_scaled = scaler.transform(X_test[features])
    y_train = X_train[question]
    y_test = X_test[question]

    # Create and train the logistic regression model
    lr_model = LogisticRegression()
    lr_model.fit(X_train_scaled, y_train)

    # Evaluate the Logreg model: predict once and derive both metrics from it.
    predictions = lr_model.predict(X_test_scaled)
    accuracy = accuracy_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)

    print("\tLogreg F1 Score:", f1)
    print("\tLogreg Accuracy:", accuracy)
    logreg_accuracies.append(accuracy)
    logreg_f1_scores.append(f1)

    # The LSTM reads each row as a sequence with one scaled feature per step,
    # so the sequence length must follow the actual number of feature columns
    # rather than a hard-coded value.
    timesteps = X_train_scaled.shape[1]

    model = Sequential([
        LSTM(64, input_shape=(timesteps, 1)),
        Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    # Reshape features for LSTM input: (rows, timesteps, 1 channel)
    X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], timesteps, 1)
    X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], timesteps, 1)

    # Train the model
    history = model.fit(X_train_reshaped, y_train, epochs=2, batch_size=64, validation_data=(X_test_reshaped, y_test))

    # predict() returns one probability per row with shape (rows, 1);
    # threshold at 0.5 and flatten to 1-D labels for the sklearn metrics.
    predictions = model.predict(X_test_reshaped)
    predicted_labels = (predictions > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_test, predicted_labels)
    # Calculate F1 score
    f1 = f1_score(y_test, predicted_labels)
    print("\tLSTM F1 Score:", f1)
    print("\tLSTM Accuracy:", accuracy)
    lstm_accuracies.append(accuracy)
    lstm_f1_scores.append(f1)


Using LogReg
Filtering for level_qroups:
X_train shape:  (2481001, 124)
X_test shape:  (620815, 124)
	Logreg F1 Score: 0.8184965442010065
	Logreg Accuracy: 0.6928569702729477
Epoch 1/2
Epoch 2/2
 1456/38766 [>.............................] - ETA: 23:53 - loss: 0.6106 - accuracy: 0.6998

In [23]:
# This cell trains only the LSTM + Attention model, so the banner says so and
# the LogReg result lists filled by the earlier cell are left untouched.
print("Using LSTM with Attention\nFiltering for level_qroups:")

# Per-question metrics for the attention model.
lstm_a_accuracies = []
lstm_a_f1_scores = []

for i in range(1, 4):
    question = f"q{i}"

    # Split on session ids (stratified on this question's answer) so that all
    # event rows of one session end up on the same side of the split.
    train_session_ids, test_session_ids = train_test_split(
        pivot_df['session_id'], test_size=0.2, random_state=42, stratify=pivot_df[question])

    X_train = trainset[trainset['session_id'].isin(train_session_ids)]
    X_test = trainset[trainset['session_id'].isin(test_session_ids)]

    print("X_train shape: ", X_train.shape)
    print("X_test shape: ", X_test.shape)
    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train[features])
    X_test_scaled = scaler.transform(X_test[features])
    # One scaled feature per time step; derive the length from the data so the
    # reshape keeps working when the feature set changes.
    timesteps = X_train_scaled.shape[1]
    X_train_reshaped = X_train_scaled.reshape(X_train_scaled.shape[0], timesteps, 1)
    X_test_reshaped = X_test_scaled.reshape(X_test_scaled.shape[0], timesteps, 1)

    y_train = X_train[question]
    y_test = X_test[question]
    model = model_lstm(X_train_reshaped.shape)

    # validation_data is passed as an (x, y) tuple, the documented form.
    model.fit(X_train_reshaped, y_train, batch_size=128, epochs=2, validation_data=(X_test_reshaped, y_test))

    # predict() returns one probability per row with shape (rows, 1);
    # threshold at 0.5 and flatten to 1-D labels for the sklearn metrics.
    predictions = model.predict(X_test_reshaped)
    predicted_labels = (predictions > 0.5).astype(int).ravel()
    accuracy = accuracy_score(y_test, predicted_labels)
    # Calculate F1 score
    f1 = f1_score(y_test, predicted_labels)
    print("\tLSTM Attention F1 Score:", f1)
    print("\tLSTM Attention Accuracy:", accuracy)
    lstm_a_accuracies.append(accuracy)
    lstm_a_f1_scores.append(f1)

Using LogReg
Filtering for level_qroups:
X_train shape:  (2481001, 124)
X_test shape:  (620815, 124)
Epoch 1/2
Epoch 2/2