In [1]:

# Build the environment object exposed by the jo_wilder_310 module and take
# its test iterator. Presumably a prediction loop further down consumes
# `iter_test` -- that loop is not visible in this part of the file.
import jo_wilder_310
env = jo_wilder_310.make_env()
iter_test = env.iter_test()

In [2]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import os
import math 
import json
import glob
import torch.nn as nn
import torch
import torch.nn.functional as F
from torch.utils.data import DataLoader
from joblib import Parallel, delayed
from tqdm import tqdm
from collections import defaultdict
from sklearn.model_selection import StratifiedKFold, GroupKFold
import polars as pl
import multiprocessing 
from catboost import CatBoostClassifier,CatBoostRegressor,Pool
import lightgbm as lgb
import gc
import sys
import time
import pickle
warnings.simplefilter(action='ignore', category=pd.errors.PerformanceWarning)
pd.set_option('mode.chained_assignment', None)


class Parameter(object):
    """Mutable bag of run settings, created with this file's default values.

    Attributes are plain instance attributes; ``get``/``set`` are thin
    name-based accessors and ``str()`` lists every attribute as ``name:value``.
    """

    def __init__(self):
        # All defaults go through ``set`` so that they are stored in exactly
        # the order written here (``__str__`` prints them in that order).
        self.set(
            user_dir='./user_data/',                  # directory used for user data
            k_folds=5,                                # number of cross-validation folds
            n_jobs=2,                                 # worker count for parallel execution
            random_seed=27,                           # seed for random number generation
            seq_length=2560,                          # sequence length for sequence-based computations
            cpu_cnt=multiprocessing.cpu_count(),      # CPUs reported by the system
            use_cuda=torch.cuda.is_available(),       # True when CUDA is usable
            gpu=0,                                    # GPU index (0 = first GPU)
            print_freq=1000,                          # how often progress messages are printed
            lr=0.003,                                 # optimizer learning rate
            weight_decay=0,                           # optimizer weight decay
            optim='Adam',                             # optimizer name
            base_epoch=30,                            # base number of training epochs
        )

    def get(self, name):
        """Return the attribute called ``name`` (AttributeError if it is missing)."""
        return getattr(self, name)

    def set(self, **kwargs):
        """Create or overwrite one attribute per keyword argument."""
        for attr_name in kwargs:
            setattr(self, attr_name, kwargs[attr_name])

    def __str__(self):
        """Render every attribute as ``name:value``, one per line, in insertion order."""
        rendered = []
        for attr_name, attr_value in vars(self).items():
            rendered.append(f"{attr_name!s}:{attr_value!s}")
        return '\n'.join(rendered)

# Module-level Parameter instance carrying the default settings.
parameter = Parameter()
    
def processing_df(df, cat_dict):
    """Encode ``level_group`` as an integer code, then encode the categorical columns.

    ``df`` is modified in place (its ``level_group`` column is overwritten)
    before being handed to ``cat_encoder``; the encoded frame is returned.
    Labels other than '0-4', '5-12' and '13-22' become missing values.
    """
    level_group_codes = {'0-4': 0, '5-12': 1, '13-22': 2}
    df['level_group'] = df['level_group'].map(level_group_codes)
    return cat_encoder(df, cat_dict)
    

def _step_delta(column, partition):
    """Row-to-previous-row difference of ``column`` inside each ``partition`` window."""
    return (pl.col(column) - pl.col(column).shift(1)).over(partition)


def _range_scaled(column, partition):
    """Min-max scale ``column`` to its own range inside each ``partition`` window."""
    value = pl.col(column)
    return ((value - value.min()) / (value.max() - value.min())).over(partition)


def _movement_exprs(partition, suffix=''):
    """Elapsed-time delta plus room/screen |dx|, |dy| and distance for one window.

    Aliases are the base feature names with ``suffix`` appended.
    """
    exprs = [_step_delta('elapsed_time', partition).alias('elapsed_time_diff' + suffix)]
    for space in ('room', 'screen'):
        dx = _step_delta(space + '_coor_x', partition)
        dy = _step_delta(space + '_coor_y', partition)
        exprs += [
            dx.abs().alias('{}_coor_x_diff{}'.format(space, suffix)),
            dy.abs().alias('{}_coor_y_diff{}'.format(space, suffix)),
            (dx ** 2 + dy ** 2).sqrt().alias('{}_dist{}'.format(space, suffix)),
        ]
    return exprs


def _distribution_exprs(column, partition, stat_fmt, quantile_fmt, with_count):
    """Window statistics of ``column``: std/mean/max/min/rank/rank2/[cnt]/uni + deciles.

    ``stat_fmt`` must contain ``{stat}`` and ``quantile_fmt`` must contain
    ``{p}``; they produce the output column names.
    """
    value = pl.col(column)
    rank = value.rank('min').over(partition)
    count = value.count().over(partition)
    exprs = [
        value.std().over(partition).alias(stat_fmt.format(stat='std')),
        value.mean().over(partition).alias(stat_fmt.format(stat='mean')),
        value.max().over(partition).alias(stat_fmt.format(stat='max')),
        value.min().over(partition).alias(stat_fmt.format(stat='min')),
        rank.alias(stat_fmt.format(stat='rank')),
        (rank / count).alias(stat_fmt.format(stat='rank2')),
    ]
    if with_count:
        exprs.append(count.alias(stat_fmt.format(stat='cnt')))
    exprs.append(value.n_unique().over(partition).alias(stat_fmt.format(stat='uni')))
    exprs += [
        value.quantile(p, "nearest").over(partition).alias(quantile_fmt.format(p=p))
        for p in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    ]
    return exprs


def get_features(df):
    """Add per-row window features to an event log and return it as a pandas frame.

    The pandas ``df`` is converted to polars, extended with new columns and
    converted back. Every feature is a window expression, so the number of
    rows is unchanged. Columns are created in this order (the order is kept
    stable on purpose, since downstream code may rely on column positions):

    1. deltas to the previous row within ``session_id`` (elapsed time, room and
       screen coordinates, Euclidean distances) and per
       ``(session_id, level_group)`` row count, elapsed-time range and min-max
       scaled ``index`` / ``elapsed_time``;
    2. the same deltas within ``(session_id, <categorical column>)``, suffixed
       with the categorical column name;
    3. distinct counts and dense elapsed-time ranks for each categorical column;
    4. distribution statistics and deciles of ``elapsed_time_diff`` per
       ``(session_id, level_group)`` and per
       ``(session_id, level_group, <column>)``.

    Parameters:
        df: pandas DataFrame containing the columns referenced below
            (``session_id``, ``level_group``, ``level``, ``index``,
            ``elapsed_time``, the room/screen coordinates and the categorical
            columns).

    Returns:
        pandas DataFrame with the original columns plus the features above.
    """
    group = ['session_id', 'level_group']
    categorical = ['event_name', 'name', 'room_fqid', 'fqid', 'text_fqid', 'text']

    df = pl.from_pandas(df)

    # Session-level deltas and level-group level context, in a single pass.
    df = df.with_columns(
        _movement_exprs(['session_id'])
        + [
            pl.col('index').count().over(group).alias('index_cnt'),
            (pl.col('elapsed_time').max() - pl.col('elapsed_time').min())
            .over(group).alias('elapsed_time_range'),
            _range_scaled('index', group).alias('index2'),
            _range_scaled('elapsed_time', group).alias('elapsed_time2'),
        ]
    )

    # Same deltas, but relative to the previous row sharing the same category value.
    for col in categorical:
        df = df.with_columns(_movement_exprs(['session_id', col], '_' + col))

    all_aggs = []
    for col in categorical:
        by_value = group + [col]
        time_rank = pl.col('elapsed_time').rank('dense').over(by_value)
        all_aggs += [
            pl.col(col).n_unique().over(group).alias('{}_uni'.format(col)),
            pl.col(col).n_unique().over(['session_id', 'level']).alias('{}_uni2'.format(col)),
            time_rank.alias('{}_elapsed_time_rank'.format(col)),
            (time_rank / pl.col('elapsed_time').count().over(by_value))
            .alias('{}_elapsed_time_rank2'.format(col)),
        ]

    # These read the session-level `elapsed_time_diff` created in the first pass.
    col = 'elapsed_time_diff'
    all_aggs += _distribution_exprs(
        col, group,
        stat_fmt=col + '_{stat}',
        quantile_fmt=col + '_p{p}',
        with_count=False,
    )
    for col2 in ['level'] + categorical:
        all_aggs += _distribution_exprs(
            col, group + [col2],
            stat_fmt=col + '_{stat}_col_' + col2,
            quantile_fmt=col + '_' + col2 + '_p{p}',
            with_count=True,
        )

    df = df.with_columns(all_aggs)
    return df.to_pandas()



def cat_encoder(df, cat_dict):
    """Integer-encode the categorical columns of ``df`` in place and return it.

    ``level`` only has its missing values replaced by 0 and is cast to int.
    Every other categorical column is stringified, looked up in
    ``cat_dict[column]`` and cast to int; values absent from the mapping
    become 0.
    """
    # 'level' needs no lookup table: patch the gaps and cast.
    df['level'] = df['level'].fillna(0).astype(int)

    for column in ('event_name', 'name', 'room_fqid', 'fqid', 'text_fqid', 'text'):
        as_text = df[column].astype(str)
        codes = as_text.map(cat_dict[column])
        # Unmapped values come back as NaN from ``map``; they share code 0.
        df[column] = codes.fillna(0).astype(int)

    return df


def normalize(df_np, all_cols, standard_dict):
    """Standardize the leading columns of a 2-D float array, column by column.

    Column ``i`` of ``df_np`` is standardized with the statistics stored under
    ``standard_dict[all_cols[i]]`` (element 0 = mean, element 1 = standard
    deviation), clipped to [-30, 30], and any NaN in the result is set to 0.

    The work is plain element-wise arithmetic, so it runs in-process; a worker
    pool would spend more time shipping columns around than computing.

    Parameters:
        df_np: 2-D floating-point numpy array. Infinite entries are replaced
            by NaN **in place** before standardizing.
        all_cols: column names, positionally aligned with ``df_np``'s columns.
            Only the first ``len(all_cols)`` columns are processed.
        standard_dict: mapping ``name -> (mean, std, ...)``.

    Returns:
        New array of shape ``(df_np.shape[0], len(all_cols))``.

    Raises:
        KeyError: if a name in ``all_cols`` is missing from ``standard_dict``.
    """
    # Infinities would otherwise survive as +/-30 after clipping; treat them as missing.
    df_np[np.isinf(df_np)] = np.nan

    # np.concatenate rejects an empty list, so handle "no columns" explicitly.
    if len(all_cols) == 0:
        return np.empty((df_np.shape[0], 0), dtype=df_np.dtype)

    scaled_columns = []
    for position, name in enumerate(all_cols):
        stats = standard_dict[name]
        col_mean, col_std = stats[0], stats[1]
        scaled = np.clip((df_np[:, position] - col_mean) / col_std, -30, 30)
        scaled_columns.append(scaled.reshape((-1, 1)))

    result = np.concatenate(scaled_columns, axis=1)

    # Missing values (including former infinities and 0/0 results) become 0.
    result[np.isnan(result)] = 0.0
    return result

In [3]:
l1_texts = ['undefined', 'Whatcha doing over there, Jo?', 'Just talking to Teddy.', 'I gotta run to my meeting!', 'Can I come, Gramps?', 'Sure thing, Jo. Grab your notebook and come upstairs!', 'See you later, Teddy.', "I get to go to Gramps's meeting!", 'Now where did I put my notebook?', '\\u00f0\\u0178\\u02dc\\u00b4', 'null', 'I love these photos of me and Teddy!', 'Found it!', 'Gramps is in trouble for losing papers?', "This can't be right!", 'Gramps is a great historian!', "Hmm. Button's still not working.", "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!", 'Who wants to investigate the shirt artifact?', "Not Leopold here. He's been losing papers lately.", 'Hey!', "It's true, they do keep going missing lately.", 'See?', 'Besides, I already figured out the shirt.', "It's a women's basketball jersey!", 'That settles it.', 'Wells, finish up your report.', "Leopold, why don't you help me set up in the Capitol?", 'We need to talk about that missing paperwork.', 'Will do, Boss.', "Hey Jo, let's take a look at the shirt!", 'Your grampa is waiting for you in the collection room.', "Why don't you go catch up with your grampa?", 'What a fascinating artifact!', "Wow, that's so cool, Gramps!", 'Can I take a closer look?', "Hmmm. Shouldn't you be doing your homework?", "It's already all done!", 'Plus, my teacher said I could help you out for extra credit!', "Well, that's good enough for me.", 'Go ahead, take a peek at the shirt!', 'This looks like a clue!', "I'll record this in my notebook.", 'Find anything?', 'Yes! This old slip from 1916.', 'I knew it!', "I'm not so sure that this is a basketball jersey.", 'Wait, you mean Wells is wrong?!', 'Could be. But we need evidence!', "Why don't you head to the Basketball Center and rustle up some clues?", 'Sure!', "I'll be at the Capitol. 
Let me know if you find anything!", 'Better check back later.', "That's it!", "The slip is from 1916 but the team didn't start until 1974!", 'Our shirt is too old to be a basketball jersey!', 'I need to get to the Capitol and tell Gramps!', 'I should see what Grampa is up to!', 'Ugh. Meetings are so boring.', 'Grab your notebook and come upstairs!', 'Hang tight, Teddy.', "I'll hurry back and then we can go exploring!", 'Well, Leopold here is always losing papers...', 'Ha. Told you so!', 'Can we hurry up, Gramps?', 'Teddy and I were gonna go climb that huge tree out back!', "Hmmm. Don't forget about your homework.", 'Your teacher said you missed 7 assignments in a row!', 'So? History is boring!', 'I suppose historians are boring, too?', "No way, Gramps. You're the best!", 'Then do it for me!', 'Your teacher said you could help me for extra credit.', 'A boring old shirt.', 'Just this old slip from 1916.', 'Do I have to?', 'Um... what did you want me to do again?', 'Head over to the Basketball Center.', 'Hopefully you can find some clues!', 'Meetings are BORING!', "I feel like I'm forgetting something.", 'Gramps is the best historian ever!', 'This button never works!', "Why don't you go play with your grampa?", "Look at that! It's the bee's knees!", "Well, I did SOME of those. I just couldn't find them!", 'Did you do all of them?', 'No... because history is boring!', 'Hooray, a boring old shirt.', 'Hot Dog! I knew it!', 'Ooh, I like clues!', 'Hopefully you can rustle up some clues!', 'I should go talk to Gramps!', 'Yes! This cool old slip from 1916.', 'I should see what Gramps is up to!', 'Gramps said to look for clues. Better look around.', 'Have a look at the artifact!', 'Come on, Jo!', "Meet me back in my office and we'll get started!"]
l2_texts = ['undefined', 'Whatcha doing over there, Jo?', 'Just talking to Teddy.', 'I gotta run to my meeting!', 'Can I come, Gramps?', 'Sure thing, Jo. Grab your notebook and come upstairs!', 'See you later, Teddy.', "I get to go to Gramps's meeting!", 'Now where did I put my notebook?', '\\u00f0\\u0178\\u02dc\\u00b4', 'null', 'I love these photos of me and Teddy!', 'Found it!', 'Gramps is in trouble for losing papers?', "This can't be right!", 'Gramps is a great historian!', "Hmm. Button's still not working.", "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!", 'Who wants to investigate the shirt artifact?', "Not Leopold here. He's been losing papers lately.", 'Hey!', "It's true, they do keep going missing lately.", 'See?', 'Besides, I already figured out the shirt.', "It's a women's basketball jersey!", 'That settles it.', 'Wells, finish up your report.', "Leopold, why don't you help me set up in the Capitol?", 'We need to talk about that missing paperwork.', 'Will do, Boss.', "Hey Jo, let's take a look at the shirt!", 'Your grampa is waiting for you in the collection room.', "Why don't you go catch up with your grampa?", 'What a fascinating artifact!', "Wow, that's so cool, Gramps!", 'Can I take a closer look?', "Hmmm. Shouldn't you be doing your homework?", "It's already all done!", 'Plus, my teacher said I could help you out for extra credit!', "Well, that's good enough for me.", 'Go ahead, take a peek at the shirt!', 'This looks like a clue!', "I'll record this in my notebook.", 'Find anything?', 'Yes! This old slip from 1916.', 'I knew it!', "I'm not so sure that this is a basketball jersey.", 'Wait, you mean Wells is wrong?!', 'Could be. But we need evidence!', "Why don't you head to the Basketball Center and rustle up some clues?", 'Sure!', "I'll be at the Capitol. 
Let me know if you find anything!", 'Better check back later.', "That's it!", "The slip is from 1916 but the team didn't start until 1974!", 'Our shirt is too old to be a basketball jersey!', 'I need to get to the Capitol and tell Gramps!', 'What are you still doing here,  Jolie?', 'Go find your grampa and get to work!', 'Oh no!', 'What happened here?!', "I don't know!", 'I got here and the whole place was a mess!', 'Can you help me tidy up?', "Teddy's scarf! Somebody must've taken him!", 'Try not to panic, Jo.', 'Maybe he just got scared and ran off.', 'But he never goes anywhere without his scarf!', "I think he's in trouble!", 'Is this your coffee, Gramps?', "Nope, that's from Bean Town. I only drink Holdgers!", "Who could've done this?", "It must've been Wells.", "He's always trying to get you in trouble, and he doesn't like animals!", 'Slow down, Jo.', 'But what if Wells kidnapped Teddy?', 'Then we need evidence.', "You're right, Gramps. Let's investigate!", "I'm afraid my papers have gone missing in this mess.", "You'll have to get started without me.", "Okay. I'll find Teddy!", "And I'll figure out the shirt, too.", 'I knew I could count on you, Jo!', "Why don't you go upstairs and see the archivist?", "He's our expert record keeper.", 'I need your help!', 'Who are you?', "I'm Leopold's grandkid!", "Sorry, I'm too busy for kids right now.", 'Now if only I could read this thing.', "Can't believe I lost my reading glasses.", 'I bet the archivist could use this!', "Ah, that's better!", 'Did you have a question?', 'Yes! I was wondering-', 'Wait a minute!', 'Where did you get that coffee?', "Oh, that's from Bean Town.", 'I ran into Wells there this morning.', 'Wells? I knew it!', 'Do you know anything about this slip?', 'I found it on an old shirt.', 'An old shirt? Try the university.', 'You can talk to a textile expert there.', "What's a textile expert?", 'They study clothes and fabric.', 'Great! 
Thanks for the help!', 'Head over to the university.', 'Hello there!', 'Wow! What is all this stuff?', "It's our Norwegian Craft exhibit!", 'Can I give you the tour?', "Sorry, I'm in a hurry.", 'Do you know what this slip is?', 'Looks like a dry cleaning receipt.', 'Thanks.', 'Now I Just need to find all the cleaners from way back in 1916.', 'Maybe I can help!', "I've got a stack of business cards from my favorite cleaners.", "Why don't you take a look?", 'This place was around in 1916! I can start there!', "You haven't seen any badgers around here, have you?", 'Badgers? No.', 'Okay. Thanks anyway.', 'Hi! How can I help you?', 'I need to find the owner of this slip.', "Well, I can't show our log books to just anybody.", 'Please?', "It's for Grampa Leo. He's a historian!", 'Leo... you mean Leopold?', 'Your gramps is awesome! Always full of stories.', "Guess it couldn't hurt to let you take a look.", "Here's the log book.", "It's a match!", 'Theodora Youmans must be the owner!', 'Do you know who Theodora Youmans is?', "Hmmm... not sure. Why don't you try the library?", 'Thanks for the help!', 'Oh, hello there!', 'How can I help you?', 'Have you seen a badger around here?', "I'm afraid not.", 'Please let me know if you do.', "I'm also looking for Theodora Youmans. Have you heard of her?", 'Theodora Youmans? Of course!', "Check out our microfiche. It's right through that door.", 'Youmans was a suffragist!', 'She helped get votes for women!', 'Wells! What was he doing here? I should ask the librarian.', 'What was Wells doing here?', 'He was looking for a taxidermist.', "What's a taxidermist?", 'Not sure. Here, let me look it up.', '\\Taxidermy: the art of preparing, stuffing, and mounting the skins of animals.\\', 'Oh no... Teddy!', 'Can you help me find Wells?', 'You could ask the archivist. He knows everybody!', "Jolie! I was hoping you'd stop by. Any news on the shirt artifact?", "I haven't quite figured it out just yet...", "Well, get on it. 
I'm counting on you and your gramps to figure this out!", 'Can you help me? I need to find Wells!', "I haven't seen him.", 'Please? This is really important.', "Sorry, can't help you.", 'Do you have any info on Theodora Youmans?', 'Theodora Youmans? Is that who owned the shirt?', 'Yep.', "Why didn't you say so?", 'Youmans was a suffragist here in Wisconsin.', 'She led marches and helped women get the right to vote!', "Wait a sec. Women couldn't vote?!", 'Nope. But Youmans and other suffragists worked hard to change that.', 'Thanks to them, Wisconsin was the first state to approve votes for women!', 'Wow!', "Here's a call number to find more info in the Stacks.", 'Where are the Stacks?', 'Right outside the door.', 'Hey, this is Youmans!', "And look! She's wearing the shirt!", 'I should go to the Capitol and tell everyone!', 'I should see what Grampa is up to!', 'What should I do first?', 'Head upstairs and talk to the archivist. He might be able to help!', 'Ugh. Meetings are so boring.', 'Grab your notebook and come upstairs!', 'Hang tight, Teddy.', "I'll hurry back and then we can go exploring!", 'Well, Leopold here is always losing papers...', 'Ha. Told you so!', 'Can we hurry up, Gramps?', 'Teddy and I were gonna go climb that huge tree out back!', "Hmmm. Don't forget about your homework.", 'Your teacher said you missed 7 assignments in a row!', 'So? History is boring!', 'I suppose historians are boring, too?', "No way, Gramps. You're the best!", 'Then do it for me!', 'Your teacher said you could help me for extra credit.', 'A boring old shirt.', 'Just this old slip from 1916.', 'Do I have to?', 'What the-', 'I have an idea.', "He's wrong about old shirts and his name rhymes with \\smells\\...", 'BUT WELLS STOLE TEDDY!', 'Could be. But we need evidence.', "Fine. Let's investigate!", "Don't worry, Gramps. I'll find Teddy!", "Please let me know if you do. It's important!", 'I need to find Wells right away! 
Do you know where he is?', 'I need to find Wells!!!', "I can't calm down. This is important!", 'Ugh. Fine.', 'Um... what did you want me to do again?', 'Head over to the Basketball Center.', 'Hopefully you can find some clues!', 'I love these photos of me and Teddy.', 'I should stay and look for clues!', 'Where should I go again?', 'You could try the archivist. Maybe he can help you find Wells!', 'Hi, Mrs. M.', "I don't need that right now.", 'Meetings are BORING!', "I feel like I'm forgetting something.", 'Gramps is the best historian ever!', 'This button never works!', "Why don't you go play with your grampa?", "Look at that! It's the bee's knees!", "Well, I did SOME of those. I just couldn't find them!", 'Did you do all of them?', 'No... because history is boring!', 'Hooray, a boring old shirt.', 'Hot Dog! I knew it!', 'Ooh, I like clues!', 'Hopefully you can rustle up some clues!', 'I got here and the whole place was ransacked!', 'Hold your horses, Jo.', '*grumble grumble*', 'And you are?', "I don't have time for kids.", 'Now if only I could read this thing. Blasted tiny letters...', 'Knew what?', 'Did you have a question or not?', 'Yes!', "You're still here? I'm trying to work!", 'Run along to the university.', 'Ooh, thanks!', 'Now I just need to find all the cleaners from wayyyy back in 1916.', 'Yikes... this could take a while.', 'Hi! *cough*', 'Can you help-', '*cough cough*', 'Can you help me-', '*COUGH COUGH COUGH*', 'Um, are you okay?', "Oh, I'm fine! Just a little hoarse.", 'Ha! What do you call a pony with a sore throat?', 'Huh?', 'A little horse!', "Ha! You're funny.", 'I got that one from my Gramps!', 'Can you help me? I need to find the owner of this slip.', "Yup, that's him!", "Unless you're too busy horsing around.", 'Ha! Good one.', "You look like you're on a mission.", 'Two missions, actually!', 'Oh my!', 'I need to find Wells right away!! Do you know where he is?', "Calm down, kid. 
I haven't seen him.", 'I should help Gramps clean.', "Maybe there's a clue in this mess!", "Poor Gramps! I should make sure he's okay.", 'The archivist said I should look in the stacks.', 'I should go talk to Gramps!', 'Yeah. Thanks anyway.', 'What are you waiting for? The Stacks are right outside the door.', 'Yes! This cool old slip from 1916.', 'Are you okay?', "Weren't you going to check out our microfiche?", "I'm sure you'll find Theodora in there somewhere!", 'Well? What are you still doing here?', 'So much cleaning to do...', 'I used to have a magnifying glass around here\\u00e2\\u20ac\\u00a6', "But I hear the museum's got one on the loose!", "Did you drop something, Dear? There's a card on the floor.", 'Take a look!', "It's such a nice fall day.", 'I should see what Gramps is up to!', 'I found it!', 'Theodora wearing the shirt!', 'You better get to the capitol!', 'Nice decorations.', 'Nice seeing you, Jolie!', 'Did you drop something, Dear?', 'Gramps said to look for clues. Better look around.', 'I should find out if she can help me!', 'Ooh, nice decorations!', 'The libarian said I could find some information on Youmans in here...', 'Have a look at the artifact!', 'I should ask the librarian why Wells was here.', "I wonder if there's a clue in those business cards...", 'Thanks. Did you figure out the shirt?', 'Welcome back, Jolie. Did you figure out the shirt?', 'I should check that logbook to see who owned this slip...', 'AND I know who took Teddy!', 'Who is Teddy?', "And where's your grampa?", 'Sorry for the delay, Boss.', 'I had some cleaning up to do in my office.', 'Mrs. M, I think Wells kidnapped Teddy.', "And he messed up Gramps's office, too!", 'One step at a time, Jo.', 'Did you figure out the shirt?', 'I knew you could do it, Jo!', 'Now can I tell you what happened to Teddy?', 'He needs our help!', "Sorry I'm late.", "Wells! Where's Teddy? 
Is he okay?", 'I figured out that you kidnapped him!', 'Easy, Jo.', "Why don't you prove your case?", "It'll be okay, Jo. We'll find Teddy!", 'Nice work on the shirt, Jolie!', 'Leopold, can you run back to the museum?', 'Sounds good, Boss.', 'Jo, meet me back at my office.', 'I hope you find your badger, kid.', 'Come on, Jo!', "Meet me back in my office and we'll get started!", 'Here I am!', 'Wells sabotaged Gramps!', 'AND he stole Teddy!']
l3_texts = ['undefined', 'Whatcha doing over there, Jo?', 'Just talking to Teddy.', 'I gotta run to my meeting!', 'Can I come, Gramps?', 'Sure thing, Jo. Grab your notebook and come upstairs!', 'See you later, Teddy.', "I get to go to Gramps's meeting!", 'Now where did I put my notebook?', '\\u00f0\\u0178\\u02dc\\u00b4', 'null', 'I love these photos of me and Teddy!', 'Found it!', 'Gramps is in trouble for losing papers?', "This can't be right!", 'Gramps is a great historian!', "Hmm. Button's still not working.", "Let's get started. The Wisconsin Wonders exhibit opens tomorrow!", 'Who wants to investigate the shirt artifact?', "Not Leopold here. He's been losing papers lately.", 'Hey!', "It's true, they do keep going missing lately.", 'See?', 'Besides, I already figured out the shirt.', "It's a women's basketball jersey!", 'That settles it.', 'Wells, finish up your report.', "Leopold, why don't you help me set up in the Capitol?", 'We need to talk about that missing paperwork.', 'Will do, Boss.', "Hey Jo, let's take a look at the shirt!", 'Your grampa is waiting for you in the collection room.', "Why don't you go catch up with your grampa?", 'What a fascinating artifact!', "Wow, that's so cool, Gramps!", 'Can I take a closer look?', "Hmmm. Shouldn't you be doing your homework?", "It's already all done!", 'Plus, my teacher said I could help you out for extra credit!', "Well, that's good enough for me.", 'Go ahead, take a peek at the shirt!', 'This looks like a clue!', "I'll record this in my notebook.", 'Find anything?', 'Yes! This old slip from 1916.', 'I knew it!', "I'm not so sure that this is a basketball jersey.", 'Wait, you mean Wells is wrong?!', 'Could be. But we need evidence!', "Why don't you head to the Basketball Center and rustle up some clues?", 'Sure!', "I'll be at the Capitol. 
Let me know if you find anything!", 'Better check back later.', "That's it!", "The slip is from 1916 but the team didn't start until 1974!", 'Our shirt is too old to be a basketball jersey!', 'I need to get to the Capitol and tell Gramps!', 'What are you still doing here,  Jolie?', 'Go find your grampa and get to work!', 'Oh no!', 'What happened here?!', "I don't know!", 'I got here and the whole place was a mess!', 'Can you help me tidy up?', "Teddy's scarf! Somebody must've taken him!", 'Try not to panic, Jo.', 'Maybe he just got scared and ran off.', 'But he never goes anywhere without his scarf!', "I think he's in trouble!", 'Is this your coffee, Gramps?', "Nope, that's from Bean Town. I only drink Holdgers!", "Who could've done this?", "It must've been Wells.", "He's always trying to get you in trouble, and he doesn't like animals!", 'Slow down, Jo.', 'But what if Wells kidnapped Teddy?', 'Then we need evidence.', "You're right, Gramps. Let's investigate!", "I'm afraid my papers have gone missing in this mess.", "You'll have to get started without me.", "Okay. I'll find Teddy!", "And I'll figure out the shirt, too.", 'I knew I could count on you, Jo!', "Why don't you go upstairs and see the archivist?", "He's our expert record keeper.", 'I need your help!', 'Who are you?', "I'm Leopold's grandkid!", "Sorry, I'm too busy for kids right now.", 'Now if only I could read this thing.', "Can't believe I lost my reading glasses.", 'I bet the archivist could use this!', "Ah, that's better!", 'Did you have a question?', 'Yes! I was wondering-', 'Wait a minute!', 'Where did you get that coffee?', "Oh, that's from Bean Town.", 'I ran into Wells there this morning.', 'Wells? I knew it!', 'Do you know anything about this slip?', 'I found it on an old shirt.', 'An old shirt? Try the university.', 'You can talk to a textile expert there.', "What's a textile expert?", 'They study clothes and fabric.', 'Great! 
Thanks for the help!', 'Head over to the university.', 'Hello there!', 'Wow! What is all this stuff?', "It's our Norwegian Craft exhibit!", 'Can I give you the tour?', "Sorry, I'm in a hurry.", 'Do you know what this slip is?', 'Looks like a dry cleaning receipt.', 'Thanks.', 'Now I Just need to find all the cleaners from way back in 1916.', 'Maybe I can help!', "I've got a stack of business cards from my favorite cleaners.", "Why don't you take a look?", 'This place was around in 1916! I can start there!', "You haven't seen any badgers around here, have you?", 'Badgers? No.', 'Okay. Thanks anyway.', 'Hi! How can I help you?', 'I need to find the owner of this slip.', "Well, I can't show our log books to just anybody.", 'Please?', "It's for Grampa Leo. He's a historian!", 'Leo... you mean Leopold?', 'Your gramps is awesome! Always full of stories.', "Guess it couldn't hurt to let you take a look.", "Here's the log book.", "It's a match!", 'Theodora Youmans must be the owner!', 'Do you know who Theodora Youmans is?', "Hmmm... not sure. Why don't you try the library?", 'Thanks for the help!', 'Oh, hello there!', 'How can I help you?', 'Have you seen a badger around here?', "I'm afraid not.", 'Please let me know if you do.', "I'm also looking for Theodora Youmans. Have you heard of her?", 'Theodora Youmans? Of course!', "Check out our microfiche. It's right through that door.", 'Youmans was a suffragist!', 'She helped get votes for women!', 'Wells! What was he doing here? I should ask the librarian.', 'What was Wells doing here?', 'He was looking for a taxidermist.', "What's a taxidermist?", 'Not sure. Here, let me look it up.', '\\Taxidermy: the art of preparing, stuffing, and mounting the skins of animals.\\', 'Oh no... Teddy!', 'Can you help me find Wells?', 'You could ask the archivist. He knows everybody!', "Jolie! I was hoping you'd stop by. Any news on the shirt artifact?", "I haven't quite figured it out just yet...", "Well, get on it. 
I'm counting on you and your gramps to figure this out!", 'Can you help me? I need to find Wells!', "I haven't seen him.", 'Please? This is really important.', "Sorry, can't help you.", 'Do you have any info on Theodora Youmans?', 'Theodora Youmans? Is that who owned the shirt?', 'Yep.', "Why didn't you say so?", 'Youmans was a suffragist here in Wisconsin.', 'She led marches and helped women get the right to vote!', "Wait a sec. Women couldn't vote?!", 'Nope. But Youmans and other suffragists worked hard to change that.', 'Thanks to them, Wisconsin was the first state to approve votes for women!', 'Wow!', "Here's a call number to find more info in the Stacks.", 'Where are the Stacks?', 'Right outside the door.', 'Hey, this is Youmans!', "And look! She's wearing the shirt!", 'I should go to the Capitol and tell everyone!', 'Jo!', 'Check out the next artifact!', 'What is it?', "I think it's a flag! Pretty interesting, huh?", "It's really cool, Gramps. But I'm worried about Teddy.", "He's still missing!", "We'll find him, Jo.", 'Want to look for more clues?', "We'll find Teddy.", 'We just have to keep our eyes open!', 'Hey, look at those scratches!', 'The kidnapper probably took Teddy on the elevator!', "You're right, Jo!", "Why isn't the button working?", "We'll need a key card.", 'I had one, but Teddy chewed it up.', "I've got Wells's ID!", 'What should we do next?', "I need to take the artifact upstairs. Why don't you investigate those scratch marks?", "Okay. I'll try.", 'Teddy, here I come!', 'I wonder whose glasses these are.', 'Teddy!!!', "Hang on. I'll get you out of there!", 'Whoever lost these glasses probably took Teddy!', 'How can I find out whose glasses these are?', 'Oh! There was a staff directory in the entryway!', "I'll go look at everyone's pictures!", 'Those are the same glasses!', "The archivist must've taken Teddy!", "Yes! 
It's the key for Teddy's cage!", 'I found the key!', "Come on, let's get out of here!", "Here's your scarf back!", 'What are you doing down here?', 'And how did that badger get free?', "I'm here to rescue my friend!", "What's going on here?", 'Thanks for coming, Boss.', 'I told you!', 'I captured a badger in our museum!', "He's been eating my lunch every day this week!", 'He has??', "I've seen him eating homework and important papers, too.", "Jolie- keep your badger under control, or he'll have to go.", 'And you, Frank-', "You can't just steal Jolie's pet.", 'Ugh. Fine.', 'Alright, Jolie. Back to work.', "Come on, Teddy. Let's go help Gramps!", "Let's go help Gramps!", 'Gramps must be up in the collection room.', "Let's go find him!", "Teddy! I'm glad to see you.", 'The archivist had him locked up!', 'Poor badger.', "You're becoming quite the detective, Jo.", 'Notice any clues about this flag?', 'Well... it looks hand-stitched.', 'Good catch!', 'Go on, tell the boss what you found!', "I'm telling you, Boss. Taxidermy is the way to go!", 'Nonsense. I want live animals at the exhibit, not stuffed ones.', "Ah, Jolie! I'm glad you're here.", "I'm putting you in charge of the flag case.", 'Make sure to get some old photos for the exhibit, like last time!', "Wait! Can't I do it?", 'The symbol on the flag looks sort of like a deer hoof.', 'It could be an early design for the Wisconsin state flag!', 'Wells, you already have a job to do.', 'What now, kid?', 'Do you really think that symbol is a deer hoof?', 'Not sure.', 'Do you know where I can find a deer expert?', 'Hmm. You could try the Aldo Leopold Wildlife Center.', 'I have to head over there and check out the animals.', "I'll ride with you!", "Come on, kid. Let's go.", 'Head over to the Wildlife Center!', "I'm sure they'll be able to help.", 'People sure drink a lot of coffee around here.', "I can't believe this.", 'Ugh...', 'Oh no! 
What happened to that crane?', 'Her beak is stuck in a coffee cup.', "It's lucky we found her.", 'Ugh! Those cups are all over the place.', "I need to get her free. She won't hold still!", 'Can Teddy and I help?', 'Sure! Give it a try.', 'Careful. That beak is sharp!', 'We need to calm her down, Teddy.', 'Any ideas?', '\\u00f0\\u0178\\u00a6\\u2014', 'Oh yeah, cranes eat insects!', 'Luckily there are tons of insects around here...', 'Got one!', "Maybe she'll let me take off the cup!", "It's OK, girl! Look, I found you a cricket!", 'You did it! Thanks, kid.', 'Can I help you with anything?', "I'm investigating this symbol.", 'Does it look like a deer hoof?', "There's a diagram of animal tracks over there.", 'Go take a look!', "That hoofprint doesn't match the flag!", 'Thanks for your help, kid!', "So? What'd you find out?", "Looks like it's not a deer hoof.", "Oh no. If I don't impress the boss soon,  I'm gonna get fired!", 'Hey, Wells...', 'I think I might be able to help you.', "No thanks. I don't need help from kids.", 'Are you sure? I know where you can find a real, live badger for the exhibit!', 'Wait! What?! Really?', 'Wells, meet Teddy.', '\\u00f0\\u0178\\u02dc\\u0160', "He says he'd be willing to help out.", 'Yes!!!', 'We still need to figure out that flag. Do you know anyone who could help?', "Hmm. Let's see...", 'Actually, I went to school with somebody who LOVES old flags.', "Why don't you go talk to her? I'll let her know you're coming.", 'Hey, nice dog! What breed is he?', "Actually, he's a badger.", "Oh, cool! I've never seen a badger in real life.", "You've got a million flags here!", "Yep. I'm a vexillophile!", "What's a vexillophile? ", 'It just means flag expert. How can I help?', "I'm investigating this flag.", 'Can you take a look?', "Hey, I've seen that symbol before! 
Check it out!", '\\Ecology flag, by Ron Cobb.\\', "It's an ecology flag!", 'Do you know what this flag was used for?', "I'm not sure.", "If I were you, I'd go to the library and do some digging.", 'Good idea. Thanks!', 'Welcome back, Dear! How can I help you?', 'I need to learn more about this flag!', 'It has something to do with ecology.', 'Hmm... those stripes remind me of the American flag.', 'Your flag must have been part of a national movement!', "Go check the microfiche. Maybe you'll find something!", "Hey! That's Governor Nelson in front of our flag!", 'I found the flag! Governor Nelson used it on the first Earth Day!', 'Wow! You figured it out!', 'Now I just need some old photos, like last time.', 'The boss is gonna love it!', 'You could try the archives.', 'Though the archivist might be too busy to help...', 'Okay. Thanks!', 'What are you doing here?', "We're looking for some photos.", "It's for the flag display!", 'Wait a minute...', "YOU'RE the new history detective everybody's talking about?", "Teddy's helping too.", 'What kind of photos do you need?', 'Something to do with ecology and Wisconsin.', "Here's a call number for the Stacks. Go find some photos.", 'Look at all those activists!', 'This is perfect for the exhibit.', 'I should go to the Capitol and tell Mrs. M!', 'I should see what Grampa is up to!', 'What should I do first?', 'Head upstairs and talk to the archivist. He might be able to help!', "It's locked!", "Jolie! I was hoping you'd stop by. Any news on the flag artifact?", "Well, get on it. I'm counting on you to figure this out!", 'Nice seeing you, Jolie!', "It's such a nice fall day.", 'I love these photos of me and Teddy.', "Why don't you go talk to the boss?", "She's right outside.", 'My friend is a flag expert.', 'She should be able to help you out.', 'There are some old newspapers loaded up in the microfiche.', 'The Stacks are right outside the door. Go find some photos!', 'Ugh. 
Meetings are so boring.', 'Grab your notebook and come upstairs!', 'Hang tight, Teddy.', "I'll hurry back and then we can go exploring!", 'Well, Leopold here is always losing papers...', 'Ha. Told you so!', 'Can we hurry up, Gramps?', 'Teddy and I were gonna go climb that huge tree out back!', "Hmmm. Don't forget about your homework.", 'Your teacher said you missed 7 assignments in a row!', 'So? History is boring!', 'I suppose historians are boring, too?', "No way, Gramps. You're the best!", 'Then do it for me!', 'Your teacher said you could help me for extra credit.', 'A boring old shirt.', 'Just this old slip from 1916.', 'Do I have to?', 'What the-', 'I have an idea.', "He's wrong about old shirts and his name rhymes with \\smells\\...", 'BUT WELLS STOLE TEDDY!', 'Could be. But we need evidence.', "Fine. Let's investigate!", "Don't worry, Gramps. I'll find Teddy!", "Please let me know if you do. It's important!", 'I need to find Wells right away! Do you know where he is?', 'I need to find Wells!!!', "I can't calm down. This is important!", "I don't have time for this, Gramps.", 'Teddy is still missing!', "Let's follow those scratch marks!", "I can't go with you. I need to take the artifact upstairs.", "It's okay, Gramps. I'll go by myself.", 'You stole Teddy! How could you?!', "No he hasn't!", "Yes, he has. I've seen him eating homework and important papers, too.", 'Come on, Teddy.', "Let's go find Gramps!", 'I think I can help with your animal problem.', "Ha! I don't need your help.", "Fine. Then I guess you don't want a real, live badger for the exhibit.", "Oh, trust me. He'll make time.", 'Um... what did you want me to do again?', 'Head over to the Basketball Center.', 'Hopefully you can find some clues!', 'I should stay and look for clues!', 'Where should I go again?', 'You could try the archivist. Maybe he can help you find Wells!', 'Hi, Mrs. M.', 'Head back to the museum. 
Your gramps is waiting for you.', "I don't need that right now.", 'Meetings are BORING!', "I feel like I'm forgetting something.", 'Gramps is the best historian ever!', 'This button never works!', "Why don't you go play with your grampa?", "Look at that! It's the bee's knees!", "Well, I did SOME of those. I just couldn't find them!", 'Did you do all of them?', 'No... because history is boring!', 'Hooray, a boring old shirt.', 'Hot Dog! I knew it!', 'Ooh, I like clues!', 'Hopefully you can rustle up some clues!', 'I got here and the whole place was ransacked!', 'Hold your horses, Jo.', '*grumble grumble*', 'And you are?', "I don't have time for kids.", 'Now if only I could read this thing. Blasted tiny letters...', 'Knew what?', 'Did you have a question or not?', 'Yes!', "You're still here? I'm trying to work!", 'Run along to the university.', 'Ooh, thanks!', 'Now I just need to find all the cleaners from wayyyy back in 1916.', 'Yikes... this could take a while.', 'Hi! *cough*', 'Can you help-', '*cough cough*', 'Can you help me-', '*COUGH COUGH COUGH*', 'Um, are you okay?', "Oh, I'm fine! Just a little hoarse.", 'Ha! What do you call a pony with a sore throat?', 'Huh?', 'A little horse!', "Ha! You're funny.", 'I got that one from my Gramps!', 'Can you help me? I need to find the owner of this slip.', "Yup, that's him!", "Unless you're too busy horsing around.", 'Ha! Good one.', "You look like you're on a mission.", 'Two missions, actually!', 'Oh my!', 'I need to find Wells right away!! Do you know where he is?', "Calm down, kid. I haven't seen him.", "I think it's a flag! Pretty spiffy, eh?", "Great Scott, you're right!", "Jo! I can't go with you. I need to take the artifact upstairs.", '\\u00f0\\u0178\\u02dc\\u00ad', '\\u00e2\\u009d\\u00a4\\u00ef\\u00b8\\u008f', 'GRRRRRRR', 'GAH! And what is THAT doing out of its cage?!', '\\u00f0\\u0178\\u02dc\\u0090', 'Teddy! 
Did you really eat his lunch?', "Did you steal Gramps's paperwork too?!", 'And my homework?!?!', 'See?!', "That thing's a monster!", "I don't have time for this.", 'YEAH!', 'Wait- me?', "You can't just steal Jolie's pet. Don't you know badgers are protected animals?", 'Besides, he looks friendly to me.', 'Wha?!', '\\u00f0\\u0178\\u02dc\\u009d', "Teddy! I'm sure glad to see you.", 'Gadzooks! Poor critter.', 'Aha! Good catch, Jo.', 'Not sure. Do I look like a deer expert to you?', 'Ugh. I have to head over there and check out the animals.', 'FINE. That possum better not scratch my leather seats...', "He's a badger!", '\\u00f0\\u0178\\u00a7\\u02dc', 'Yoga does sound nice.', "But cranes can't do yoga, Teddy!", '\\u00f0\\u0178\\u008d\\u00a9', "Cranes don't eat donuts!", 'Besides, you just ate my last snack.', "Gah. I can't believe this.", "I'm a historian, not a zookeeper!", 'And this place is dirty, and itchy, and-', 'I love it!', "Of course you do. You've got a rodent following you around.", "Actually, badgers aren't rodents-", 'Whatever.', 'Great. Just great. Could this day get any worse?!', "Yes!!! I'm saved!", 'A real, live ferret!', "He's. A. Badger.", 'And we still need to figure out that flag!', "Fine, fine. Let's see...", 'A vexy-wha?', 'Ooh... \\Ecology flag, by Ron Cobb.\\', 'The boss is gonna love it!!!', "You again! Don't let him hurt me!", '\\u00f0\\u0178\\u2122\\u201e', "Actually, we're just here for some photos.", 'Guess so!', 'YOU?!', "Just please, don't let your badger eat them!", 'I should help Gramps clean.', "Maybe there's a clue in this mess!", "Poor Gramps! I should make sure he's okay.", 'The archivist said I should look in the stacks.', 'There should be some info about that symbol in my book.', 'I should go talk to Gramps!', 'Yeah. Thanks anyway.', 'What are you waiting for? The Stacks are right outside the door.', 'Yes! This cool old slip from 1916.', 'Are you okay?', "I'll be in the collection room. 
Come find me when you're ready to check out the artifact.", 'Good luck!', 'What?!', 'Can I ride with you?', "Don't worry, he won't! (And he's a badger, by the way.)", 'Ugh... I think that lynx is looking at me funny.', "Don't worry, Teddy won't eat your lunch anymore!", "We're just looking for photos for the flag display.", "Weren't you going to check out our microfiche?", "I'm sure you'll find Theodora in there somewhere!", "But I hear the museum's got one on the loose!", 'Well? What are you still doing here?', 'So much cleaning to do...', 'I should check out that pair of glasses.', 'I should ask the librarian where to go next.', "Check out the archives. They've got tons of old photos!", 'I used to have a magnifying glass around here\\u00e2\\u20ac\\u00a6', "Come on, kid. You're slowing me down.", "Did you drop something, Dear? There's a card on the floor.", 'Take a look!', 'I should see what Gramps is up to!', 'I found it!', 'Theodora wearing the shirt!', 'You better get to the capitol!', 'Nice decorations.', 'Did you drop something, Dear?', 'Gramps said to look for clues. Better look around.', 'I should find out if she can help me!', 'Ooh, nice decorations!', 'The libarian said I could find some information on Youmans in here...', 'Have a look at the artifact!', 'What is it, Teddy?', 'Oh no... they got sick from polluted water?', 'Poor foxes!', 'I should ask the librarian why Wells was here.', "I wonder if there's a clue in those business cards...", 'Thanks. Did you figure out the shirt?', 'Jolie! Where have you been?', 'The exhibit opens tomorrow.', 'Welcome back, Jolie. Did you figure out the shirt?', 'Wells got in trouble for littering at the Wildlife Center.', 'I should check that logbook to see who owned this slip...', 'AND I know who took Teddy!', 'Who is Teddy?', "And where's your grampa?", 'Sorry for the delay, Boss.', 'I had some cleaning up to do in my office.', 'Mrs. 
M, I think Wells kidnapped Teddy.', "And he messed up Gramps's office, too!", 'One step at a time, Jo.', 'Did you figure out the shirt?', 'I knew you could do it, Jo!', 'Now can I tell you what happened to Teddy?', 'He needs our help!', "Sorry I'm late.", "Wells! Where's Teddy? Is he okay?", 'I figured out that you kidnapped him!', 'Easy, Jo.', "Why don't you prove your case?", "It'll be okay, Jo. We'll find Teddy!", 'Nice work on the shirt, Jolie!', 'Leopold, can you run back to the museum?', 'Sounds good, Boss.', 'Jo, meet me back at my office.', 'I hope you find your badger, kid.', 'Thanks!', "Are you going home now? Tomorrow's the big day!", 'He got a park named after him? Cool!', 'Come on, Jo!', "Meet me back in my office and we'll get started!", 'Here I am!', 'Wells sabotaged Gramps!', 'AND he stole Teddy!']

# Per-list lookup tables: dialogue text -> its position in the corresponding
# text list. If a text occurs more than once in a list, its last position wins.
l1_texts_map = {text: pos for pos, text in enumerate(l1_texts)}
l2_texts_map = {text: pos for pos, text in enumerate(l2_texts)}
l3_texts_map = {text: pos for pos, text in enumerate(l3_texts)}

# The lookup tables keyed by level-group label.
texts_map = {
    '0-4': l1_texts_map,
    '5-12': l2_texts_map,
    '13-22': l3_texts_map,
}

def map_level_group(q):
    """Map a question number to its level-group label.

    Returns '0-4' when q is below 4, '5-12' when q lies in 4..13 (both ends
    included), and '13-22' for everything else.
    """
    if q < 4:
        return '0-4'
    if 4 <= q <= 13:
        return '5-12'
    return '13-22'
    
def _pivot_join(df, grouped, key, value, suffix, label=None):
    """Pivot per-(session, key) rows to one column per key value and left-join onto df.

    Args:
        df: Frame with a 'session' column that receives the new columns.
        grouped: Frame with one row per (session, key) pair that holds `value`.
        key: Column of `grouped` whose distinct values become the new columns.
        value: Column of `grouped` that fills the pivoted cells.
        suffix: Text appended to every generated column name.
        label: Optional callable turning a pivoted column name into the token
            used inside the feature name; the pivoted name is used when None.

    Returns:
        `df` left-joined on 'session' with columns named f'{key}={token}{suffix}'.
    """
    wide = grouped.pivot(index=['session'], columns=key, values=value, aggregate_function='sum')
    if label is None:
        names = [f'{key}={f}{suffix}' for f in wide.columns[1:]]
    else:
        names = [f'{key}={label(f)}{suffix}' for f in wide.columns[1:]]
    wide.columns = ['session'] + names
    return df.join(wide, on=['session'], how='left')


def add_features(df, logs, level_limit, texts_map):
    """Build per-session aggregate features from event logs and join them onto df.

    Only log rows with level <= level_limit are used. Every feature frame is
    left-joined onto `df` on 'session', in a fixed order, so the column order
    of the result follows the order of the sections below.

    Args:
        df: Frame with a 'session' column; one output row per row of df.
        logs: Event-level frame. The code reads the columns session, level,
            index, elapsed_time, room_coor_x/y, screen_coor_x/y, event_name,
            name, page, hover_duration, text, fqid, room_fqid and text_fqid.
        level_limit: Upper bound (inclusive) on 'level'; when it lies in
            5..12 an extra block of per-key event-count statistics is added.
        texts_map: Mapping looked up with `.get(text, text)` to replace a
            dialogue text by its id inside 'text=...' feature names
            (presumably one of the per-level-group text maps - confirm with caller).

    Returns:
        `df` with all feature columns appended.
    """
    columns = [
        (pl.col('elapsed_time') - pl.col('elapsed_time').shift(1)).over(['session']).alias('elapsed_time_diff1'),
        (pl.col('elapsed_time') - pl.col('elapsed_time').shift(2)).over(['session']).alias('elapsed_time_diff2'),
        (pl.col('elapsed_time').shift(-1) - pl.col('elapsed_time')).over(['session']).alias('elapsed_time_diff3'),
        ((pl.col('room_coor_x') - pl.col('room_coor_x').shift(1))
         .over(['session']) ** 2
         + (pl.col('room_coor_y') - pl.col('room_coor_y').shift(1))
         .over(['session']) ** 2).sqrt().alias('room_dist'),

        ((pl.col('screen_coor_x') - pl.col('screen_coor_x').shift(1))
         .over(['session']) ** 2
         + (pl.col('screen_coor_y') - pl.col('screen_coor_y').shift(1))
         .over(['session']) ** 2).sqrt().alias('screen_dist'),
        (pl.col('index')/(pl.col('index').max().over(['session'])-pl.col('index').min().over(['session']))).alias('index_ratio'),
        (pl.col('elapsed_time').max().over(['session'])).alias('end_time')
    ]
    tmp_logs = logs.filter(pl.col('level')<=level_limit)
    tmp_logs = tmp_logs.with_columns(columns)

    base_keys = ['level', 'room_fqid', 'fqid', 'text_fqid', 'event_name']
    keys_with_text = base_keys + ['text']
    fqid_keys = ['room_fqid', 'fqid', 'text_fqid']
    nuniq_cols = ['event_name', 'name', 'level', 'page', 'room_coor_x',
                  'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration', 'text',
                  'fqid', 'room_fqid', 'text_fqid']

    def text_token(raw):
        # Dialogue lines are replaced by their id; unknown lines keep the raw text.
        return texts_map.get(raw, raw)

    def join_diff1_stat(frame, keys, stat, suffix):
        # One column per key value holding `stat` of elapsed_time_diff1 within (session, key).
        for key in keys:
            gp = tmp_logs.groupby(['session', key], maintain_order=True).agg([stat(pl.col('elapsed_time_diff1'))])
            frame = _pivot_join(frame, gp, key, 'elapsed_time_diff1', suffix,
                                label=text_token if key == 'text' else None)
        return frame

    # --- session-level statistics of the time between consecutive events ---
    diff1 = pl.col('elapsed_time_diff1')
    quantile_levels = (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    gp = tmp_logs.groupby(['session'], maintain_order=True).agg([
        pl.col('elapsed_time').first().suffix('_min'),
        pl.col('elapsed_time').last().suffix('_max'),
        diff1.count().alias('event_cnt'),
        diff1.n_unique().suffix('_nuniq'),
        diff1.mean().suffix('_mean'),
        diff1.max().suffix('_max'),
        diff1.min().suffix('_min'),
        diff1.std().suffix('_std'),
        diff1.median().suffix('_med'),
        diff1.sum().suffix('_sum'),
        diff1.first().suffix('_first'),
        diff1.last().suffix('_last'),
        *[diff1.quantile(q, 'nearest').suffix(f'_q{n}')
          for n, q in enumerate(quantile_levels, start=1)],
    ])
    gp = gp.with_columns((pl.col('elapsed_time_max')-pl.col('elapsed_time_min')).alias('level_group_timedelta'))
    # NOTE(review): the previous code called gp.drop(['elapsed_time_max',
    # 'elapsed_time_min']) here but discarded the returned frame, so nothing
    # was dropped. Both columns are therefore part of the output and are kept
    # on purpose so the returned feature set does not change; drop them only
    # after checking that no downstream feature list references them.
    df = df.join(gp, on=['session'], how='left')

    # --- number of distinct values per session ---
    gp = tmp_logs.groupby(['session'], maintain_order=True).agg(
        [pl.col(col).n_unique().suffix('_nuniq') for col in nuniq_cols])
    df = df.join(gp, on=['session'], how='left')

    # --- elapsed time between first and last event of every key value ---
    for key in base_keys:
        gp = tmp_logs.groupby(['session', key], maintain_order=True).agg([
            pl.col('elapsed_time').first().alias('elapsed_time_min'),
            pl.col('elapsed_time').last().alias('elapsed_time_max')])
        gp = gp.with_columns((pl.col('elapsed_time_max')-pl.col('elapsed_time_min')).alias(f'{key}_timedelta'))
        df = _pivot_join(df, gp, key, f'{key}_timedelta', '_timedelta')

    # --- mean and count of elapsed_time_diff1 per key value ---
    df = join_diff1_stat(df, keys_with_text, lambda e: e.mean(), '_elapsed_time_diff1')
    df = join_diff1_stat(df, keys_with_text, lambda e: e.count(), '_cnt')

    # --- span of the event index covered by every key value ---
    for key in base_keys:
        gp = tmp_logs.groupby(['session', key], maintain_order=True).agg([
            pl.col('index').min().alias('index_min'),
            pl.col('index').max().alias('index_max')])
        gp = gp.with_columns((pl.col('index_max')-pl.col('index_min')).alias(f'{key}_index_gap'))
        df = _pivot_join(df, gp, key, f'{key}_index_gap', '_index_gap')

    # --- number of distinct event names per key value ---
    for key in fqid_keys:
        gp = tmp_logs.groupby(['session', key], maintain_order=True).agg([pl.col('event_name').n_unique()])
        df = _pivot_join(df, gp, key, 'event_name', '_event_name_nuniq')

    # --- statistics over the per-key event counts (only for limits 5..12) ---
    if 5 <= level_limit <= 12:
        for key in fqid_keys:
            cnt = pl.col('event_name')
            gp = tmp_logs.groupby(['session', key], maintain_order=True).agg([pl.col('event_name').count()])
            gp = gp.groupby(['session'], maintain_order=True).agg([
                cnt.mean().suffix(f'_{key}_cnt_mean'),
                cnt.max().suffix(f'_{key}_cnt_max'),
                cnt.min().suffix(f'_{key}_cnt_min'),
                cnt.std().suffix(f'_{key}_cnt_std'),
                cnt.median().suffix(f'_{key}_cnt_med'),
                cnt.sum().suffix(f'_{key}_cnt_sum'),
                cnt.first().suffix(f'_{key}_cnt_first'),
                cnt.last().suffix(f'_{key}_cnt_last'),
            ])
            df = df.join(gp, on=['session'], how='left')

    # --- distinct values within the last `window` seconds of the session ---
    # elapsed_time is compared against end_time - window*1000, i.e. the
    # window is given in seconds and scaled by 1000 for the comparison.
    for window in [5, 10, 15, 30]:
        agg_list = [pl.col(f).n_unique().suffix(f'_nuniq_{window}sec') for f in nuniq_cols]
        tmp = tmp_logs.filter(pl.col('elapsed_time')>pl.col('end_time')-window*1000)
        gp = tmp.groupby(['session'], maintain_order=True).agg(agg_list)
        df = df.join(gp, on=['session'], how='left')

    # --- distinct values of each column per room ---
    # NOTE(review): for col == 'event_name' the generated names
    # ('room_fqid=<room>_event_name_nuniq') equal those already joined by the
    # "distinct event names per key value" section above, so the join has to
    # disambiguate them (polars appends its join suffix, '_right' by default -
    # confirm for the installed version). Left unchanged to keep column names stable.
    for col in ['event_name', 'name', 'page', 'room_coor_x',
                'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration', 'text',
                'fqid', 'text_fqid']:
        gp = tmp_logs.groupby(['session', 'room_fqid'], maintain_order=True).agg([pl.col(col).n_unique()])
        df = _pivot_join(df, gp, 'room_fqid', col, f'_{col}_nuniq')

    # --- last, std and max of elapsed_time_diff1 per key value ---
    df = join_diff1_stat(df, ['event_name'], lambda e: e.last(), '_elapsed_time_diff1_last')
    df = join_diff1_stat(df, keys_with_text, lambda e: e.std(), '_elapsed_time_diff1_std')
    df = join_diff1_stat(df, keys_with_text, lambda e: e.max(), '_elapsed_time_diff1_max')

    return df

In [4]:
# Every derived column below is evaluated within one (session_id, level_group) window.
_WINDOW_KEYS = ["session_id", "level_group"]


def _step(name, periods):
    """Return the expression `column - column.shift(periods)` for column `name`."""
    return pl.col(name) - pl.col(name).shift(periods)


columns = [
    pl.col("page").cast(pl.Float32),
    # Difference to the previous row; nulls become 0, result clipped to [0, 1e9].
    _step("elapsed_time", 1)
    .fill_null(0)
    .clip(0, 1e9)
    .over(_WINDOW_KEYS)
    .alias("elapsed_time_diff"),
    # Absolute difference to the next row; nulls become 0, result clipped to [0, 1e9].
    _step("elapsed_time", -1)
    .abs()
    .fill_null(0)
    .clip(0, 1e9)
    .over(_WINDOW_KEYS)
    .alias("elapsed_time_diff2"),
    # Absolute screen-x change relative to the previous / the next row.
    _step("screen_coor_x", 1).abs().over(_WINDOW_KEYS).alias("location_x_diff"),
    _step("screen_coor_x", -1).abs().over(_WINDOW_KEYS).alias("location_x_diff2"),
    # Absolute screen-y change relative to the previous row.
    _step("screen_coor_y", 1).abs().over(_WINDOW_KEYS).alias("location_y_diff"),
]

# Column-name groups (presumably categorical vs. numeric log columns - usage
# is outside this part of the file).
CATS = [
    'event_name',
    'name',
    'fqid',
    'room_fqid',
    'text_fqid',
    'text',
]
NUMS = [
    'page',
    'room_coor_x',
    'room_coor_y',
    'screen_coor_x',
    'screen_coor_y',
    'hover_duration',
    'elapsed_time_diff',
]

# Fixed value lists for the 'name' and 'event_name' columns.
name_feature = [
    'basic',
    'undefined',
    'close',
    'open',
    'prev',
    'next',
]
event_name_feature = [
    'cutscene_click',
    'person_click',
    'navigate_click',
    'observation_click',
    'notification_click',
    'object_click',
    'object_hover',
    'map_hover',
    'map_click',
    'checkpoint',
    'notebook_click',
]

# Fixed list of fqid values, written out literally across two physical lines.
# NOTE(review): presumably the vocabulary used to build per-fqid features; the
# code that consumes it is outside this part of the file - confirm before reordering.
fqid_lists = ['worker', 'archivist', 'gramps', 'wells', 'toentry', 'confrontation', 'crane_ranger', 'groupconvo', 'flag_girl', 'tomap', 'tostacks', 'tobasement', 'archivist_glasses', 'boss', 'journals', 'seescratches', 'groupconvo_flag', 'cs', 'teddy', 'expert', 'businesscards', 'ch3start', 'tunic.historicalsociety', 'tofrontdesk', 'savedteddy', 'plaque', 'glasses', 'tunic.drycleaner', 'reader_flag', 'tunic.library', 'tracks', 'tunic.capitol_2', 'trigger_scarf', 'reader', 'directory', 'tunic.capitol_1', 'journals.pic_0.next', 'unlockdoor', 'tunic', 'what_happened', 'tunic.kohlcenter', 'tunic.humanecology', 'colorbook', 'logbook', 'businesscards.card_0.next', 'journals.hub.topics', 'logbook.page.bingo', 'journals.pic_1.next', 'journals_flag', 'reader.paper0.next', 'tracks.hub.deer', 'reader_flag.paper0.next', 'trigger_coffee', 'wellsbadge', 'journals.pic_2.next', 'tomicrofiche', 'journals_flag.pic_0.bingo', 'plaque.face.date', 'notebook', 'tocloset_dirty', 'businesscards.card_bingo.bingo', 'businesscards.card_1.next', 'tunic.wildlife', 'tunic.hub.slip', 'tocage', 'journals.pic_2.bingo', 'tocollectionflag', 'tocollection', 'chap4_finale_c', 'chap2_finale_c', 'lockeddoor', 'journals_flag.hub.topics', 'tunic.capitol_0', 'reader_flag.paper2.bingo', 'photo', 'tunic.flaghouse', 'reader.paper1.next', 'directory.closeup.archivist', 'intro', 'businesscards.card_bingo.next', 'reader.paper2.bingo', 'retirement_letter', 'remove_cup', 'journals_flag.pic_0.next', 'magnify', 'coffee', 'key', 'togrampa', 'reader_flag.paper1.next', 'janitor', 'tohallway', 'chap1_finale', 'report', 'outtolunch', 'journals_flag.hub.topics_old', 'journals_flag.pic_1.next', 'reader.paper2.next', 'chap1_finale_c', 'reader_flag.paper2.next', 'door_block_talk', 'journals_flag.pic_1.bingo', 'journals_flag.pic_2.next', 'journals_flag.pic_2.bingo', 'block_magnify', 'reader.paper0.prev', 'block', 'reader_flag.paper0.prev', 'block_0', 'door_block_clean', 'reader.paper2.prev', 'reader.paper1.prev', 'doorblock', 
'tocloset', 'reader_flag.paper2.prev', 'reader_flag.paper1.prev', 'block_tomap2', 'journals_flag.pic_0_old.next', 'journals_flag.pic_1_old.next', 'block_tocollection', 'block_nelson', 'journals_flag.pic_2_old.next', 'block_tomap1', 'block_badge', 'need_glasses', 'block_badge_2', 'fox', 'block_1']
# `text_fqid` values for which feature_engineer_pl builds per-session features:
# a row count plus elapsed-time and location-diff aggregates per value.
text_lists = ['tunic.historicalsociety.cage.confrontation', 'tunic.wildlife.center.crane_ranger.crane', 'tunic.historicalsociety.frontdesk.archivist.newspaper', 'tunic.historicalsociety.entry.groupconvo', 'tunic.wildlife.center.wells.nodeer', 'tunic.historicalsociety.frontdesk.archivist.have_glass', 'tunic.drycleaner.frontdesk.worker.hub', 'tunic.historicalsociety.closet_dirty.gramps.news', 'tunic.humanecology.frontdesk.worker.intro', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation', 'tunic.historicalsociety.basement.seescratches', 'tunic.historicalsociety.collection.cs', 'tunic.flaghouse.entry.flag_girl.hello', 'tunic.historicalsociety.collection.gramps.found', 'tunic.historicalsociety.basement.ch3start', 'tunic.historicalsociety.entry.groupconvo_flag', 'tunic.library.frontdesk.worker.hello', 'tunic.library.frontdesk.worker.wells', 'tunic.historicalsociety.collection_flag.gramps.flag', 'tunic.historicalsociety.basement.savedteddy', 'tunic.library.frontdesk.worker.nelson', 'tunic.wildlife.center.expert.removed_cup', 'tunic.library.frontdesk.worker.flag', 'tunic.historicalsociety.frontdesk.archivist.hello', 'tunic.historicalsociety.closet.gramps.intro_0_cs_0', 'tunic.historicalsociety.entry.boss.flag', 'tunic.flaghouse.entry.flag_girl.symbol', 'tunic.historicalsociety.closet_dirty.trigger_scarf', 'tunic.drycleaner.frontdesk.worker.done', 'tunic.historicalsociety.closet_dirty.what_happened', 'tunic.wildlife.center.wells.animals', 'tunic.historicalsociety.closet.teddy.intro_0_cs_0', 'tunic.historicalsociety.cage.glasses.afterteddy', 'tunic.historicalsociety.cage.teddy.trapped', 'tunic.historicalsociety.cage.unlockdoor', 'tunic.historicalsociety.stacks.journals.pic_2.bingo', 'tunic.historicalsociety.entry.wells.flag', 'tunic.humanecology.frontdesk.worker.badger', 'tunic.historicalsociety.stacks.journals_flag.pic_0.bingo', 'tunic.historicalsociety.closet.intro', 'tunic.historicalsociety.closet.retirement_letter.hub', 
'tunic.historicalsociety.entry.directory.closeup.archivist', 'tunic.historicalsociety.collection.tunic.slip', 'tunic.kohlcenter.halloffame.plaque.face.date', 'tunic.historicalsociety.closet_dirty.trigger_coffee', 'tunic.drycleaner.frontdesk.logbook.page.bingo', 'tunic.library.microfiche.reader.paper2.bingo', 'tunic.kohlcenter.halloffame.togrampa', 'tunic.capitol_2.hall.boss.haveyougotit', 'tunic.wildlife.center.wells.nodeer_recap', 'tunic.historicalsociety.cage.glasses.beforeteddy', 'tunic.historicalsociety.closet_dirty.gramps.helpclean', 'tunic.wildlife.center.expert.recap', 'tunic.historicalsociety.frontdesk.archivist.have_glass_recap', 'tunic.historicalsociety.stacks.journals_flag.pic_1.bingo', 'tunic.historicalsociety.cage.lockeddoor', 'tunic.historicalsociety.stacks.journals_flag.pic_2.bingo', 'tunic.historicalsociety.collection.gramps.lost', 'tunic.historicalsociety.closet.notebook', 'tunic.historicalsociety.frontdesk.magnify', 'tunic.humanecology.frontdesk.businesscards.card_bingo.bingo', 'tunic.wildlife.center.remove_cup', 'tunic.library.frontdesk.wellsbadge.hub', 'tunic.wildlife.center.tracks.hub.deer', 'tunic.historicalsociety.frontdesk.key', 'tunic.library.microfiche.reader_flag.paper2.bingo', 'tunic.flaghouse.entry.colorbook', 'tunic.wildlife.center.coffee', 'tunic.capitol_1.hall.boss.haveyougotit', 'tunic.historicalsociety.basement.janitor', 'tunic.historicalsociety.collection_flag.gramps.recap', 'tunic.wildlife.center.wells.animals2', 'tunic.flaghouse.entry.flag_girl.symbol_recap', 'tunic.historicalsociety.closet_dirty.photo', 'tunic.historicalsociety.stacks.outtolunch', 'tunic.library.frontdesk.worker.wells_recap', 'tunic.historicalsociety.frontdesk.archivist_glasses.confrontation_recap', 'tunic.capitol_0.hall.boss.talktogramps', 'tunic.historicalsociety.closet.photo', 'tunic.historicalsociety.collection.tunic', 'tunic.historicalsociety.closet.teddy.intro_0_cs_5', 'tunic.historicalsociety.closet_dirty.gramps.archivist', 
'tunic.historicalsociety.closet_dirty.door_block_talk', 'tunic.historicalsociety.entry.boss.flag_recap', 'tunic.historicalsociety.frontdesk.archivist.need_glass_0', 'tunic.historicalsociety.entry.wells.talktogramps', 'tunic.historicalsociety.frontdesk.block_magnify', 'tunic.historicalsociety.frontdesk.archivist.foundtheodora', 'tunic.historicalsociety.closet_dirty.gramps.nothing', 'tunic.historicalsociety.closet_dirty.door_block_clean', 'tunic.capitol_1.hall.boss.writeitup', 'tunic.library.frontdesk.worker.nelson_recap', 'tunic.library.frontdesk.worker.hello_short', 'tunic.historicalsociety.stacks.block', 'tunic.historicalsociety.frontdesk.archivist.need_glass_1', 'tunic.historicalsociety.entry.boss.talktogramps', 'tunic.historicalsociety.frontdesk.archivist.newspaper_recap', 'tunic.historicalsociety.entry.wells.flag_recap', 'tunic.drycleaner.frontdesk.worker.done2', 'tunic.library.frontdesk.worker.flag_recap', 'tunic.humanecology.frontdesk.block_0', 'tunic.library.frontdesk.worker.preflag', 'tunic.historicalsociety.basement.gramps.seeyalater', 'tunic.flaghouse.entry.flag_girl.hello_recap', 'tunic.historicalsociety.closet.doorblock', 'tunic.drycleaner.frontdesk.worker.takealook', 'tunic.historicalsociety.basement.gramps.whatdo', 'tunic.library.frontdesk.worker.droppedbadge', 'tunic.historicalsociety.entry.block_tomap2', 'tunic.library.frontdesk.block_nelson', 'tunic.library.microfiche.block_0', 'tunic.historicalsociety.entry.block_tocollection', 'tunic.historicalsociety.entry.block_tomap1', 'tunic.historicalsociety.collection.gramps.look_0', 'tunic.library.frontdesk.block_badge', 'tunic.historicalsociety.cage.need_glasses', 'tunic.library.frontdesk.block_badge_2', 'tunic.kohlcenter.halloffame.block_0', 'tunic.capitol_0.hall.chap1_finale_c', 'tunic.capitol_1.hall.chap2_finale_c', 'tunic.capitol_2.hall.chap4_finale_c', 'tunic.wildlife.center.fox.concern', 'tunic.drycleaner.frontdesk.block_0', 'tunic.historicalsociety.entry.gramps.hub', 
'tunic.humanecology.frontdesk.block_1', 'tunic.drycleaner.frontdesk.block_1']
# `room_fqid` values for which feature_engineer_pl builds per-session features:
# a row count plus elapsed-time and location-diff aggregates per value.
room_lists = ['tunic.historicalsociety.entry', 'tunic.wildlife.center', 'tunic.historicalsociety.cage', 'tunic.library.frontdesk', 'tunic.historicalsociety.frontdesk', 'tunic.historicalsociety.stacks', 'tunic.historicalsociety.closet_dirty', 'tunic.humanecology.frontdesk', 'tunic.historicalsociety.basement', 'tunic.kohlcenter.halloffame', 'tunic.library.microfiche', 'tunic.drycleaner.frontdesk', 'tunic.historicalsociety.collection', 'tunic.historicalsociety.closet', 'tunic.flaghouse.entry', 'tunic.historicalsociety.collection_flag', 'tunic.capitol_1.hall', 'tunic.capitol_0.hall', 'tunic.capitol_2.hall']

def feature_engineer_pl(x, grp, use_extra, feature_suffix):
    """Aggregate an event log into one feature row per session.

    Args:
        x: polars DataFrame of events. The expressions read ``session_id``,
            every column named in ``NUMS``, the key columns ``event_name``,
            ``name``, ``room_fqid``, ``fqid`` and ``text_fqid``, and the value
            columns ``elapsed_time_diff``, ``elapsed_time_diff2``,
            ``location_x_diff``, ``location_x_diff2`` and ``location_y_diff``.
            ``level_group`` and ``elapsed_time`` are read only when
            ``use_extra`` is truthy.
        grp: Not referenced by the body; grouping is always on ``session_id``.
        use_extra: When truthy, add ``time_group_diff1`` / ``time_group_diff2``
            (elapsed time between the last event of one level group and the
            first event of the next) for the level groups present in ``x``.
        feature_suffix: Text appended to the aggregate column names. The two
            ``time_group_diff*`` columns do not receive it.

    Returns:
        polars DataFrame with one row per session, sorted by ``session_id``.
    """
    # (quantile, digit used in the column name)
    quantile_tags = ((0.1, 1), (0.2, 2), (0.4, 4), (0.6, 6), (0.8, 8), (0.9, 9))

    def subset(value_col, key_col, key):
        # Values of `value_col` on the rows where `key_col` equals `key`.
        return pl.col(value_col).filter(pl.col(key_col) == key)

    # NOTE: the alias spellings below (including the irregular separators such
    # as "_counts{suffix}" and "_LX_mean_x{suffix}") are kept exactly as they
    # were; presumably the trained models reference these column names.
    exprs = []

    # ---- distribution statistics of every numeric column ----
    for q, tag in quantile_tags:
        exprs.extend(
            pl.col(c).quantile(q, "nearest").alias(f"{c}_quantile{tag}_{feature_suffix}")
            for c in NUMS
        )
    for stat in ("mean", "min", "max"):
        exprs.extend(
            getattr(pl.col(c), stat)().alias(f"{c}_{stat}_{feature_suffix}")
            for c in NUMS
        )

    # ---- per event_name: counts, elapsed-time quantiles, then statistics ----
    exprs.extend(
        subset("event_name", "event_name", c).count().alias(f"{c}_event_name_counts{feature_suffix}")
        for c in event_name_feature
    )
    for q, tag in quantile_tags:
        exprs.extend(
            subset("elapsed_time_diff", "event_name", c)
            .quantile(q, "nearest")
            .alias(f"{c}_ET_quantile{tag}_{feature_suffix}")
            for c in event_name_feature
        )
    # (value column, reduction, name fragment placed between "{c}_" and the suffix)
    event_stats = (
        ("elapsed_time_diff", "mean", "ET_mean_"),
        ("elapsed_time_diff", "max", "ET_max_"),
        ("elapsed_time_diff", "min", "ET_min_"),
        ("elapsed_time_diff", "sum", "ET_sum_"),
        ("elapsed_time_diff2", "sum", "ET2_sum_"),
        ("location_x_diff", "mean", "LX_mean_x"),
        ("location_x_diff", "max", "LX_max_x"),
        ("location_x_diff", "min", "LX_min_x"),
        ("location_x_diff", "sum", "LX_sum_x"),
        ("location_x_diff2", "sum", "LX2_sum_x"),
        ("location_y_diff", "sum", "LY_sum_x"),
    )
    for value_col, stat, tag in event_stats:
        exprs.extend(
            getattr(subset(value_col, "event_name", c), stat)().alias(f"{c}_{tag}{feature_suffix}")
            for c in event_name_feature
        )

    # ---- per name / room_fqid / fqid / text_fqid: counts, then statistics ----
    keyed_stats = (
        ("elapsed_time_diff", "mean", "ET_mean_"),
        ("elapsed_time_diff", "max", "ET_max_"),
        ("elapsed_time_diff", "min", "ET_min_"),
        ("elapsed_time_diff", "sum", "ET_sum_"),
        ("elapsed_time_diff2", "sum", "ET2_sum_"),
        ("location_x_diff", "sum", "LX_sum_"),
        ("location_x_diff2", "sum", "LX2_sum_"),
        ("location_y_diff", "sum", "LY_sum_"),
    )
    keyed_groups = (
        ("name", name_feature),
        ("room_fqid", room_lists),
        ("fqid", fqid_lists),
        ("text_fqid", text_lists),
    )
    for key_col, keys in keyed_groups:
        exprs.extend(
            subset(key_col, key_col, c).count().alias(f"{c}_{key_col}_counts{feature_suffix}")
            for c in keys
        )
        for value_col, stat, tag in keyed_stats:
            exprs.extend(
                getattr(subset(value_col, key_col, c), stat)().alias(f"{c}_{tag}{feature_suffix}")
                for c in keys
            )

    df = x.groupby(["session_id"], maintain_order=True).agg(exprs).sort("session_id")

    if use_extra:
        # (level group that starts, level group that ended before it, output column)
        transitions = (
            ("5-12", "0-4", "time_group_diff1"),
            ("13-22", "5-12", "time_group_diff2"),
        )
        for later, earlier, out_name in transitions:
            if later in x["level_group"].unique():
                gap = (
                    pl.col("elapsed_time").filter(pl.col("level_group") == later).first()
                    - pl.col("elapsed_time").filter(pl.col("level_group") == earlier).last()
                ).alias(out_name)
                tmp = x.groupby(["session_id"], maintain_order=True).agg([gap]).sort("session_id")
                df = df.join(tmp, on="session_id", how='left')

    return df

In [5]:
%%writefile models.py
import torch.nn as nn
import torch
import torch.nn.functional as F
import math 
# This code defines a Transformer block and a model that combines Transformer and LSTM blocks for sequence processing. The model takes both dense and categorical features as input and produces an output based on different modes. The Transformer block is used to capture the dependencies within the sequences, and the LSTM block is used for further sequence processing.
class TransformerBlock(nn.Module):
    """Transformer encoder block whose attention scores can be chained.

    The input of width ``emb_s * head_cnt`` is viewed as ``head_cnt`` slices of
    size ``emb_s``; a single ``kqv`` projection is applied to every slice.
    The raw (pre-softmax) attention scores are returned next to the output so
    a following block can add them to its own scores via ``prev``.
    Both residual branches are scaled by learnable scalars initialised to 0.1
    and followed by LayerNorm.
    """

    def __init__(self, emb_s=32, head_cnt=8, dp1=0.1, dp2=0.1):
        super().__init__()
        width = emb_s * head_cnt  # full feature width of the block

        # Attention: per-head key/query/value projection, dropout, output projection.
        self.kqv = nn.Linear(emb_s, 3 * emb_s, bias=False)
        self.dp = nn.Dropout(dp1)
        self.proj = nn.Linear(width, width, bias=False)
        self.head_cnt = head_cnt
        self.emb_s = emb_s

        # One LayerNorm after each residual branch.
        self.ln1 = nn.LayerNorm(width)
        self.ln2 = nn.LayerNorm(width)

        # Learnable weights of the attention / feed-forward residual branches.
        self.alpha1 = nn.Parameter(torch.ones(1) * 0.1, requires_grad=True)
        self.alpha2 = nn.Parameter(torch.ones(1) * 0.1, requires_grad=True)

        # Position-wise feed-forward network with a 4x hidden expansion.
        self.ff = nn.Sequential(
            nn.Linear(width, 4 * width),
            nn.ReLU(),
            nn.Linear(4 * width, width),
            nn.Dropout(dp2),
        )

    def resmha(self, x, prev=None, mask=None):
        """Multi-head self-attention with optional residual scores.

        Args:
            x: Tensor of shape (batch, steps, emb_s * head_cnt).
            prev: Optional scores from an earlier block, added to this block's
                scores before masking.
            mask: Optional tensor; positions equal to 0 are filled with -1e9
                along the key axis (it is broadcast as
                ``mask.unsqueeze(1).unsqueeze(-1)``).

        Returns:
            Tuple of the projected attention output (after dropout) and the
            pre-softmax scores of shape (batch, steps, steps, head_cnt).
        """
        batch, steps, _ = x.shape
        per_head = x.reshape(batch, steps, self.head_cnt, self.emb_s)
        k, q, v = torch.split(self.kqv(per_head), self.emb_s, dim=-1)

        # Scaled dot product; axes are (batch, query, key, head).
        scores = torch.einsum('bihk,bjhk->bijh', q, k) / self.emb_s ** 0.5
        if prev is not None:
            scores = scores + prev
        if mask is not None:
            scores = scores.masked_fill(mask.unsqueeze(1).unsqueeze(-1) == 0, -1e9)

        # Normalise over the key axis and mix the values.
        weights = F.softmax(scores, dim=2)
        mixed = torch.einsum('btih,bihs->bths', weights, v).reshape(batch, steps, -1)

        return self.dp(self.proj(mixed)), scores

    def forward(self, x, prev=None, mask=None):
        """Run attention and feed-forward, each as a scaled residual + LayerNorm.

        Returns:
            Tuple of the block output (same shape as ``x``) and the attention
            scores produced by ``resmha``.
        """
        attended, scores = self.resmha(x, prev=prev, mask=mask)
        x = self.ln1(x + self.alpha1 * attended)
        x = self.ln2(x + self.alpha2 * self.ff(x))
        return x, scores


class MyModelV1(nn.Module):
    """Sequence classifier stacking TransformerBlock and bidirectional LSTM layers.

    Dense features (optionally concatenated with embedded categorical
    features) are projected to ``embedding_dim * head_cnt`` channels, passed
    through ``layer_cnt`` (TransformerBlock, LSTM) pairs, mean-pooled over the
    sequence axis and fed to one of three sigmoid heads selected by ``mode``.

    Args:
        model_name: 'v1'..'v4' select 8 attention heads, any other value 12;
            every variant uses 1 layer and a per-head size of 64.
        in_dim1: Number of dense input features.
        n_categories: Vocabulary size of each categorical feature, in the
            order of the last axis of ``cat_features``.
        cat_dim: Embedding size used for every categorical feature.
        hidden_dim: Accepted for interface compatibility; not used.
        use_cat: Whether categorical features are embedded and concatenated.
    """

    def __init__(self, model_name, in_dim1, n_categories: list, cat_dim=16, hidden_dim=256, use_cat=True):
        super().__init__()

        if model_name in ('v1', 'v2', 'v3', 'v4'):
            layer_cnt, head_cnt, embedding_dim = 1, 8, 64
        else:
            layer_cnt, head_cnt, embedding_dim = 1, 12, 64
        width = embedding_dim * head_cnt

        # Previously only a local variable although forward() read
        # self.layer_cnt, so a freshly constructed model failed with
        # AttributeError on its first forward pass.
        self.layer_cnt = layer_cnt

        if use_cat:
            # One embedding table per categorical feature.
            self.cats = nn.ModuleList([
                nn.Embedding(x, cat_dim) for x in n_categories])
            self.cat_dim = cat_dim * len(n_categories)
            # Input projection over dense + embedded categorical features.
            self.fc = nn.Sequential(
                nn.Linear(in_dim1 + self.cat_dim, width),
            )
        else:
            # Input projection over the dense features only.
            self.fc = nn.Sequential(
                nn.Linear(in_dim1, width),
            )
        self.use_cat = use_cat

        # nn.Sequential is used purely as an indexable container here; the
        # layers are called one by one in forward().
        self.transformer_encoder = nn.Sequential(
            *[TransformerBlock(emb_s=embedding_dim, head_cnt=head_cnt, dp1=0.1, dp2=0.1) for _ in range(layer_cnt)])
        # NOTE: with num_layers=1 the LSTM `dropout` argument has no effect
        # (PyTorch only applies it between stacked layers); it is kept
        # unchanged so the constructed module matches existing checkpoints.
        self.lstm_encoder = nn.Sequential(
            *[nn.LSTM(width, width // 2, num_layers=1,
                      dropout=0.1, batch_first=True,
                      bidirectional=True) for _ in range(layer_cnt)])

        # Output heads: mode 0 -> 3 outputs, mode 1 -> 10, mode 2 -> 5.
        self.last_fc1 = nn.Linear(width, 3)
        self.last_fc2 = nn.Linear(width, 10)
        self.last_fc3 = nn.Linear(width, 5)

        self.sig = nn.Sigmoid()

    def forward(self, dense_features, cat_features, mode=0):
        """Return sigmoid outputs of the head selected by ``mode``.

        Args:
            dense_features: Float tensor (batch, steps, in_dim1).
            cat_features: Integer tensor (batch, steps, n_cat); only read
                when the model was built with ``use_cat=True``.
            mode: 0, 1 or 2, selecting the 3-, 10- or 5-output head.

        Returns:
            Tensor (batch, n_outputs) with values in (0, 1).

        Raises:
            ValueError: If ``mode`` is not 0, 1 or 2.
        """
        heads = {0: self.last_fc1, 1: self.last_fc2, 2: self.last_fc3}
        if mode not in heads:
            # Previously this surfaced as an UnboundLocalError after the
            # whole network had already been evaluated.
            raise ValueError(f"mode must be 0, 1 or 2, got {mode!r}")

        if self.use_cat:
            cats = [embedding(cat_features[:, :, i]) for i, embedding in enumerate(self.cats)]
            x_cat_emb = torch.cat(cats, 2)
            dense_features = torch.cat([dense_features, x_cat_emb], 2)

        res = self.fc(dense_features)

        # Iterate the paired layers directly instead of range(self.layer_cnt),
        # so instances that lack the attribute still work.
        prev = None
        for attention_block, lstm in zip(self.transformer_encoder, self.lstm_encoder):
            res, prev = attention_block(res, prev)
            res, _ = lstm(res)

        # Mean-pool over the sequence axis.
        res = res.mean(1)

        return self.sig(heads[mode](res))
    

Writing models.py


In [6]:
# Inference-time configuration: start from the defaults and override the
# batch size and worker count with 1.
parameter = Parameter()
parameter.set(batch_size=1, n_jobs=1)

def random_sample(dense_features, cat_features, seq_length=None):
    """Randomly down-sample two row-aligned arrays to ``seq_length`` rows.

    The same row indices are drawn (without replacement) for both arrays and
    sorted, so the original row order is preserved in the result.

    Args:
        dense_features: Array indexed by row.
        cat_features: Array with the same number of rows.
        seq_length: Number of rows to keep. Defaults to
            ``parameter.seq_length`` when omitted.

    Returns:
        Tuple ``(dense_features, cat_features)`` restricted to the sampled
        rows. Inputs with fewer than ``seq_length`` rows are returned
        unchanged instead of raising inside ``np.random.choice``.
    """
    if seq_length is None:
        seq_length = parameter.seq_length

    init_size = len(dense_features)
    if init_size < seq_length:
        # Nothing to drop; sampling without replacement would fail here.
        return dense_features, cat_features

    # An int population is sampled as if it were np.arange(init_size).
    index = np.random.choice(init_size, size=seq_length, replace=False)
    index.sort()

    return dense_features[index], cat_features[index]

def get_preds(dense_features, cat_features, my_model, mode=0):
    """Average the flattened predictions of an ensemble of models.

    Args:
        dense_features: Tensor passed as first argument to every model.
        cat_features: Tensor passed as second argument to every model.
        my_model: Iterable of ``torch.nn.Module`` objects sharing the call
            signature ``model(dense_features, cat_features, mode)``.
        mode: Forwarded unchanged to every model.

    Returns:
        1-D numpy array: the element-wise mean of the models' outputs.
    """
    res = []
    for m in my_model:
        # Predict in eval mode so dropout is disabled even when a checkpoint
        # was pickled in training mode; the previous mode is restored after.
        was_training = m.training
        m.eval()
        try:
            with torch.no_grad():
                # Call the module (not .forward) so registered hooks run.
                y_pred = m(dense_features, cat_features, mode)
        finally:
            m.train(was_training)
        # .cpu() keeps this working if a model lives on an accelerator.
        res.append(y_pred.detach().cpu().numpy().flatten())

    # Average the predictions from multiple models
    return np.mean(res, axis=0)

def load_pkl(path):
    """Load and return the object stored in the pickle file at ``path``.

    NOTE(security): unpickling can execute arbitrary code; only use this on
    files from a trusted source.
    """
    # The context manager closes the file; no explicit close() is needed.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Categorical columns in the order they are stacked into `cat_features`
# (the model embeds column i of that array with its i-th embedding table).
cat_cols = ['level', 'event_name', 'name', 'room_fqid', 'fqid', 'text_fqid', 'text']

# Artefacts saved alongside the models: normalisation data (including the
# list of dense feature columns) and the dictionary handed to processing_df.
standard_dict = load_pkl('/kaggle/input/mastudentsmodels/standard_dict.pkl')
feature_cols = standard_dict['feature_cols']
cat_dict = load_pkl('/kaggle/input/mastudentsmodels/cat_dict.pkl')

# One pickled network per fold (folds 0-3), deserialised onto the CPU.
model_paths = [f'../input/mastudentsmodels/fold{i}.pth.tar' for i in range(4)]
my_models = [
    torch.load(path, map_location=torch.device('cpu'))
    for path in model_paths
]

def pred0(test, sample_submission, q_mode=0):
    """Score one batch of events with the neural-network ensemble.

    Args:
        test: polars DataFrame of events; converted to pandas before processing.
        sample_submission: pandas DataFrame that receives the predictions.
            It is modified in place.
        q_mode: Head selector forwarded to the models through ``get_preds``.

    Returns:
        ``sample_submission`` with the averaged predictions in column "pred0".
    """
    # polars -> pandas, then category encoding and feature extraction.
    frame = get_features(processing_df(test.to_pandas(), cat_dict))

    dense = normalize(frame[feature_cols].values, feature_cols, standard_dict)
    cats = frame[cat_cols].values

    # Sequences longer than the configured length are randomly sub-sampled.
    if len(dense) > parameter.seq_length:
        dense, cats = random_sample(dense, cats)

    # Add the leading batch axis expected by the models.
    dense_batch = torch.tensor(dense, dtype=torch.float32).unsqueeze(0)
    cat_batch = torch.tensor(cats, dtype=torch.long).unsqueeze(0)

    sample_submission["pred0"] = get_preds(dense_batch, cat_batch, my_models, q_mode)
    return sample_submission


# Value handed to pred1 as `level_limit` for each level group.
level_limit = {'0-4': 4, '5-12': 12, '13-22': 22}

# Four LightGBM fold models (folds 0-3) per level group, keyed by group name.
lgb_models = {
    group: [
        lgb.Booster(model_file=f'/kaggle/input/mastudentsmodels/group{group}_lgb_fold{f}.txt')
        for f in range(4)
    ]
    for group in ('0-4', '5-12', '13-22')
}
# Per-group aliases of the same lists.
lgb_models1 = lgb_models['0-4']
lgb_models2 = lgb_models['5-12']
lgb_models3 = lgb_models['13-22']

def pred1(sample_submission, test, level_limit, texts_map):
    """Score one batch with the LightGBM fold models of the current level group.

    Reads the module-level ``curr_lg`` (set by the inference loop) to pick
    the models from ``lgb_models``.

    Args:
        sample_submission: pandas DataFrame with a ``session_id`` column of
            the form "<session>_q<question>". It is modified in place.
        test: polars DataFrame of events; its ``session_id`` column is
            renamed to ``session``.
        level_limit: Forwarded to ``add_features``.
        texts_map: Forwarded to ``add_features``.

    Returns:
        pandas DataFrame with columns ``session_id`` and ``pred1`` (mean
        prediction over the fold models).
    """
    test = test.rename({'session_id': 'session'})

    # Split "<session>_q<question>" into its parts and derive the level group.
    sample_submission['session'] = sample_submission['session_id'].apply(lambda x: int(x.split('_')[0]))
    sample_submission['question'] = sample_submission['session_id'].apply(lambda x: int(x.split('_')[-1].replace('q', '')))
    sample_submission['level_group'] = sample_submission['question'].apply(lambda x: map_level_group(x))

    # Convert the sample_submission DataFrame to a polars DataFrame
    sample_submission = pl.from_pandas(sample_submission)

    features = add_features(sample_submission, test, level_limit, texts_map)

    models = lgb_models[curr_lg]

    # Columns the models expect but this batch did not produce are added as
    # NaN, in a single with_columns call.
    missing = sorted(set(models[0].feature_name()) - set(features.columns))
    if missing:
        features = features.with_columns([pl.lit(np.nan).alias(str(col)) for col in missing])

    features = features.to_pandas()

    # Average over however many fold models are loaded (was a hard-coded 4).
    n_models = len(models)
    preds = 0
    for model in models:
        preds += model.predict(features[model.feature_name()]) / n_models

    sub = features[['session', 'question']].copy()
    sub['pred1'] = preds
    sub['session_id'] = sub['session'].astype(str) + '_q' + sub['question'].astype(str)
    sub = sub[['session_id', 'pred1']]

    return sub



# Events accumulated so far: slot 0 holds the '0-4' batch, slot 1 holds the
# '0-4' + '5-12' batches. Later batches are prefixed with this history.
history_df = [None, None]
for test, sample_submission in iter_test:
    test = test.sort_values(by=['index'], ascending=True)

    # Order the submission rows by question number using a temporary column.
    sample_submission['question'] = sample_submission['session_id'].apply(
        lambda x: int(x.split('_')[-1].replace('q', '')))
    sample_submission = sample_submission.sort_values(by=['question'])
    del sample_submission['question']

    # `curr_lg` stays a module-level name: pred1 reads it.
    curr_lg = test['level_group'].iloc[0]
    test = pl.from_pandas(test)

    q_mode = {'0-4': 0, '5-12': 1, '13-22': 2}.get(curr_lg)
    if q_mode is None:
        raise ValueError(curr_lg)

    # Prepend everything seen in earlier level groups ...
    if q_mode > 0:
        test = pl.concat([history_df[q_mode - 1], test])
    # ... and remember the accumulated events for the next level group.
    if q_mode < len(history_df):
        history_df[q_mode] = test.clone()

    sub0 = pred0(test.clone(), sample_submission.copy(), q_mode)
    sub1 = pred1(sample_submission.copy(), test.clone(), level_limit[curr_lg], texts_map[curr_lg])

    # Blend: 0.35 * neural net + 0.65 * LightGBM, thresholded at 0.625.
    sub_df = sub0.merge(sub1, how='inner', on='session_id')
    sub_df['correct'] = 0.35 * sub_df['pred0'] + 0.65 * sub_df['pred1']
    sub_df['correct'] = np.where(sub_df['correct'] > 0.625, 1, 0)
    env.predict(sub_df[['session_id', 'correct']])
print('ok')

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
ok
