# Feature Engineering
### Data Science Bowl 2019/2020

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime as dt
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning for the rest of the session (this also hides pandas
# deprecation and SettingWithCopy warnings raised by the cells below).
warnings.filterwarnings("ignore")

# Plot styling: seaborn 'darkgrid' style, then matplotlib's 'dark_background' sheet.
sns.set(style='darkgrid')
plt.style.use('dark_background')
# Raise pandas display limits (column width, rows, columns) so wide frames print in full.
pd.set_option('max_colwidth', 1000)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [441]:
# Load the four competition CSVs from ./data into module-level DataFrames.
train = pd.read_csv('./data/train.csv')
labels = pd.read_csv('./data/train_labels.csv')
specs = pd.read_csv('./data/specs.csv')
test = pd.read_csv('./data/test.csv')

# Report (rows, columns) of each frame as a quick sanity check on the load.
print("Shape of train: {}".format(train.shape))
print("Shape of test: {}".format(test.shape))
print("Shape of labels: {}".format(labels.shape))
print("Shape of specs: {}".format(specs.shape))

Shape of train: (11341042, 11)
Shape of test: (1156414, 11)
Shape of labels: (17690, 7)
Shape of specs: (386, 3)


### A. Cleaning, Sorting, Filtering Data

In [572]:
# Title lists per media type; they decide which dummy columns are kept later on.
games = list(train.loc[train['type'] == 'Game', 'title'].unique())
activities = list(train.loc[train['type'] == 'Activity', 'title'].unique())
assessments = list(train.loc[train['type'] == 'Assessment', 'title'].unique())
types = ['Game', 'Activity', 'Assessment']
worlds = ['MAGMAPEAK', 'TREETOPCITY', 'CRYSTALCAVES']

# Every title that is not a game/activity/assessment, plus the 'NONE' and 'Clip'
# markers, is treated as uninteresting when pruning dummy columns.
titles = games + activities + assessments
dont_care = [t for t in train['title'].unique() if t not in titles] + ['NONE', 'Clip']

# game_session -> installation_id lookups (used to restore the id after aggregation)
install_dict_train = dict(zip(train['game_session'], train['installation_id']))
install_dict_test = dict(zip(test['game_session'], test['installation_id']))

# game sessions that carry a label, for mapping at the end
labeled_sessions = list(labels['game_session'].unique())

In [444]:
def fix_time(df):
    '''
    Normalise the time columns of the raw event frame (modifies df in place).

    a) converts the 'timestamp' column to datetime
    b) converts 'game_time' from milliseconds to seconds
    c) stores the time taken by each event/row in a new 'step_time' column
    The 'game_time' column is dropped afterwards.

    The row-to-row difference is taken over the whole frame, not per session, so
    the rows must already be in chronological order. Negative differences (for
    example where 'game_time' restarts) and the undefined first row become 0.

    Returns the same DataFrame object.
    '''
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    # milliseconds -> seconds
    elapsed_s = df['game_time'] / 1000
    # Series.clip_lower() was removed in pandas 1.0; clip(lower=...) is the supported
    # spelling. Assigning the finished Series avoids a chained in-place fillna.
    df['step_time'] = elapsed_s.diff().clip(lower=0).fillna(0)
    df.drop(columns=['game_time'], inplace=True)

    return df

In [445]:
# function to filter to assessed only
def filter_assessed(train_df, labels_df):
    '''
    Keep only the rows of train_df whose 'installation_id' also appears in
    labels_df['installation_id']. Row order and index labels are preserved.
    '''
    labelled_installs = list(labels_df['installation_id'].unique())
    has_label = train_df['installation_id'].isin(labelled_installs)
    return train_df[has_label]

In [446]:
def sort_by_time(df):
    '''
    Return a copy of df sorted by installation id and then timestamp (ascending),
    with a fresh 0..n-1 index. The input frame is not modified.

    This ordering is what the later row-to-row and cumulative calculations rely on.
    '''
    new_df = df.sort_values(by=['installation_id', 'timestamp'], ascending=True)
    # drop=True discards the old index directly. The previous reset_index() +
    # drop('index') failed on a named index and removed any real 'index' column.
    new_df.reset_index(drop=True, inplace=True)
    return new_df

### B. Features on Full (Unaggregated) Data


In [447]:
def test_answers(df, code = [4100, 4110]):
    '''
    This function calculates correct/incorrect answers (for assessments). Bird Measurer uses
    code 4110; all other assessments use code 4100.
    '''
    # assessments
    answers = df[(df['event_code'] == code)]
    correct = [1 if '"correct":true' in i else 0 for i in answers['event_data']]
    incorrect = [1 if '"correct":false' in i else 0 for i in answers['event_data']]
    
    correct_dict = dict(zip(list(answers.index), correct))
    incorrect_dict = dict(zip(list(answers.index), incorrect))

    df['test_correct'] = df.index.map(correct_dict).fillna(0)
    df['test_incorrect'] = df.index.map(incorrect_dict).fillna(0)

    return df

def test_answers_df(df):
    '''
    Wrapper around test_answers: scores Bird Measurer rows with event code 4110 and
    all remaining rows with event code 4100, then stacks the two results.

    Note that the Bird Measurer rows are appended after all other rows, so the
    returned frame keeps the original index labels but not the original row order.
    '''
    bird_title = 'Bird Measurer (Assessment)'
    title = df['title']

    # everything except Bird Measurer answers with 4100 ...
    notbirds_lab = test_answers(df[title != bird_title], 4100)
    # ... while Bird Measurer answers with 4110
    birds_lab = test_answers(df[title == bird_title], 4110)

    return pd.concat([notbirds_lab, birds_lab], axis=0)

In [448]:
def game_events(df):
    '''
    One-hot encode individual game events so they can be summed per session.

    Columns added (1 where the row matches, otherwise 0):
      session_true / session_false - 'event_data' contains '"correct":true' / '"correct":false'
      session_goodjob              - event code 3021 (positive feedback)
      session_tryagain             - event code 3020 (negative feedback)
      got_instructions             - event code 3010
      requested_help               - event code 4090
      beat_level                   - event code 2050
    'event_count' and 'event_code' are dropped afterwards. Works in place and
    returns df.
    '''
    payloads = df['event_data']
    df['session_true'] = [int('"correct":true' in p) for p in payloads]
    df['session_false'] = [int('"correct":false' in p) for p in payloads]

    # flag column -> the event code that switches it on
    code_flags = {
        'session_goodjob': 3021,
        'session_tryagain': 3020,
        'got_instructions': 3010,
        'requested_help': 4090,
        'beat_level': 2050,
    }
    event_codes = df['event_code']
    for flag_col, wanted in code_flags.items():
        df[flag_col] = [int(c == wanted) for c in event_codes]

    df.drop(columns=['event_count', 'event_code'], inplace=True)

    return df

In [449]:
def new_session(df):
    '''
    Mark the first row of every game session. This is needed for any cumulative or
    aggregate calculation done before the frame is grouped; it is called from
    activity_df.

    Columns added (in place; df is returned):
      new_game   - 1 on the first row of df and on every row whose session was first
                   seen later than the previous row's session, otherwise 0
      start_game - the row's timestamp as a string where new_game is 1, '' elsewhere

    Rows of one session are expected to be contiguous.
    '''
    # number the sessions in order of first appearance
    session_no = pd.Series(pd.factorize(df['game_session'])[0], index=df.index)
    is_new = session_no.diff() >= 1
    # diff() is undefined on the very first row, which always opens a session.
    # Previously that row ended up as 0, so the first session had no start time.
    if len(is_new):
        is_new.iloc[0] = True

    df['new_game'] = is_new.astype('int64')
    df['start_game'] = df['timestamp'].map(str).where(is_new, '')

    return df

In [450]:
def activity_df(df):
    '''
    Dummy-encode the type, title and world of each row and keep the dummy only on
    the first row of a session (dummy * new_game), so that summing over a session
    yields 0 or 1 instead of the number of rows. (Ex. instead of counting every
    step/row as 'MAGMA PEAK' and summing to 247, it is counted once.)

    Calls new_session first. Dummy columns whose name contains any entry of the
    module-level `dont_care` list are dropped.
    '''
    df = new_session(df)

    # remember what existed before encoding so the new dummy columns can be identified
    columns_before = set(df.columns)
    df = pd.get_dummies(df, columns=['type', 'title', 'world'])
    dummy_cols = [c for c in df.columns if c not in columns_before]

    # prune dummies for titles/types/worlds we do not track
    unwanted = [c for c in dummy_cols if any(tag in c for tag in dont_care)]
    df.drop(columns=unwanted, inplace=True)

    # keep each remaining dummy only on the row that opens a session
    for col in dummy_cols:
        if col in unwanted:
            continue
        df[col] = df[col] * df['new_game']

    return df

def start_time(df):
    '''
    Build a {game_session: start_game} lookup from the rows where new_game == 1.
    '''
    openers = df.loc[df['new_game'] == 1, ['game_session', 'start_game']]

    return dict(zip(openers['game_session'], openers['start_game']))

### C. Aggregating by Game Session

In [451]:
def agg_session(df):
    '''
    Collapse the event-level frame to one row per game session.

    Every numeric column is summed per 'game_session'; non-numeric columns
    (timestamps, strings) are left out of the result. A 'session_steps' column
    holds the number of rows each session had. Only do this after all row-level
    features have been created!

    Returns a new DataFrame with 'game_session' as a regular column.
    '''
    by_session = df.groupby('game_session')
    # numeric_only=True: recent pandas no longer drops non-numeric columns silently;
    # it would concatenate strings and raise on datetime columns.
    sum_agg = by_session.sum(numeric_only=True)
    sum_agg['session_steps'] = by_session.size()
    sum_agg.reset_index(inplace=True)

    return sum_agg

In [452]:
def accuracy(df):
    '''
    Add per-session success ratios to an aggregated (grouped) frame:
      test_accuracy     = test_correct / (test_correct + test_incorrect)
      game_accuracy     = session_true / (session_true + session_false)
      feedback_positive = session_goodjob / (session_goodjob + session_tryagain)
    Afterwards every NaN in the frame (including 0/0 ratios) is replaced by 0.
    '''
    ratio_specs = (
        ('test_accuracy', 'test_correct', 'test_incorrect'),
        ('game_accuracy', 'session_true', 'session_false'),
        ('feedback_positive', 'session_goodjob', 'session_tryagain'),
    )
    for ratio_col, good_col, bad_col in ratio_specs:
        good = df[good_col]
        df[ratio_col] = good / (good + df[bad_col])
    df.fillna(0, inplace=True)
    return df

In [453]:
def map_features(df):
    '''
    Attribute each numeric session feature to the title/type/world dummies of that
    session (Ex. the session's total time is recorded as time spent in Magma Peak).

    For every int64/float64 column F (except 'new_game') and every dummy column D
    (name contains "title_", "world_" or "type_") a column 'D_F' is added holding
    int(F) * int(D). Both factors are truncated to integers first.
    '''
    dummy_cols = [c for c in df.columns
                  if "title_" in c or "world_" in c or "type_" in c]
    feature_cols = [c for c in df.columns
                    if c not in dummy_cols and df[c].dtype in ('int64', 'float64')]
    # new_game is only the session marker, not a feature to spread over the dummies
    feature_cols.remove('new_game')

    for feature in feature_cols:
        for dummy in dummy_cols:
            df[dummy + '_' + feature] = df[feature].map(int) * df[dummy].map(int)
    return df

### D. Aggregating by Installation ID (over time)
Functions for calculating cumulative sums up until a game session, and calculating most recent scores, etc.

In [573]:
def map_install_start(df, install_dict, start_dict):
    '''
    Re-attach the installation id and session start time to the session-level frame.

    install_dict and start_dict are looked up by 'game_session'; the start time is
    converted to datetime. The result is sorted by installation id, then start time.
    '''
    session_keys = df['game_session']
    df['installation_id'] = session_keys.map(install_dict)
    df['start_game'] = pd.to_datetime(session_keys.map(start_dict))

    return df.sort_values(by=['installation_id', 'start_game'], ascending=True)

In [455]:
def agg_install(df):
    '''
    Add, for every int64/float64 column whose name contains neither 'accuracy' nor
    'feedback', a '<column>_cuma' column with the running total of that column over
    all EARLIER rows of the same installation id (the current row is excluded).

    The frame must already be ordered by installation id and time. The first row
    of each installation gets 0. Afterwards every NaN in the frame is replaced by 0.
    Works in place and returns df.
    '''
    summable = [c for c in df.columns
                if 'accuracy' not in c and 'feedback' not in c
                and df[c].dtype in ('int64', 'float64')]
    install_ids = df['installation_id']
    for col in summable:
        running = df[col].groupby(install_ids).cumsum()
        # Shift within each installation. A plain .shift() moves values across the
        # boundary, giving an installation's first session the previous one's totals.
        df[col + '_cuma'] = running.groupby(install_ids).shift()
    df.fillna(0, inplace=True)
    return df

###  E. Post-Cumulative/Agg Functions

In [456]:
def clean_up_aggs(df):
    '''
    Drop cumulative columns that are not meaningful as running sums.

    1) Cumulative ('_cuma') columns of a title_/world_/type_ dummy that were built
       from a ratio (feedback_positive, game_accuracy, test_accuracy) are removed.
    2) Any remaining column mentioning "event_count" or "event_code" is removed.
    Works in place and returns df.
    '''
    ratio_names = ('feedback_positive', 'game_accuracy', 'test_accuracy')
    dummy_tags = ('title_', 'world_', 'type_')

    summed_ratios = [
        c for c in df.columns
        if '_cuma' in c
        and any(tag in c for tag in dummy_tags)
        and any(ratio in c for ratio in ratio_names)
    ]
    df.drop(columns=summed_ratios, inplace=True)

    event_leftovers = [c for c in df.columns if "event_count" in c or "event_code" in c]
    df.drop(columns=event_leftovers, inplace=True)

    return df

#### Re-calculating Accuracy (for CumSum columns)

In [476]:
# Dummy-column prefixes (type_*, title_*, world_*) for which cumulative accuracy
# columns are recomputed on the training frame.
train_iterables = ['type_Activity', 'type_Assessment', 'type_Game', 'title_Air Show',
                 'title_All Star Sorting', 'title_Bird Measurer (Assessment)',
                 'title_Bottle Filler (Activity)', 'title_Bubble Bath',
                 'title_Bug Measurer (Activity)', 'title_Cart Balancer (Assessment)',
                 'title_Cauldron Filler (Assessment)', 'title_Chest Sorter (Assessment)',
                 'title_Chicken Balancer (Activity)', 'title_Chow Time', 'title_Crystals Rule',
                 'title_Dino Dive', 'title_Dino Drink', 'title_Egg Dropper (Activity)',
                 'title_Fireworks (Activity)', 'title_Flower Waterer (Activity)', 
                 'title_Happy Camel', 'title_Leaf Leader', 'title_Mushroom Sorter (Assessment)',
                 'title_Pan Balance', 'title_Sandcastle Builder (Activity)',
                 'title_Scrub-A-Dub', 'title_Watering Hole (Activity)', 'world_CRYSTALCAVES',
                 'world_MAGMAPEAK', 'world_TREETOPCITY']
# Shorter prefix list for the test frame: the assessment type, the five
# assessment titles and the three worlds.
test_iterables = ['type_Assessment', 'title_Bird Measurer (Assessment)',
                 'title_Cart Balancer (Assessment)', 'title_Cauldron Filler (Assessment)', 
                  'title_Chest Sorter (Assessment)', 'title_Mushroom Sorter (Assessment)',
                  'world_CRYSTALCAVES','world_MAGMAPEAK', 'world_TREETOPCITY']

In [526]:
def cumulative_accuracy(df, iterables=None):
    '''
    Recompute accuracy ratios from the cumulative counts of each dummy prefix.

    Parameters
      df        - frame holding '<prefix>_test_correct_cuma', '_test_incorrect_cuma',
                  '_session_true_cuma', '_session_false_cuma', '_session_goodjob_cuma'
                  and '_session_tryagain_cuma' for every prefix
      iterables - a prefix, a list of prefixes, or a list of prefix lists. When
                  omitted, train_iterables and test_iterables are both used.

    For every prefix three columns are added: '<prefix>_test_accuracy_cuma',
    '<prefix>_game_accuracy_cuma' and '<prefix>_positive_feedback_cuma'. A ratio
    with a zero denominator is left as NaN. A missing source column raises KeyError.
    Works in place and returns df.
    '''
    # The former default was a list of two lists; iterating it concatenated a list
    # with a string and raised TypeError. Resolve the default here and flatten.
    if iterables is None:
        iterables = [train_iterables, test_iterables]
    elif isinstance(iterables, str):
        iterables = [iterables]

    flat = []
    for entry in iterables:
        if isinstance(entry, str):
            flat.append(entry)
        else:
            flat.extend(entry)
    # prefixes shared between lists only need to be computed once
    prefixes = list(dict.fromkeys(flat))

    for prefix in prefixes:
        tc = df[prefix + '_test_correct_cuma']
        ti = df[prefix + '_test_incorrect_cuma']
        st = df[prefix + '_session_true_cuma']
        sf = df[prefix + '_session_false_cuma']
        gj = df[prefix + '_session_goodjob_cuma']
        ta = df[prefix + '_session_tryagain_cuma']

        df[prefix + '_test_accuracy_cuma'] = tc / (tc + ti)
        df[prefix + '_game_accuracy_cuma'] = st / (st + sf)
        df[prefix + '_positive_feedback_cuma'] = gj / (gj + ta)

    return df

#### Dropping/Filling CumSum Null Columns

In [458]:
def drop_fill_cumsum(df):
    '''
    Remove columns that are entirely null (in place on df), then forward-fill the
    remaining gaps within each installation id.

    Returns a new DataFrame with the same rows, index and column order as the
    pruned df. A value is never carried from one installation into the next, so
    leading gaps of an installation stay NaN.
    '''
    allnull = [c for c in df.columns if df[c].isnull().all()]
    df.drop(columns=allnull, inplace=True)

    # GroupBy.ffill keeps the original index. The former groupby(...).apply(ffill)
    # depends on apply semantics that differ between pandas versions.
    filled = df.groupby('installation_id').ffill()
    filled['installation_id'] = df['installation_id']

    return filled[list(df.columns)]

#### Proportion of time spent on each activity/world (cumulative)

In [459]:
def cuma_proportions(df):
    '''
    Express the cumulative time per type and per world as a share of the total
    cumulative time: adds '<column>_proportion' = column / step_time_cuma for each
    of the six type/world cumulative step-time columns.
    '''
    total_time = df['step_time_cuma']
    partial_time_cols = (
        'type_Activity_step_time_cuma',
        'type_Assessment_step_time_cuma',
        'type_Game_step_time_cuma',
        'world_CRYSTALCAVES_step_time_cuma',
        'world_MAGMAPEAK_step_time_cuma',
        'world_TREETOPCITY_step_time_cuma',
    )
    for part in partial_time_cols:
        df[part + '_proportion'] = df[part].div(total_time)

    return df

#### Proportion of Assessments Taken  (cumulative)

In [460]:
def assessments_completed(df):
    '''
    Add 'assessments_completed': the fraction of the five assessment titles whose
    cumulative count ('title_<name> (Assessment)_cuma') is non-zero on each row.
    Works in place and returns df.
    '''
    assessments_count = ['title_Bird Measurer (Assessment)_cuma',
                         'title_Cart Balancer (Assessment)_cuma',
                         'title_Cauldron Filler (Assessment)_cuma',
                         'title_Chest Sorter (Assessment)_cuma',
                         'title_Mushroom Sorter (Assessment)_cuma']
    # non-zero count -> 1, zero -> 0, summed over the titles
    completed = sum(df[col].astype(bool).astype(int) for col in assessments_count)
    # The divisor follows the list length. The old bare try/except around the
    # division could never trigger and has been removed.
    df['assessments_completed'] = completed / len(assessments_count)

    return df

#### Proportion of Assessments Passed (cumulative)

In [461]:
def assessments_passed(df):
    '''
    Add 'assessments_passed': the fraction of the five assessment titles with at
    least one cumulative correct answer
    ('title_<name> (Assessment)_test_correct_cuma' non-zero) on each row.
    Works in place and returns df.
    '''
    test_correct = ['title_Bird Measurer (Assessment)_test_correct_cuma',
                    'title_Cart Balancer (Assessment)_test_correct_cuma',
                    'title_Cauldron Filler (Assessment)_test_correct_cuma',
                    'title_Chest Sorter (Assessment)_test_correct_cuma',
                    'title_Mushroom Sorter (Assessment)_test_correct_cuma']
    # non-zero correct count -> 1, zero -> 0, summed over the titles
    beat = sum(df[col].astype(bool).astype(int) for col in test_correct)
    # The divisor follows the list length. The old bare try/except around the
    # division could never trigger and has been removed.
    df['assessments_passed'] = beat / len(test_correct)

    return df

#### Time per step: Speed (per session and cumulative)

In [462]:
def speed(df):
    '''
    Add the average time per step, for the session itself ('speed') and over the
    cumulative history ('speed_cuma'). Works in place and returns df.
    '''
    session_speed = df['step_time'].div(df['session_steps'])
    df['speed'] = session_speed
    history_speed = df['step_time_cuma'].div(df['session_steps_cuma'])
    df['speed_cuma'] = history_speed

    return df

#### Marking Start of New Installation Session

In [463]:
def new_session_install(df):
    '''
    Mark the first row of every installation id.

    Adds 'new_install' (in place; df is returned): 1 on the first row of df and on
    every row whose installation id was first seen later than the previous row's,
    otherwise 0. Rows of one installation are expected to be contiguous.
    '''
    # number the installations in order of first appearance
    install_no = pd.Series(pd.factorize(df['installation_id'])[0], index=df.index)
    is_new = install_no.diff() >= 1
    # diff() is undefined on the first row, which always starts an installation.
    # Previously that row was left at 0.
    if len(is_new):
        is_new.iloc[0] = True
    df['new_install'] = is_new.astype('int64')

    return df

#### How Long Since the Last Session

In [603]:
def timedelta(df):
    '''
    Add 'time_difference': whole minutes elapsed between the start of the previous
    row's session and the start of this one. Rows flagged new_install == 1 get 0.

    Rows whose 'start_game' equals 0 (a placeholder, not a timestamp) are dropped
    first, and 'start_game' is converted to datetime. The frame must be ordered by
    installation and start time. Works in place and returns df.
    '''
    # remove placeholder rows that carry 0 instead of a timestamp
    placeholder_rows = df[df['start_game'] == 0].index
    df.drop(placeholder_rows, axis=0, inplace=True)

    df['start_game'] = pd.to_datetime(df['start_game'])

    gap = df['start_game'].diff()
    # the first session of an installation has no predecessor
    gap = gap.mask(df['new_install'] == 1, pd.Timedelta(0))

    # Use the full duration. The old (seconds // 60) % 60 kept only the
    # minute-of-hour, so 1 day 1 h 5 min was reported as 5.
    minutes = gap.dt.total_seconds() // 60
    if not minutes.isna().any():
        minutes = minutes.astype('int64')
    df['time_difference'] = minutes

    return df

#### Mark high-use devices

In [465]:
def high_use(df):
    '''
    Flag unusually heavy and unusually light installations.

    Adds 'uses' (number of rows the row's installation id has in df),
    'very_high_use' (1 when uses > 950) and 'very_low_use' (1 when uses < 10).
    Works in place and returns df.
    '''
    rows_per_install = df['installation_id'].value_counts().to_dict()
    df['uses'] = df['installation_id'].map(rows_per_install)

    use_counts = df['uses']
    df['very_high_use'] = [int(n > 950) for n in use_counts]
    df['very_low_use'] = [int(n < 10) for n in use_counts]

    return df

#### Last Scores (not cumulative)

In [466]:
def last_test_scores(df):
    '''
    For every assessment session (type_Assessment == 1) record the test accuracy of
    the 1st..7th most recent EARLIER assessment of the same installation.

    Columns added: last_test_score, last_two_test_score, ..., last_seven_test_score.
      - assessment rows with enough history: the earlier 'test_accuracy'
      - assessment rows without that much history: -10.0 (sentinel)
      - non-assessment rows: NaN

    Installations are delimited by 'new_install': each row flagged 1 starts a new
    one. The frame must be ordered by installation and time and have a unique
    index. Works in place and returns df.
    '''
    lag_cols = ['last_test_score', 'last_two_test_score', 'last_three_test_score',
                'last_four_test_score', 'last_five_test_score', 'last_six_test_score',
                'last_seven_test_score']

    tested = df['type_Assessment'] == 1
    # running count of new_install flags = one id per installation block
    install_block = df['new_install'].cumsum()
    history = df.loc[tested, 'test_accuracy'].groupby(install_block[tested])

    for lag, col in enumerate(lag_cols, start=1):
        # Shifting within the installation never reaches into another device's
        # history. The old positional lookup wrapped around to the end of the frame
        # and only checked the neighbouring rows' flags.
        df[col] = history.shift(lag).fillna(-10.0)

    return df

#### Last Game Accuracy scores

In [608]:
def last_game_scores(df):
    '''
    For every game or activity session (type_Game == 1 or type_Activity == 1) record
    the game accuracy of the 1st..7th most recent EARLIER game/activity session of
    the same installation.

    Columns added: last_score, last_two_score, ..., last_seven_score.
      - game/activity rows with enough history: the earlier 'game_accuracy'
      - game/activity rows without that much history: -10.0 (sentinel)
      - all other rows: NaN

    Installations are delimited by 'new_install': each row flagged 1 starts a new
    one. The frame must be ordered by installation and time and have a unique
    index. Works in place and returns df.
    '''
    lag_cols = ['last_score', 'last_two_score', 'last_three_score', 'last_four_score',
                'last_five_score', 'last_six_score', 'last_seven_score']

    played = (df['type_Game'] == 1) | (df['type_Activity'] == 1)
    # running count of new_install flags = one id per installation block
    install_block = df['new_install'].cumsum()
    history = df.loc[played, 'game_accuracy'].groupby(install_block[played])

    for lag, col in enumerate(lag_cols, start=1):
        # Shifting within the installation never reaches into another device's
        # history. The old positional lookup wrapped around to the end of the frame
        # and only checked the neighbouring rows' flags.
        df[col] = history.shift(lag).fillna(-10.0)

    return df

#### Clean Last Scores

In [468]:
def map_nans(df):
    '''
    Replace the -10.0 sentinel with NaN everywhere in df (in place) and return df.
    '''
    df.replace(to_replace=-10.0, value=np.nan, inplace=True)
    return df

#### Averages of Last Scores

In [469]:
def average(df):
    '''
    Add rolling means over the most recent scores.

    Game accuracy: two_avg ... seven_avg are the means of the first 2..7 of
    last_score, last_two_score, ..., last_seven_score.
    Test accuracy: two_test_avg ... seven_test_avg are the same over
    last_test_score, ..., last_seven_test_score.

    A mean is NaN whenever one of its inputs is NaN. All fourteen input columns
    must exist (KeyError otherwise). Works in place and returns df.
    '''
    game_cols = ['last_score', 'last_two_score', 'last_three_score', 'last_four_score',
                 'last_five_score', 'last_six_score', 'last_seven_score']
    test_cols = ['last_test_score', 'last_two_test_score', 'last_three_test_score',
                 'last_four_test_score', 'last_five_test_score', 'last_six_test_score',
                 'last_seven_test_score']
    window_names = ['two', 'three', 'four', 'five', 'six', 'seven']

    # read every input first so a missing column fails before anything is written
    game_scores = [df[c] for c in game_cols]
    test_scores = [df[c] for c in test_cols]

    # Series arithmetic does not raise here, so the old try/except blocks (which
    # wrote a 9999 placeholder) were dead code and are gone.
    for scores, suffix in ((game_scores, '_avg'), (test_scores, '_test_avg')):
        running_total = scores[0]
        for window, (name, score) in enumerate(zip(window_names, scores[1:]), start=2):
            running_total = running_total + score
            df[name + suffix] = running_total / window

    return df

#### Score Change Over Time

In [470]:
def slope(df):
    '''
    Add the change between the most recent score and each of the four scores
    before it.

      gslope1..gslope4 = last_score      - last_two_score .. last_five_score
      tslope1..tslope4 = last_test_score - last_two_test_score .. last_five_test_score

    A difference is NaN whenever one of its inputs is NaN. Works in place and
    returns df.
    '''
    comparisons = (
        ('gslope', 'last_score',
         ['last_two_score', 'last_three_score', 'last_four_score', 'last_five_score']),
        ('tslope', 'last_test_score',
         ['last_two_test_score', 'last_three_test_score', 'last_four_test_score',
          'last_five_test_score']),
    )
    # Only the columns actually used are read; the sixth/seventh scores were loaded
    # but never used. The old try/except fallbacks to 9999 could not trigger.
    for prefix, latest_col, earlier_cols in comparisons:
        latest = df[latest_col]
        for step, earlier_col in enumerate(earlier_cols, start=1):
            df[prefix + str(step)] = latest - df[earlier_col]

    return df

#### Time to finish last assessment / Speed (time per step)

In [618]:
def test_speed(df):
    tested_index = df[(df['type_Assessment'] == 1)].index
    step_vals = df.loc[tested_index, 'session_steps'].to_dict()
    time_vals = df.loc[tested_index, 'step_time'].to_dict()
    
    for i in range(len(tested_index)):
        index = tested_index[i]
        time = time_vals[index]
        steps = step_vals[index]
        df.loc[index, 'last_test_time'] = np.where((df.loc[index, 'new_install'] != 1), 
                                                   time, -10.0)
        df.loc[index, 'last_test_speed'] = np.where((df.loc[index, 'new_install'] != 1), 
                                                   (time/steps), -10.0)
        
        index2 = tested_index[i-1]
        time2 = time_vals[index2]
        steps2 = step_vals[index2]
        df.loc[index, 'last_test_time_2'] = np.where(((df.loc[index-1, 'new_install'] != 1) and
                                                     (df.loc[index, 'new_install'] != 1)), 
                                                   time2, -10.0)
        df.loc[index, 'last_test_speed_2'] = np.where(((df.loc[index-1, 'new_install'] != 1) and
                                                     (df.loc[index, 'new_install'] != 1)), 
                                                   (time2/steps2), -10.0)
        
        index3 = tested_index[i-2]
        time3 = time_vals[index3]
        steps3 = step_vals[index3]
        df.loc[index, 'last_test_time_3'] = np.where(((df.loc[index-2, 'new_install'] != 1) and
                                                     (df.loc[index-1, 'new_install'] != 1) and
                                                     (df.loc[index, 'new_install'] != 1)), 
                                                   time3, -10.0)
        df.loc[index, 'last_test_speed_3'] = np.where(((df.loc[index-2, 'new_install'] != 1) and
                                                      (df.loc[index-1, 'new_install'] != 1) and
                                                     (df.loc[index, 'new_install'] != 1)), 
                                                    (time3/steps3), -10.0)
        
    return df

# Prepping Final DFs

In [621]:
def all_functions(df,
                  labels,
                  type_=['train', 'test'],
                  install_dict=[install_dict_test, install_dict_train],
                  iterables=[train_iterables, test_iterables]):
    '''
    Runs the whole feature-engineering pipeline on one raw events frame.

    The steps are applied strictly in the order written below; each one
    receives the frame returned by the step before it.

    df           -> raw events frame handed to fix_time
    labels       -> forwarded to filter_assessed
    type_        -> not used anywhere in this function
    install_dict -> forwarded to map_install_start
    iterables    -> forwarded to cumulative_accuracy

    NOTE(review): the defaults of the last three parameters are
    two-element lists; the calls in this notebook always pass a single
    value for each of them explicitly -- confirm before relying on the
    defaults.

    Returns the frame produced by the final step (test_speed).
    '''
    frame = fix_time(df)
    frame = filter_assessed(frame, labels)

    # session-level preparation
    for step in (sort_by_time, test_answers_df, game_events,
                 new_session, activity_df):
        frame = step(frame)

    # start times are taken before the sessions are aggregated
    start_dict = start_time(frame)

    for step in (agg_session, accuracy, map_features):
        frame = step(frame)

    frame = map_install_start(frame, install_dict, start_dict)
    frame = agg_install(frame)
    frame = clean_up_aggs(frame)
    frame = cumulative_accuracy(frame, iterables)

    # remaining single-argument feature steps
    for step in (drop_fill_cumsum, cuma_proportions, assessments_completed,
                 assessments_passed, speed, new_session_install,
                 timedelta, high_use, last_test_scores, last_game_scores,
                 map_nans, average, slope, test_speed):
        frame = step(frame)

    return frame

In [673]:
def finish_train(df):
    '''
    Trims the engineered training frame down to the labeled sessions.

    a) keeps the columns from installation_id to the end of the frame,
       with game_session appended as the final column
    b) keeps only the rows whose game_session appears in the
       module-level labeled_sessions

    Raises ValueError if the frame has no installation_id column.
    '''
    every_column = list(df.columns)
    first_kept = every_column.index('installation_id')
    kept_columns = every_column[first_kept:] + ['game_session']

    trimmed = df[kept_columns]
    has_label = trimmed['game_session'].isin(labeled_sessions)
    return trimmed[has_label]

In [659]:
def finish_test(df):
    '''
    Collapses the engineered test frame to one row per installation.

    a) keeps the columns from installation_id to the end of the frame,
       with game_session appended as the final column
    b) groups by installation_id and takes .last() of every column
    c) copies the resulting index back into an installation_id column

    NOTE(review): GroupBy.last() picks the last non-null value of each
    column separately, which is not necessarily the last row of the
    group -- confirm this is what is wanted here.

    Raises ValueError if the frame has no installation_id column.
    '''
    every_column = list(df.columns)
    first_kept = every_column.index('installation_id')
    kept_columns = every_column[first_kept:] + ['game_session']

    per_install = df[kept_columns].groupby('installation_id').last()
    per_install['installation_id'] = per_install.index
    return per_install

In [663]:
# Run the full feature pipeline on the training events, then trim the
# result with finish_train (labeled sessions only).
train2 = all_functions(
    train,
    labels,
    type_='train',
    install_dict=install_dict_train,
    iterables=train_iterables,
)
train3 = finish_train(train2)

In [676]:
#train3.to_csv('./data/new_train2.csv')

In [623]:
# Run the same pipeline on the test events, then collapse to one row per
# installation with finish_test.
# NOTE(review): the test frame itself is passed as the `labels` argument,
# since no label file is loaded for the test set -- confirm that
# filter_assessed is meant to receive it.
test2 = all_functions(
    test,
    test,
    type_='test',
    install_dict=install_dict_test,
    iterables=test_iterables,
)
test3 = finish_test(test2)

In [662]:
#test3.to_csv('./data/test3.csv')