In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
from functools import partial
import scipy as sp

from sklearn.metrics import accuracy_score, cohen_kappa_score, recall_score, precision_score, mean_squared_error
from sklearn.preprocessing import LabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from lightgbm import LGBMRegressor

In [2]:
print("Reading Training Data...")
train_df = pd.read_csv("/kaggle/input/data-science-bowl-2019/train.csv", parse_dates = ["timestamp"])
print("Reading Training Labels...")
train_labels = pd.read_csv("/kaggle/input/data-science-bowl-2019/train_labels.csv")

Reading Training Data...
Reading Training Labels...


In [3]:
#Just look into data which is also included in the labels dataset
install_ids = train_labels["installation_id"].unique()
reduce_train = train_df[train_df["installation_id"].isin(install_ids)]
#del train_df
del train_df
#remove all observation with the world is NONE
reduce_train = reduce_train[reduce_train["world"] != "NONE"]

# Help Functions

In [4]:
#Round for game
def extract_duration(x):
    event_data = json.loads(x)
    duration = event_data["duration"]
    
    return duration

In [5]:
#Function to create the labels based on the training and test data
def feature_extract(df, df_2):
    print("Creating the features...")
    install_ids = df["installation_id"].unique()
    unique_title = df["title"].unique()
    event_code = df["event_code"].unique().tolist()
    
    for element in df_2["event_code"].unique():
        if element not in event_code:
            event_code.append(element)
    
    event_id = df_2["event_id"].unique().tolist()
    
    for element in df["event_id"].unique():
        if element not in event_id:
            event_id.append(element)
            
    all_data = []

    #Create the features for each install id on its own
    for install_id in install_ids:
        #set media and title to zero for a new installation id

        #Define the Dictionary for the features
        feature = {"Clip": 0, "Assessment": 0, "Game": 0, "Activity": 0, "NONE": 0, 
                   "TREETOPCITY": 0, "MAGMAPEAK": 0, "CRYSTALCAVES": 0, "accumulated_event_count": 0,
                  "accumulated_game_time": 0, "time_Clip": 0, "time_Assessment": 0, "time_Game": 0, "time_Activity": 0, "accumulated_game_correct": 0,
                  "accumulated_game_incorrect": 0, "game_accuracy": 0, "total_game_misclicks": 0}

        for title in unique_title:
            feature[title] =0

        for code in event_code:
            feature[code] = 0
            
        for event in event_id:
            feature[event] = 0
            
        games = ["Dino Dive", "Dino Drink", "Bubble Bath", "Scrub-A-Dub", "Air Show", "All Star Sorting", "Crystals Rule", "Chow Time",
                "Happy Camel", "Leaf Leader", "Pan Balance"]
        
        for game in games:
            feature[game + "_" + "misclicks"] = 0
            feature[game + "_" + "sum_correct"] = 0
            feature[game + "_" + "mean_correct"] = 0
            feature[game + "_" + "mean_incorrect"] = 0
            feature[game + "_" + "sum_incorrect"] = 0
            feature[game + "_" + "accuracy"] = 0
            feature[game + "_" + "sum_duration_finished_round"] = 0
            feature[game + "_" + "mean_duration_finished_round"] = 0

        feature["installation_id"] = install_id

        #Filter for the specific installation_id to create the features just for this person
        data = df[df["installation_id"] == install_id]
        data = data.sort_values("timestamp", ascending = True)

        #Session with Assessments
        session_assess = data[data["type"] == "Assessment"]["game_session"].unique().tolist()
        min_index = 0
        
        game_count = 0
        
        for session in session_assess:
            
            game_count += 1
            feature["game_session"] = session
            
            #Filter the data from the last to the next assessment
            index_max = max(data[(data["type"] == "Assessment") & (data["game_session"] == session)].index)
            index_min = min(data[(data["type"] == "Assessment") & (data["game_session"] == session)].index)
            df_filter_max = data[(data.index >= min_index) & (data.index <= index_max)]
            df_filter_min = data[(data.index >= min_index) & (data.index < index_min)]

            #Grouped data
            group_sess = df_filter_min.groupby("game_session")

            #total event count: the event count is also accumulated in the data
            feature["event_count_total"] = group_sess["event_count"].max().sum()
            feature["timestamp"] = data[data.index == index_max]["timestamp"][index_max]
            #total game time until Assessment: The game time is accumulated in the data
            feature["game_time_total"] = group_sess["game_time"].max().sum()
            feature["duration_per_game"] = feature["game_time_total"]/game_count
            feature["accumulated_game_time"] += feature["game_time_total"]
            feature["accumulated_event_count"] += feature["accumulated_event_count"]

            #Type of Media Count: Iterate over the game session to define the number of type of Media
            
            type_of_media = group_sess["type"].unique().tolist()
            media_time = group_sess["game_time"].max().tolist()

            for i in range(len(type_of_media)):
                feature[type_of_media[i][0]] += 1 
                #Time per Media
                feature["time_"+ str(type_of_media[i][0])] += media_time[i]

            #Title Action: iterate over the title to identify the amount of actions per title 
        
            title_action = df_filter_min.groupby("title")["game_time"].max()

            for i in range(len(title_action)):
                feature[title_action.index[i]] += title_action[i]

            #current Assessment: type to make clear what assessment the kid is taking
            #eature["current_assessment"] = df_filter_max[df_filter_max.index == index_max]["title"]

            #Event Code Count: Number of Event Code/specific Events taken
            event_df = df_filter_min.groupby("event_code")["event_code"].count()

            for i in range(len(event_df)):
                feature[event_df.index[i]] += event_df.tolist()[i]
                
            #Event Code Count: Number of Event Code/specific Events taken
            event_df = df_filter_min.groupby("event_id")["event_id"].count()

            for i in range(len(event_df)):
                feature[event_df.index[i]] += event_df.tolist()[i]

            #Game Time per world: Define how many action were taken in total per world
            world_group = df_filter_min.groupby("world")["game_time"].max()

            for i in range(len(world_group)):
                feature[world_group.index[i]] += world_group.tolist()[i]
                
            #Feature for game data
            #game_sess = df_filter_min[df_filter_min.type == "Game"]["game_session"].unique()
            
            #iterate over the game session to extract the features for each game title
           
            for game in games:
                game_df = df_filter_min[df_filter_min.title == game]
                
                if game_df.shape[0] == 0:
                    continue
                
                title_name = game
               
                if title_name == "Chow Time":
                    #Misclicks about the mentioned event_id, one is for clicks somewhere and the other one is when the food was dragged wrongly
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "7372e1a5") | (game_df.event_id == "d185d3ea")]["event_id"].count()
                    event_data = game_df[game_df.event_id == "4ef8cdd3"]["event_data"].apply(lambda x: json.loads(x))
                    
                    #Correct and incorrect actions per game and the mean of them per round
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["63f13dd7"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["63f13dd7"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["63f13dd7"]
                        #Duration per round finished
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "56817e2b"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["63f13dd7"]
                    
                elif title_name == "Happy Camel":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "3d8c61b0") | (game_df.event_id == "a7640a16")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "8af75982"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1
                            
                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["c2baf0bd"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["c2baf0bd"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["c2baf0bd"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "36fa3ebe"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["c2baf0bd"]

                elif title_name == "Leaf Leader":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "7dfe6d8a")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "262136f4"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1
                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
    
                    if feature["f32856e4"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["f32856e4"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["f32856e4"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "b012cd7f"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["f32856e4"]

                elif title_name == "Pan Balance":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "bc8f2793") | (game_df.event_id == "f3cd5473")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "804ee27f"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1
                    
                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["a592d54e"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["a592d54e"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["a592d54e"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "1c178d24"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["a592d54e"]

                elif title_name == "Air Show":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "bcceccc6")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "28f975ea"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1
                    
                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["1575e76c"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["1575e76c"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["1575e76c"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "f5b8c21a"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["1575e76c"]

                elif title_name == "Crystals Rule":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "5e3ea25a")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "86c924c4"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["7cf1bc53"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["7cf1bc53"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["7cf1bc53"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "3323d7e9"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["7cf1bc53"]

                elif title_name == "Scrub-A-Dub":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "cf82af56")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "5c3d2b2f"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["26fd2d99"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["26fd2d99"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["26fd2d99"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "08fd73f3"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["26fd2d99"]

                elif title_name == "Dino Drink":
                    #Round for game
                    feature["f806dc10"] = game_df[game_df.event_id == "f806dc10"]["event_id"].count()

                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "6c517a88")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "74e5f8a7"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["f806dc10"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["f806dc10"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["f806dc10"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "16dffff1"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["f806dc10"]

                elif title_name == "Bubble Bath":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "a0faea5d")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "3bb91dda"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["1beb320a"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["1beb320a"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["1beb320a"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "895865f3"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["1beb320a"]

                elif title_name == "All Star Sorting":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "587b5989") |(game_df.event_id == "d02b7a8e")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "2dc29e21"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) 
                    
                    if feature["2c4e6db0"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["2c4e6db0"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["2c4e6db0"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "ca11f653"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["2c4e6db0"]

                elif title_name =="Dino Dive":
                    #Distraction Clicks
                    feature[title_name + "_" + "misclicks"] += game_df[(game_df.event_id == "76babcde")]["event_id"].count()
                    #Correctness of actions
                    event_data = game_df[game_df.event_id == "c0415e5c"]["event_data"].apply(lambda x: json.loads(x))

                    #Identify total and mean correctness and as well accuracy
                    for event in event_data:
                        if event["correct"]:
                            feature[title_name + "_" + "sum_correct"] += 1
                        else:
                            feature[title_name + "_" + "sum_incorrect"] += 1

                    if (feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"]) == 0:
                        feature[title_name + "_" + "accuracy"] = -1
                    else:
                        feature[title_name + "_" + "accuracy"] = feature[title_name + "_"+ "sum_correct"]/(feature[title_name + "_"+ "sum_correct"] + feature[title_name + "_" + "sum_incorrect"])
                    
                    if feature["7961e599"] == 0:
                        continue
                    else:
                        feature[title_name + "_" + "mean_correct"] = feature[title_name + "_"+ "sum_correct"]/feature["7961e599"]
                        feature[title_name + "_" + "mean_incorrect"] = feature[title_name + "_"+ "sum_incorrect"]/feature["7961e599"]

                        #Total duration per finished round
                        feature[title_name + "_" + "sum_duration_finished_round"] += game_df[game_df.event_id == "00c73085"]["event_data"].apply(lambda x: extract_duration(x)).sum()
                        feature[title_name + "_" + "mean_duration_finished_round"] = feature[title_name  + "_" + "sum_duration_finished_round"]/feature["7961e599"]
                
                feature["accumulated_game_correct"] += feature[title_name + "_" + "sum_correct"]
                feature["accumulated_game_incorrect"] += feature[title_name + "_" + "sum_incorrect"]
                feature["total_game_misclicks"] += feature[title_name + "_" + "misclicks"]
                
                if (feature["accumulated_game_correct"] + feature["accumulated_game_incorrect"]) == 0:
                    continue
                else:
                    feature["game_accuracy"] = feature["accumulated_game_correct"]/(feature["accumulated_game_correct"] + feature["accumulated_game_incorrect"])
                    
                    
            all_data.append(feature.copy())

            #change counting variable to the next game session with assessment
            min_index = index_min 
    
    del feature, install_ids, unique_title, event_code, event_id
    return all_data

In [6]:
#create labels based on the label dataset
def feature_extract_2(df, df_label):
    
    install_ids = df_label["installation_id"].unique()
    game_session_label = df_label["game_session"].unique()
    timestamp = df[df["game_session"].isin(game_session_label)].groupby("game_session")["timestamp"].max()
    timestamp["game_session"] = timestamp.index

    label_df = df_label.join(other = timestamp, on =  "game_session", how = "left")
    label_df.sort_values(["installation_id", "timestamp"], ascending = True)
    
    feat_list= []

    for install_id in install_ids:

        df_filtered = label_df[label_df["installation_id"] == install_id]
        game_count = 0
        #Create a Dictionary where all the Key are set to 0
        feature_2 = {"installation_id": install_id, "Mushroom_correct": 0, "Bird_correct": 0, "Chest_correct": 0, "Cauldron_correct": 0, "Cart_correct": 0,
                     "Mushroom_incorrect": 0, "Bird_incorrect": 0, "Chest_incorrect": 0, "Cauldron_incorrect": 0, "Cart_incorrect": 0, 
                      "0_M": 0, "1_M": 0, "2_M": 0, "3_M": 0, "0_B": 0, "1_B": 0, "2_B": 0, "3_B": 0, "0_CH": 0, "1_CH": 0, "2_CH": 0, "3_CH": 0,
                      "0_CAR": 0, "1_CAR": 0, "2_CAR": 0, "3_CAR": 0, "0_CAU": 0, "1_CAU": 0, "2_CAU": 0, "3_CAU": 0,
                      "0": 0, "1": 0, "2": 0, "3": 0, "sum_correct": 0, "sum_incorrect": 0, 
                      "total_accuracy": 0, "mean_accuracy_group": 0, "game_session": "", "Mushroom_attempts": 0, "Bird_attempts": 0,
                     "Chest_attempts": 0, "Cauldron_attempts": 0, "Cart_attempts": 0, "Bird_accuracy": 0, "Mushroom_accuracy": 0,
                    "Chest_accuracy": 0, "Cauldron_accuracy": 0, "Cart_accuracy": 0, "mean_correct":0, "mean_incorrect": 0,
                    "total_attempt": 0}

        for index, value in df_filtered.iterrows():
    
            game_count += 1
            #add the game_session to the dataset
            feature_2["game_session"] = value["game_session"]
            #Add the dic to the first game session as until then no further assessment was taken before
            #Needs to be copied because if not there will be a reference to the dictionary
            feat_list.append(feature_2.copy())
            
            if (value["num_correct"] + value["num_incorrect"] == 0):
                if value["title"] == "Mushroom Sorter (Assessment)":
                    feature_2["Mushroom_accuracy"] = -1
                elif value["title"] == "Bird Measurer (Assessment)":
                    feature_2["Bird_accuracy"] = -1
                elif value["title"] == "Chest Sorter (Assessment)":
                    feature_2["Chest_accuracy"] = -1
                elif value["title"] == "Cauldron Filler (Assessment)":
                    feature_2["Cauldron_accuracy"] = -1
                else:
                    feature_2["Cart_accuracy"] = -1
                
                continue
            else:
                #Define the values based on the data
                feature_2[str(value["accuracy_group"])] += 1
                feature_2["mean_accuracy_group"] = (3 * feature_2["3"] + 2 * feature_2["2"] + feature_2["1"])/(feature_2["3"] + feature_2["2"] + feature_2["1"] + feature_2["0"])
                feature_2["sum_correct"] += value["num_correct"]
                feature_2["sum_incorrect"] += value["num_incorrect"]
                feature_2["total_attempt"] = feature_2["sum_correct"] + feature_2["sum_incorrect"]
                feature_2["total_accuracy"] = feature_2["sum_correct"]/(feature_2["sum_correct"] + feature_2["sum_incorrect"])
                feature_2["mean_correct"] = feature_2["sum_correct"]/game_count
                feature_2["mean_incorrect"] = feature_2["sum_incorrect"]/game_count

                #Define the values which are dependent on the Assessment title
                if value["title"] == "Mushroom Sorter (Assessment)":
                    feature_2["Mushroom_correct"] += value["num_correct"]
                    feature_2["Mushroom_incorrect"] += value["num_incorrect"]
                    feature_2["Mushroom_attempts"] = feature_2["Mushroom_correct"] +  feature_2["Mushroom_incorrect"]
                    feature_2[str(value["accuracy_group"]) + "_" + "M"] += 1
                    feature_2["Mushroom_accuracy"] = feature_2["Mushroom_correct"]/(feature_2["Mushroom_correct"] + feature_2["Mushroom_incorrect"])
                elif value["title"] == "Bird Measurer (Assessment)":
                    feature_2["Bird_correct"] += value["num_correct"]
                    feature_2["Bird_incorrect"] += value["num_incorrect"]
                    feature_2["Bird_attempts"] = feature_2["Bird_correct"] +  feature_2["Bird_incorrect"]
                    feature_2[str(value["accuracy_group"]) + "_" + "B"] += 1
                    feature_2["Bird_accuracy"] = feature_2["Bird_correct"]/(feature_2["Bird_correct"] + feature_2["Bird_incorrect"])
                elif value["title"] == "Chest Sorter (Assessment)":
                    feature_2["Chest_correct"] += value["num_correct"]
                    feature_2["Chest_incorrect"] += value["num_incorrect"]
                    feature_2["Chest_attempts"] = feature_2["Chest_correct"] +  feature_2["Chest_incorrect"]
                    feature_2[str(value["accuracy_group"]) + "_" + "CH"] += 1
                    feature_2["Chest_accuracy"] = feature_2["Chest_correct"]/(feature_2["Chest_correct"] + feature_2["Chest_incorrect"])
                elif value["title"] == "Cauldron Filler (Assessment)":
                    feature_2["Cauldron_correct"] += value["num_correct"]
                    feature_2["Cauldron_incorrect"] += value["num_incorrect"]
                    feature_2["Cauldron_attempts"] = feature_2["Cauldron_correct"] +  feature_2["Cauldron_incorrect"]
                    feature_2[str(value["accuracy_group"]) + "_" + "CAU"] += 1
                    feature_2["Cauldron_accuracy"] = feature_2["Cauldron_correct"]/(feature_2["Cauldron_correct"] + feature_2["Cauldron_incorrect"])
                else:
                    feature_2["Cart_correct"] += value["num_correct"]
                    feature_2["Cart_incorrect"] += value["num_incorrect"]
                    feature_2["Cart_attempts"] = feature_2["Cart_correct"] +  feature_2["Cart_incorrect"]
                    feature_2[str(value["accuracy_group"]) + "_" + "CAR"] += 1
                    feature_2["Cart_accuracy"] = feature_2["Cart_correct"]/(feature_2["Cart_correct"] + feature_2["Cart_incorrect"])
                
    del feature_2, df_filtered
    return feat_list

#Function to create the label dataset for the test dataset
def label_creation(test_data):
        
    df_filtered_1 = test_data[(test_data["event_code"] == 4100) & 
                              ((test_data["title"] == "Cart Balancer (Assessment)") |
                              (test_data["title"] == "Cauldron Filler (Assessment)") |
                              (test_data["title"] == "Mushroom Sorter (Assessment)") |
                              (test_data["title"] == "Chest Sorter (Assessment)"))]
    
    df_filtered_2 = test_data[(test_data["event_code"] == 4110) & (test_data["title"] == "Bird Measurer (Assessment)")]
    
    df_filter_submit = pd.concat([df_filtered_1, df_filtered_2], axis = 0)
    
    game_submit = df_filter_submit["game_session"].unique()
    
    df_filtered = test_data[test_data.type == "Assessment"]
    
    game_sessions = df_filtered["game_session"].unique().tolist()
    
    label_data = []
    label_dic = {}
    
    for game_session in game_sessions:
        
        df_gs = df_filtered[df_filtered["game_session"] == game_session]
        install_id = df_gs["installation_id"].unique()
        title = df_gs["title"].unique()
        
        label_dic["title"] = title[0]
        label_dic["installation_id"] = install_id[0]
        label_dic["game_session"] = game_session
        label_dic["num_correct"] = 0
        label_dic["num_incorrect"] = 0
        label_dic["accuracy"] = 0
        
        if game_session in game_submit:
            
            if title[0] == "Bird Measurer (Assessment)":
                submit_df = df_gs[df_gs.event_code == 4110]
            else:
                submit_df = df_gs[df_gs.event_code == 4100]
            
            event_data = submit_df["event_data"].apply(lambda x: json.loads(x))
            event_data = event_data.tolist()

            for dic in event_data:
                if dic["correct"]:
                    label_dic["num_correct"] += 1
                else:
                    label_dic["num_incorrect"] += 1

            label_dic["accuracy"] = label_dic["num_correct"]/(label_dic["num_correct"] + label_dic["num_incorrect"])
        
        if label_dic["accuracy"] == 1:
            label_dic["accuracy_group"] = 3
        elif label_dic["accuracy"] == 0.5:
            label_dic["accuracy_group"] = 2
        elif label_dic["accuracy"] == 0:
            label_dic["accuracy_group"] = 0
        else:
            label_dic["accuracy_group"] = 1
            
        label_data.append(label_dic.copy())
        
    del label_dic, df_filtered,df_filter_submit, df_filtered_1, df_filtered_2, df_gs, install_id, title, game_sessions
    
    return label_data

In [7]:
def create_features_test(test_data, reduce_train):    
    print("Reducing the test data...")
    reduce_test = test_data[test_data["world"] != "NONE"]
        
    print("Processing the Test Data...")
    label_list = label_creation(test_data)
    test_label_df = pd.DataFrame(label_list)
    
    all_data_test = feature_extract(reduce_test, reduce_train)
    test_feature_1 = pd.DataFrame(all_data_test)

    feature_test_2 = feature_extract_2(reduce_test, test_label_df)
    test_feature_2 = pd.DataFrame(feature_test_2)
    
    print("Create the Dataset for the test features...")
    test_feature = test_label_df.drop("installation_id", axis = 1).merge(test_feature_2, on = "game_session")
    test_feature = test_feature.merge(test_feature_1.drop("installation_id", axis = 1), on = "game_session")
    test_feature = test_feature.sort_values(by = "timestamp", ascending = True)
    test_feature = test_feature.groupby("installation_id").tail(1)
    test_feature = test_feature.reset_index(drop = True)
    
    del feature_test_2, test_feature_2, all_data_test, test_feature_1, reduce_test, label_list, test_label_df
    
    return test_feature

In [8]:
def create_features_train(test_data, reduce_train, train_labels):    
    print("Reducing the Test data...")
    reduce_test = test_data[test_data["world"] != "NONE"]
        
    print("Processing the Training Data...")    
    feature_1 = feature_extract(reduce_train, reduce_test)
    train_feature_1 = pd.DataFrame(feature_1)

    feature_2 = feature_extract_2(reduce_test, train_labels)
    train_feature_2 = pd.DataFrame(feature_2)
    
    print("Create the Dataset for the Train features...")
    train_feature = train_labels.drop("installation_id", axis = 1).merge(train_feature_2, on = "game_session")
    train_feature = train_feature.merge(train_feature_1.drop("installation_id", axis = 1), on = "game_session")
    
    del feature_2, train_feature_2, feature_1, train_feature_1, reduce_test
    
    return train_feature

In [9]:
def onehotencode(df, col):
    oh = LabelBinarizer()
    data = pd.DataFrame(oh.fit_transform(df[col]))
    data.columns = oh.classes_
    
    
    df = pd.concat([df.drop(col, axis = 1), data], axis = 1)
    
    return df

def prediction(vector, coef):
    vector[vector <= coef[0]] = 0
    vector[np.where(np.logical_and(vector > coef[0] , vector <= coef[1]))] = 1
    vector[np.where(np.logical_and(vector > coef[1], vector <= coef[2]))] = 2
    vector[vector > coef[2]] = 3
    return vector

def qwk(a1, a2):
    """
    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1:
    :param a2:
    :param max_rat:
    :return:
    """
    max_rat = 3
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)

    hist1 = np.zeros((max_rat + 1, ))
    hist2 = np.zeros((max_rat + 1, ))

    o = 0
    for k in range(a1.shape[0]):
        i, j = a1[k], a2[k]
        hist1[i] += 1
        hist2[j] += 1
        o +=  (i - j) * (i - j)

    e = 0
    for i in range(max_rat + 1):
        for j in range(max_rat + 1):
            e += hist1[i] * hist2[j] * (i - j) * (i - j)

    e = e / a1.shape[0]

    return 1 - o / e

def metrics(y_test, prediction_test, y_train, prediction_train, average: str):
    
    #Regression Problem evaluate if the metrics say much
    acc_test = accuracy_score(y_test, prediction_test)
    acc_train = accuracy_score(y_train, prediction_train)
    rec = recall_score(y_test, prediction_test, average = average)
    prec = precision_score(y_test, prediction_test, average = average)
    kappa = qwk(y_test, prediction_test)
    
    print("Accuracy Test: {}".format(acc_test))
    print("Accuracy Train: {}".format(acc_train))
    print("Recall: {}".format(rec))
    print("Precision: {}".format(prec))
    print("Quadratic Weighted Kappa: {}".format(kappa))
    
class OptimizedRounder():
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved
    """
    def __init__(self, initial_coef, labels):
        self.coef_ = 0
        self.initial_coef = initial_coef
        self.labels = labels

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        X_p = pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = self.labels)

        return -qwk(y, X_p)

    def fit(self, X, y):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        self.coef_ = sp.optimize.minimize(loss_partial, self.initial_coef, method='nelder-mead')

    def predict(self, X, coef):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = self.labels)


    def coefficients(self):
        """
        Return the optimized coefficients
        """
        return self.coef_['x']

# Training and Test Data

In [10]:
print("Read Test Data...")
test_df = pd.read_csv("../input/data-science-bowl-2019/test.csv", parse_dates = ["timestamp"])

Read Test Data...


In [11]:
print("Creating the Feature for the training Data")
train_feat = create_features_train(test_df, reduce_train, train_labels)
del train_labels

Creating the Feature for the training Data
Reducing the Test data...
Processing the Training Data...
Creating the features...
Create the Dataset for the Train features...


In [12]:
y = train_feat["accuracy_group"]
x = train_feat.drop(["timestamp","game_session", "installation_id", "accuracy_group", "num_incorrect", "num_correct", "accuracy"], axis = 1)

#Encode one Hot
x = onehotencode(x, "title")

features = x.columns

sc = StandardScaler()
x = sc.fit_transform(x)

# Model

In [13]:
groupk = StratifiedKFold(n_splits = 100, random_state = 42)
model = []
for train_index, test_index in groupk.split(x, y):
    print("Train the {} model.".format(len(model)+1))
    x_train, x_test = x[train_index], x[test_index]
    y_train, y_test = y[train_index], y[test_index]
    
    lgbm = LGBMRegressor(boosting_type =  "goss", 
                         n_estimators = 10000,
                        num_leaves = 100,
                        learning_rate = 0.01)
    lgbm.fit(x_train, y_train,
             eval_set = (x_test,y_test),
             early_stopping_rounds = 100)
    
    lgbm_pred_train = lgbm.predict(x_train)
    lgbm_pred = lgbm.predict(x_test)

    opt = OptimizedRounder([0.5, 1.5, 2.5], labels = [0,1,2,3])
    opt.fit(lgbm_pred_train.flatten(), y_train)
    coef = opt.coefficients()

    lgbm_pred_train = prediction(lgbm_pred_train, coef)
    lgbm_pred = prediction(lgbm_pred, coef)

    print("Kappa: {}".format(qwk(y_test, lgbm_pred)))
    
    model.append(lgbm)

Train the 1 model.
[1]	valid_0's l2: 1.56158
Training until validation scores don't improve for 100 rounds
[2]	valid_0's l2: 1.55013
[3]	valid_0's l2: 1.53795
[4]	valid_0's l2: 1.52717
[5]	valid_0's l2: 1.51605
[6]	valid_0's l2: 1.50483
[7]	valid_0's l2: 1.49517
[8]	valid_0's l2: 1.48521
[9]	valid_0's l2: 1.4759
[10]	valid_0's l2: 1.46599
[11]	valid_0's l2: 1.45591
[12]	valid_0's l2: 1.44662
[13]	valid_0's l2: 1.43743
[14]	valid_0's l2: 1.42954
[15]	valid_0's l2: 1.4202
[16]	valid_0's l2: 1.41243
[17]	valid_0's l2: 1.40459
[18]	valid_0's l2: 1.39669
[19]	valid_0's l2: 1.38873
[20]	valid_0's l2: 1.38172
[21]	valid_0's l2: 1.37383
[22]	valid_0's l2: 1.36681
[23]	valid_0's l2: 1.3603
[24]	valid_0's l2: 1.35346
[25]	valid_0's l2: 1.34772
[26]	valid_0's l2: 1.34183
[27]	valid_0's l2: 1.33599
[28]	valid_0's l2: 1.32979
[29]	valid_0's l2: 1.32363
[30]	valid_0's l2: 1.3177
[31]	valid_0's l2: 1.31125
[32]	valid_0's l2: 1.3063
[33]	valid_0's l2: 1.30002
[34]	valid_0's l2: 1.2942
[35]	valid_0's l

In [14]:
def cv_predict(model, data):
    full_prediction = np.zeros((data.shape[0],1))
    
    for i in range(len(model)):
        predict = model[i].predict(data)
        full_prediction += predict.reshape((full_prediction.shape[0], 1))
    
    return full_prediction/len(model)

In [15]:
full_prediction = np.zeros((x.shape[0],1))
full_prediction.shape
lgbm_pred.flatten().shape

(176,)

In [16]:
lgbm_pred = cv_predict(model,x)

opt = OptimizedRounder([0.5, 1.5, 2.5], labels = [0,1,2,3])
opt.fit(lgbm_pred.flatten(), y)
coef = opt.coefficients()

lgbm_pred = prediction(lgbm_pred, coef)

In [17]:
del x_train, x_test, y_train, y_test, lgbm_pred, lgbm_pred_train, x, y
coef

array([0.48323703, 1.8018223 , 1.99585507])

# Predict the Test Data

In [18]:
test_feat = create_features_test(test_df, reduce_train)
del test_df, reduce_train

Reducing the test data...
Processing the Test Data...
Creating the features...
Create the Dataset for the test features...


In [19]:
test_data = test_feat.drop(["accuracy_group","timestamp","game_session", "installation_id", "accuracy_group", "num_incorrect", "num_correct", "accuracy"], axis = 1)
install_id = test_feat["installation_id"]
del test_feat
test_data = onehotencode(test_data, "title")
test_data = sc.transform(test_data)

predict = cv_predict(model, test_data)
predict = prediction(predict, coef)
del test_data
sample_submission = pd.concat([pd.DataFrame(install_id), pd.DataFrame(predict, columns = ["accuracy_group"])], axis = 1)
del predict, install_id
sample_submission['accuracy_group'] = sample_submission['accuracy_group'].astype(int)
sample_submission.head()

Unnamed: 0,installation_id,accuracy_group
0,c798859f,3
1,2ab22ff2,1
2,4124318e,3
3,8eeba692,3
4,7a31ed2b,3


In [20]:
sample_submission["accuracy_group"].value_counts()

3    682
1    318
Name: accuracy_group, dtype: int64

In [21]:
sample_submission.to_csv("submission.csv", index = False)