# Update
* Version 2: set the thresholds for `previous_records_df` from the `accuracy_group` distribution of `temp_test_X`.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Any results you write to the current directory are saved as output.
from time import time
from tqdm import tqdm
from collections import Counter
from scipy import stats
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import KFold, StratifiedKFold, GroupKFold, train_test_split
import gc
import json
pd.set_option('display.max_columns', 1000)
import matplotlib.pyplot as plt
import seaborn as sns
import random
random.seed(42)
np.random.seed(42)
from scipy.stats import rankdata

In [None]:
#source: https://www.kaggle.com/artgor/quick-and-dirty-regression
from functools import partial
import scipy as sp
class OptimizedRounder(object):
    """
    An optimizer for rounding thresholds
    to maximize Quadratic Weighted Kappa (QWK) score
    # https://www.kaggle.com/naveenasaithambi/optimizedrounder-improved

    Raw (continuous) predictions are cut into the four labels 0..3 by three
    thresholds; `fit` searches for the thresholds that minimise the negative
    quadratic-weighted kappa returned by `_kappa_loss`.
    """
    def __init__(self):
        # Replaced by the scipy optimisation result once `fit` has run.
        self.coef_ = 0

    @staticmethod
    def _bucketize(X, coef):
        """
        Cut raw predictions into the labels 0..3

        :param X: The raw predictions
        :param coef: Three thresholds (sorted here, so any order is accepted)
        """
        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

    def _kappa_loss(self, coef, X, y):
        """
        Get loss according to
        using current coefficients
        
        :param coef: A list of coefficients that will be used for rounding
        :param X: The raw predictions
        :param y: The ground truth labels
        :return: Negative quadratic-weighted kappa (lower is better)
        """
        X_p = self._bucketize(X, coef)

        return -cohen_kappa_score(y, X_p, weights='quadratic')

    def fit(self, X, y, method='Powell', initial_coefficients=None):
        """
        Optimize rounding thresholds
        
        :param X: The raw predictions
        :param y: The ground truth labels
        :param method: Solver name forwarded to scipy.optimize.minimize
        :param initial_coefficients: Optional starting thresholds (list or
            array); when None or empty the 25/50/75 percentiles of X are used
        """
        # Imported here so the class does not depend on `scipy.optimize`
        # having been loaded as a side effect of some other import.
        from scipy.optimize import minimize

        loss_partial = partial(self._kappa_loss, X=X, y=y)
        # `is not None` + len(): a plain truth test raises on numpy arrays.
        if initial_coefficients is not None and len(initial_coefficients) > 0:
            initial_coef = initial_coefficients
        else:
            initial_coef = [np.percentile(X, 25), np.percentile(X, 50), np.percentile(X, 75)]
        print('initial coefficients:', initial_coef)
        self.coef_ = minimize(loss_partial, initial_coef, method=method)

    def predict(self, X, coef=None):
        """
        Make predictions with specified thresholds
        
        :param X: The raw predictions
        :param coef: A list of coefficients that will be used for rounding;
            defaults to the fitted coefficients when omitted
        """
        if coef is None:
            coef = self.coefficients()
        return self._bucketize(X, coef)


    def coefficients(self):
        """
        Return the optimized coefficients

        Only valid after `fit`; before that `coef_` is still the placeholder 0.
        """
        return self.coef_['x']

In [None]:
def read_data(base_path='/kaggle/input/data-science-bowl-2019/'):
    """
    Load the five competition CSV files and report their shapes.

    :param base_path: Directory containing train.csv, test.csv,
        train_labels.csv, specs.csv and sample_submission.csv
    :return: (train, test, train_labels, specs, sample_submission) DataFrames;
        the third column (index 2) of train and test is parsed as datetimes
    """
    def _load(file_name, label, **read_csv_kwargs):
        # One read + shape report; `label` keeps the original log wording.
        print('Reading {} file....'.format(file_name))
        frame = pd.read_csv(os.path.join(base_path, file_name), **read_csv_kwargs)
        print('{} file have {} rows and {} columns'.format(label, frame.shape[0], frame.shape[1]))
        return frame

    # `infer_datetime_format` used to receive a format string here; it is a
    # boolean flag (deprecated in pandas 2) and is not needed for these
    # ISO-8601 timestamps, so only `parse_dates` is passed.
    train = _load('train.csv', 'Training.csv', parse_dates=[2])
    test = _load('test.csv', 'Test.csv', parse_dates=[2])
    train_labels = _load('train_labels.csv', 'Train_labels.csv')
    specs = _load('specs.csv', 'Specs.csv')
    sample_submission = _load('sample_submission.csv', 'Sample_submission.csv')
    return train, test, train_labels, specs, sample_submission

In [None]:
%%time
# Load the raw tables, then collect the distinct titles of each session type
# over train and test together (titles are still text at this point).
train, test, train_labels, specs, sample_submission = read_data()
concat_df = pd.concat([train, test])
game_titles, activity_titles, assessment_titles, clip_titles = (
    concat_df.loc[concat_df['type'] == session_kind, 'title'].unique()
    for session_kind in ('Game', 'Activity', 'Assessment', 'Clip')
)

In [None]:
def encode_title(train, test, train_labels):
    """
    Integer-encode titles and worlds and build the lookup tables used later.

    The three input frames are modified in place (and also returned):
    a 'title_event_code' column is added to train/test, and 'title' / 'world'
    are replaced by integer ids.

    All vocabularies are sorted so that the encodings and the resulting
    feature column order are identical on every run; building them from an
    unordered set made them depend on string-hash order.

    :param train: Training events (needs title, event_code, event_id, world, type)
    :param test: Test events (same columns)
    :param train_labels: Label table with a text 'title' column
    :return: (train, test, train_labels, win_code, list_of_user_activities,
        list_of_event_code, activities_labels, assess_titles, list_of_event_id,
        all_title_event_code, inv_title_map, activities_map)
    :raises KeyError: if 'Bird Measurer (Assessment)' is not among the titles
    """
    def _sorted_union(column):
        # Distinct values of `column` over train and test, in sorted order.
        return sorted(set(train[column].unique()).union(test[column].unique()))

    # Combined "<title>_<event_code>" key, built while titles are still text.
    for frame in (train, test):
        frame['title_event_code'] = frame['title'].astype(str) + '_' + frame['event_code'].astype(str)

    all_title_event_code = _sorted_union('title_event_code')
    list_of_user_activities = _sorted_union('title')
    list_of_event_code = _sorted_union('event_code')
    list_of_event_id = _sorted_union('event_id')
    list_of_worlds = _sorted_union('world')

    # title text <-> integer id
    activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))
    inv_title_map = {v: k for k, v in activities_map.items()}
    # Same id -> text mapping as inv_title_map; both are part of the return value.
    activities_labels = dict(zip(np.arange(len(list_of_user_activities)), list_of_user_activities))
    activities_world = dict(zip(list_of_worlds, np.arange(len(list_of_worlds))))

    # Text titles of all assessment sessions (must be read before the mapping below).
    assess_titles = sorted(
        set(train.loc[train['type'] == 'Assessment', 'title'].dropna().unique())
        .union(test.loc[test['type'] == 'Assessment', 'title'].dropna().unique())
    )

    # replace the text titles with the number titles from the dict
    train['title'] = train['title'].map(activities_map)
    test['title'] = test['title'].map(activities_map)
    train['world'] = train['world'].map(activities_world)
    test['world'] = test['world'].map(activities_world)
    train_labels['title'] = train_labels['title'].map(activities_map)

    # Event code that marks an attempt for each title id: 4100 everywhere
    # except 'Bird Measurer (Assessment)', which uses 4110.
    win_code = {title_id: 4100 for title_id in activities_map.values()}
    win_code[activities_map['Bird Measurer (Assessment)']] = 4110

    return train, test, train_labels, win_code, list_of_user_activities, list_of_event_code, activities_labels, assess_titles, list_of_event_id, all_title_event_code, inv_title_map, activities_map

In [None]:
%%time
# Encode titles/worlds as integers and unpack the lookup tables that the
# feature-extraction code reads as globals.
(
    train,
    test,
    train_labels,
    win_code,
    list_of_user_activities,
    list_of_event_code,
    activities_labels,
    assess_titles,
    list_of_event_id,
    all_title_event_code,
    inv_title_map,
    activities_map,
) = encode_title(train, test, train_labels)

In [None]:
def categorise_accuracy_group(x):
    """
    Map an accuracy value onto the groups 0..3.

    x >= 1 -> 3, 0.5 <= x < 1 -> 2, 0 < x < 0.5 -> 1, x <= 0 -> 0.

    :raises ValueError: when x matches none of the ranges (e.g. NaN)
    """
    # Checked from the top range downwards, so each test only needs its
    # lower bound.
    if x >= 1:
        return 3
    if x >= 0.5:
        return 2
    if x > 0:
        return 1
    if x <= 0:
        return 0
    raise ValueError('something went wrong')

In [None]:
def classify(x):
    """
    Return the first group 0..2 whose upper threshold `bound[group]` is not
    exceeded by x, or 3 when x is above all three.

    Reads the module-level sequence `bound` (defined outside this cell).
    """
    for group in (0, 1, 2):
        if x <= bound[group]:
            return group
    return 3

In [None]:
# this is the function that convert the raw data into processed features
def _session_seconds(session):
    """Seconds between the first and the last event of one (sorted) session."""
    # NOTE(review): Timedelta.seconds drops whole days, so a session spanning
    # more than 24h is under-reported; kept as-is to leave the features unchanged.
    return (session['timestamp'].iloc[-1] - session['timestamp'].iloc[0]).seconds


def _update_counters(counter, session, col):
    """Add how often each value of `col` occurs in `session` to `counter` (in place)."""
    for value, occurrences in Counter(session[col]).items():
        # Titles are integer-encoded in the frame, but the counter is keyed by text.
        key = activities_labels[value] if col == 'title' else value
        counter[key] += occurrences
    return counter


def get_data(user_sample, test_set=False):
    '''
    Convert the raw events of ONE installation_id into one feature dict per
    assessment session.

    Every feature describes the history *before* the assessment it belongs to;
    the running state is updated only after the features were captured.

    Relies on the module-level lookups produced earlier in the notebook:
    assessment_titles, assess_titles, list_of_event_code, list_of_event_id,
    activities_labels, all_title_event_code, win_code and
    categorise_accuracy_group.

    :param user_sample: Events of a single installation_id (train or test)
    :param test_set: False -> return a list with one dict per assessment that
        has at least one attempt; True -> keep every assessment session and
        return only the dict of the last one (the session to predict)
    :raises IndexError: if test_set is True and there is no assessment session
    '''
    user_sample = user_sample.sort_values(['timestamp', 'title', 'game_session', 'event_count']).reset_index(drop=True)

    # ---- running state -----------------------------------------------------
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}

    # History per assessment title (keyed by the title text).
    previous_correct = {title: 0 for title in assessment_titles}
    previous_incorrect = {title: 0 for title in assessment_titles}
    previous_total = {title: 0 for title in assessment_titles}
    previous_playtime = {title: 0 for title in assessment_titles}
    previous_accuracies = {title: [] for title in assessment_titles}
    previous_durations = {title: [] for title in assessment_titles}

    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy = 0
    accumulated_correct_attempts = 0
    accumulated_uncorrect_attempts = 0
    accumulated_pseudo_correct_attempts = 0
    accumulated_pseudo_incorrect_attempts = 0
    accumulated_game_correct_attempts = 0
    accumulated_game_incorrect_attempts = 0
    accumulated_assessment_correct_attempts = 0
    accumulated_assessment_incorrect_attempts = 0
    accumulated_actions = 0
    counter = 0
    durations = []
    activity_durations = []
    game_accuracies_list = []
    activity_4070_list = []
    last_accuracy_title = {'acc_' + title: -1 for title in assess_titles}
    event_code_count = {ev: 0 for ev in list_of_event_code}
    event_id_count = {eve: 0 for eve in list_of_event_id}
    title_count = {eve: 0 for eve in activities_labels.values()}
    title_event_code_count = {t_eve: 0 for t_eve in all_title_event_code}

    # iterates through each session of one installation_id
    for _, session in user_sample.groupby('game_session', sort=False):
        # session is a DataFrame that contains only one game_session
        session = session.sort_values(['event_count']).reset_index(drop=True)

        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        session_title_text = activities_labels[session_title]
        session_duration = _session_seconds(session)

        # ---- features: only assessment sessions generate a record ----------
        # In train, single-event assessment sessions are skipped here.
        if (session_type == 'Assessment') and (test_set or len(session) > 1):
            # attempts are the events carrying this title's win code (4100 / 4110)
            all_attempts = session[session['event_code'] == win_code[session_title]]
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            # "pseudo" attempts: the same text match over every event of the session
            pseudo_true_attempts = session['event_data'].str.contains('true').sum()
            pseudo_false_attempts = session['event_data'].str.contains('false').sum()

            # start from the counters accumulated so far
            features = user_activities_count.copy()
            features.update(last_accuracy_title)
            features.update(event_code_count)
            features.update(event_id_count)
            features.update(title_count)
            features.update(title_event_code_count)

            # get installation_id for aggregated features
            features['installation_id'] = session['installation_id'].iloc[-1]
            # the (encoded) assessment title
            features['session_title'] = session_title

            # all-time win-code attempts before this assessment
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts
            accumulated_uncorrect_attempts += false_attempts

            # time features of the session start
            session_start = session['timestamp'].iloc[0]
            features['hour'] = session_start.hour
            features['day'] = session_start.day
            features['dayofweek'] = session_start.dayofweek
            features['week'] = session_start.week
            features['month'] = session_start.month
            features['year'] = session_start.year

            features['accumulated_pseudo_correct_attempts'] = accumulated_pseudo_correct_attempts
            features['accumulated_pseudo_incorrect_attempts'] = accumulated_pseudo_incorrect_attempts
            accumulated_pseudo_correct_attempts += pseudo_true_attempts
            accumulated_pseudo_incorrect_attempts += pseudo_false_attempts

            features['accumulated_game_correct_attempts'] = accumulated_game_correct_attempts
            features['accumulated_game_incorrect_attempts'] = accumulated_game_incorrect_attempts
            features['accumulated_assessment_correct_attempts'] = accumulated_assessment_correct_attempts
            features['accumulated_assessment_incorrect_attempts'] = accumulated_assessment_incorrect_attempts

            # mean duration of the previous assessment sessions
            features['duration_mean'] = np.mean(durations) if durations else 0
            durations.append(session_duration)

            # mean accuracy of the previous assessments
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            total_attempts = true_attempts + false_attempts
            pseudo_total_attempts = pseudo_true_attempts + pseudo_false_attempts
            accuracy = true_attempts/total_attempts if total_attempts != 0 else 0
            pseudo_accuracy = pseudo_true_attempts/pseudo_total_attempts if pseudo_total_attempts != 0 else 0
            accumulated_accuracy += accuracy
            last_accuracy_title['acc_' + session_title_text] = accuracy

            # outcome of the current session
            features['accuracy'] = accuracy
            features['pseudo_accuracy'] = pseudo_accuracy
            features['pseudo_accuracy_group'] = categorise_accuracy_group(pseudo_accuracy)
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1
            # how many times this player was in each accuracy group so far
            features.update(accuracy_groups)
            accuracy_groups[features['accuracy_group']] += 1
            # mean of all previous accuracy groups of this player
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            accumulated_accuracy_group += features['accuracy_group']
            # number of events before this session (updated further below)
            features['accumulated_actions'] = accumulated_actions

            # ---- history of this particular assessment title ----
            features['attempted_before'] = 1 if previous_total[session_title_text] != 0 else 0
            features['solved_before'] = 1 if previous_correct[session_title_text] != 0 else 0

            past_accuracies = previous_accuracies[session_title_text]
            if past_accuracies:
                features['first_accuracy'] = past_accuracies[0]
                features['last_accuracy'] = past_accuracies[-1]
                features['mean_previous_accuracy'] = np.mean(past_accuracies)
                features['std_previous_accuracy'] = np.std(past_accuracies)
            else:
                features['first_accuracy'] = 0
                features['last_accuracy'] = 0
                features['mean_previous_accuracy'] = 0
                features['std_previous_accuracy'] = 0

            features['previous_correct_attempts'] = previous_correct[session_title_text]
            features['previous_incorrect_attempts'] = previous_incorrect[session_title_text]
            features['previous_total_attempts'] = previous_total[session_title_text]

            past_durations = previous_durations[session_title_text]
            if past_durations:
                features['first_assessment_duration'] = past_durations[0]
                features['last_assessment_duration'] = past_durations[-1]
                features['mean_assessment_duration'] = np.mean(past_durations)
            else:
                features['first_assessment_duration'] = 0
                features['last_assessment_duration'] = 0
                features['mean_assessment_duration'] = 0

            features['previous_playtime'] = previous_playtime[session_title_text]

            # attempts per second of previous playtime (0 when there was none)
            played = features['previous_playtime']
            features['previous_correct_per_second'] = features['previous_correct_attempts'] / played if played != 0 else 0
            features['previous_incorrect_per_second'] = features['previous_incorrect_attempts'] / played if played != 0 else 0
            features['previous_total_per_second'] = features['previous_total_attempts'] / played if played != 0 else 0

            # ---- game / activity aggregates so far ----
            features['mean_game_accuracy2'] = np.mean(game_accuracies_list) if game_accuracies_list else 0
            features['std_game_accuracy2'] = np.std(game_accuracies_list) if game_accuracies_list else 0

            features['total_activity_playtime'] = np.sum(activity_durations) if activity_durations else 0
            features['mean_activity_playtime'] = np.mean(activity_durations) if activity_durations else 0
            features['std_activity_playtime'] = np.std(activity_durations) if activity_durations else 0

            features['total_activity_4070'] = np.sum(activity_4070_list) if activity_4070_list else 0
            features['mean_activity_4070'] = np.mean(activity_4070_list) if activity_4070_list else 0
            features['std_activity_4070'] = np.std(activity_4070_list) if activity_4070_list else 0

            activity_time = features['total_activity_playtime']
            features['activity_4070_per_second'] = features['total_activity_4070'] / activity_time if activity_time != 0 else 0
            features['mean_activity_4070_per_second'] = features['mean_activity_4070'] / activity_time if activity_time != 0 else 0

            # test: every assessment session is kept;
            # train: only sessions with at least one win-code attempt
            if test_set or total_attempts > 0:
                all_assessments.append(features)

            counter += 1

        # ---- update the running state with this session --------------------
        event_code_count = _update_counters(event_code_count, session, "event_code")
        event_id_count = _update_counters(event_id_count, session, "event_id")
        title_count = _update_counters(title_count, session, 'title')
        title_event_code_count = _update_counters(title_event_code_count, session, 'title_event_code')

        # counts how many actions the player has done so far
        accumulated_actions += len(session)
        # Count a session type only when it differs from the previous session's
        # type. (A misspelled assignment used to leave `last_activity` at its
        # initial value, so this guard was always true.)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activity = session_type

        if session_type == 'Game':
            game_true_attempts = session['event_data'].str.contains('"correct":true').sum()
            game_false_attempts = session['event_data'].str.contains('"correct":false').sum()
            game_total_attempts = game_true_attempts + game_false_attempts
            # games without any attempt do not contribute an accuracy value
            if game_total_attempts != 0:
                game_accuracies_list.append(game_true_attempts / game_total_attempts)
            accumulated_game_correct_attempts += game_true_attempts
            accumulated_game_incorrect_attempts += game_false_attempts

        elif session_type == 'Activity':
            activity_durations.append(session_duration)
            activity_4070_list.append((session['event_code'] == 4070).sum())

        elif session_type == 'Assessment':
            # Use the title's own win code (4110 for Bird Measurer, 4100 otherwise).
            # The previous check compared the integer-encoded title with the
            # title text, so it never matched and 4100 was used for every title.
            attempts_df = session[session['event_code'] == win_code[session_title]]
            assessment_true_attempts = attempts_df['event_data'].str.contains('"correct":true').sum()
            assessment_false_attempts = attempts_df['event_data'].str.contains('"correct":false').sum()
            assessment_total_attempts = assessment_true_attempts + assessment_false_attempts
            accumulated_assessment_correct_attempts += assessment_true_attempts
            accumulated_assessment_incorrect_attempts += assessment_false_attempts
            # duration, playtime and accuracy are recorded only for attempted sessions
            if assessment_total_attempts > 0:
                previous_durations[session_title_text].append(session_duration)
                previous_playtime[session_title_text] += session_duration
                previous_accuracies[session_title_text].append(assessment_true_attempts / assessment_total_attempts)

            previous_correct[session_title_text] += assessment_true_attempts
            previous_incorrect[session_title_text] += assessment_false_attempts
            previous_total[session_title_text] += assessment_total_attempts

    # flag the first and the last collected record of this installation
    if all_assessments:
        for features_dict in all_assessments:
            features_dict['is_last_session'] = 0
            features_dict['is_first_session'] = 0
        all_assessments[-1]['is_last_session'] = 1
        all_assessments[0]['is_first_session'] = 1

    # in the test set only the last assessment must be predicted
    if test_set:
        if not all_assessments:
            raise IndexError('no assessment session found for this installation_id')
        return all_assessments[-1]
    # in the train set all assessments go to the dataset
    return all_assessments

In [None]:
def get_train_and_test(train, test):
    """
    Build the per-assessment feature tables for the train and test event logs.

    Every ``installation_id`` group is handed to ``get_data``:

    * each train group contributes all records returned by ``get_data``;
    * each test group contributes all records returned by ``get_data`` in its
      default mode to the "test_X" table, and the single record returned by
      ``get_data(..., test_set=True)`` to the test table.

    :param train: train events; must contain an ``installation_id`` column
    :param test: test events; must contain an ``installation_id`` column
    :return: ``(reduce_train, reduce_test_X, reduce_test, categoricals)`` where
        ``categoricals`` is always ``['session_title']``
    """
    # Groups are visited in order of first appearance (sort=False).
    compiled_train = []
    for _, user_sample in tqdm(train.groupby('installation_id', sort=False), position=0):
        compiled_train.extend(get_data(user_sample))

    # Default mode on the test log: all records returned for each test installation.
    compiled_test_X = []
    for _, user_sample in tqdm(test.groupby('installation_id', sort=False), position=0):
        compiled_test_X.extend(get_data(user_sample))

    # test_set=True yields exactly one record per test installation.
    compiled_test = []
    for _, user_sample in tqdm(test.groupby('installation_id', sort=False), position=0):
        compiled_test.append(get_data(user_sample, test_set=True))

    reduce_train = pd.DataFrame(compiled_train)
    reduce_test_X = pd.DataFrame(compiled_test_X)
    reduce_test = pd.DataFrame(compiled_test)
    categoricals = ['session_title']

    return reduce_train, reduce_test_X, reduce_test, categoricals

In [None]:
# Run the feature-extraction pass over the raw train and test event logs.
# Yields the train table, the table built from the test log with get_data's default
# mode (reduce_test_X), the table built with test_set=True (reduce_test), and the
# list of categorical column names.
reduce_train, reduce_test_X, reduce_test, categoricals = get_train_and_test(train, test)
print(reduce_train.shape, reduce_test_X.shape, reduce_test.shape)

In [None]:
# Snapshot the freshly extracted tables so the working copies can be restored
# without repeating the extraction.
reduce_train_save = reduce_train.copy()
reduce_test_save = reduce_test.copy()
reduce_test_X_save = reduce_test_X.copy()

In [None]:
# Reset the working tables from the snapshots (fresh copies, so the snapshots
# themselves are never modified by later cells).
reduce_train = reduce_train_save.copy()
reduce_test = reduce_test_save.copy()
reduce_test_X = reduce_test_X_save.copy()

In [None]:
def create_title_stats(df):
    """
    Rank the accuracy groups of every assessment title by how often they occur.

    :param df: frame with ``session_title`` and ``accuracy_group`` columns
    :return: one row per distinct ``session_title`` (order of first appearance)
        with columns ``accuracy_group_order_0 .. accuracy_group_order_{k-1}``.
        ``accuracy_group_order_j`` holds the j-th most frequent accuracy group
        of that title; ``k`` is the largest number of distinct groups seen for
        any title, and titles with fewer groups keep 0 in the remaining columns.
    """
    unique_titles = df['session_title'].unique()

    title_stats_df = pd.DataFrame()
    title_stats_df['session_title'] = unique_titles

    # value_counts sorts by descending frequency, so its index is the group
    # ordering we want. Compute it once per title.
    group_orders = [
        df.loc[df['session_title'] == title, 'accuracy_group'].value_counts(normalize=True).index
        for title in unique_titles
    ]

    # Create every order column up front, defaulting to 0.
    n_order_columns = max((len(order) for order in group_orders), default=0)
    for j in range(n_order_columns):
        title_stats_df['accuracy_group_order_{}'.format(j)] = 0

    # Write through .loc: chained `df[col].iloc[i] = ...` assigns to a temporary
    # and is silently lost under pandas copy-on-write.
    for i, order in enumerate(group_orders):
        for j, group in enumerate(order):
            title_stats_df.loc[i, 'accuracy_group_order_{}'.format(j)] = group

    return title_stats_df

In [None]:
def secondary_feature_engineering(df):
    """
    Add derived attempt totals and summed-accuracy scores to ``df`` in place.

    :param df: feature frame holding the accumulated attempt counters and the
        accuracy columns referenced below
    :return: the same ``df`` object, with six new columns appended
    """
    # (new column, correct-attempt column, incorrect-attempt column)
    attempt_totals = (
        ('accumulated_total_attempts',
         'accumulated_correct_attempts', 'accumulated_uncorrect_attempts'),
        ('accumulated_game_total_attempts',
         'accumulated_game_correct_attempts', 'accumulated_game_incorrect_attempts'),
        ('accumulated_assessment_total_attempts',
         'accumulated_assessment_correct_attempts', 'accumulated_assessment_incorrect_attempts'),
    )
    for total_col, correct_col, incorrect_col in attempt_totals:
        df[total_col] = df[correct_col] + df[incorrect_col]

    # Each score is mean_game_accuracy2 plus one other accuracy measure.
    # Row-wise .sum() is kept on purpose: it skips NaN instead of propagating it.
    score_partners = (
        ('user_score', 'accumulated_accuracy'),
        ('game_accuracy+mean_previous_accuracy', 'mean_previous_accuracy'),
        ('game_accuracy+last_previous_accuracy', 'last_accuracy'),
    )
    for score_col, partner_col in score_partners:
        df[score_col] = df[['mean_game_accuracy2', partner_col]].sum(axis=1)

    return df

In [None]:
# function that creates more features
def preprocess(reduce_train, reduce_test, reduce_test_X, categoricals):
    """
    Add derived features to the three tables and pick the model feature list.

    ``sum_event_code_count`` and the columns created by
    ``secondary_feature_engineering`` are added to all three frames in place.
    The feature list is taken from ``reduce_train`` only.

    :param reduce_train: train feature table (mutated in place; a sorted copy is returned)
    :param reduce_test: test feature table (mutated in place and returned)
    :param reduce_test_X: test-history feature table (mutated in place and returned)
    :param categoricals: accepted for interface compatibility; not used here
    :return: ``(reduce_train, reduce_test, reduce_test_X, features)`` where
        ``reduce_train`` is sorted by installation and accumulated actions and
        ``features`` is a sorted list of column names (as strings)
    """
    # Event-code count columns; these labels are integers in the frames.
    event_code_columns = [2050, 4100, 4230, 5000, 4235, 2060, 4110, 5010, 2070, 2075, 2080, 2081, 2083, 3110, 4010, 3120, 3121, 4020, 4021,
                          4022, 4025, 4030, 4031, 3010, 4035, 4040, 3020, 3021, 4045, 2000, 4050, 2010, 2020, 4070, 2025, 2030, 4080, 2035,
                          2040, 4090, 4220, 4095]

    for df in (reduce_train, reduce_test, reduce_test_X):
        df['sum_event_code_count'] = df[event_code_columns].sum(axis=1)
        # Mutates df in place, so the caller's frames receive the new columns.
        secondary_feature_engineering(df)

    reduce_train = reduce_train.sort_values(['installation_id', 'accumulated_actions']).reset_index(drop=True)

    # Keep only columns whose total is non-zero (delete useless columns).
    # Only the column labels are needed, so no row mask is computed.
    features = reduce_train.loc[:, (reduce_train.sum(axis=0) != 0)].columns
    # Targets, identifiers and session flags are never model inputs. errors='ignore'
    # because the zero-sum filter may already have removed one of them.
    features = features.drop(['accuracy_group', 'installation_id', 'accuracy', 'is_first_session', 'is_last_session'],
                             errors='ignore')
    features = sorted([str(col) for col in features if 'pseudo' not in str(col)])

    return reduce_train, reduce_test, reduce_test_X, features

In [None]:
%%time
# call feature engineering function
#reduce_test = pd.concat([reduce_test_X, reduce_test]).sort_values(['installation_id', 'accumulated_actions'])
reduce_train, reduce_test, reduce_test_X, features = preprocess(reduce_train, reduce_test, reduce_test_X, categoricals)
# Rows built from the test log's history are flagged as not being a last session...
reduce_test_X['is_last_session'] = 0
#reduce_test = pd.concat([reduce_test, reduce_test_X]).sort_values(['installation_id', 'accumulated_actions']).reset_index(drop=True)
# ...and are appended to the training table, which is then re-sorted per installation.
# Note: this rebinding means re-running the cell appends the rows again.
reduce_train = pd.concat([reduce_train, reduce_test_X]).sort_values(['installation_id', 'accumulated_actions']).reset_index(drop=True)
#reduce_test = reduce_test.groupby(['installation_id']).last().reset_index()
print(reduce_train.shape, reduce_test.shape)

In [None]:
# Some column labels are integers (the event-code columns indexed in preprocess);
# cast every label to str so they match the string names stored in `features`.
reduce_train.columns = [str(col) for col in reduce_train.columns]
reduce_test.columns = [str(col) for col in reduce_test.columns]

In [None]:
# Zero-fill missing values, touching only the columns that actually contain NaN.
train_nan_columns = [col for col in reduce_train.columns if reduce_train[col].isna().any() == True]
reduce_train[train_nan_columns] = reduce_train[train_nan_columns].fillna(0)

test_nan_columns = [col for col in reduce_test.columns if reduce_test[col].isna().any() == True]
reduce_test[test_nan_columns] = reduce_test[test_nan_columns].fillna(0)

In [None]:
# Flag redundant features: for every ordered pair of features not yet flagged,
# mark feat_b for removal when its correlation coefficient with feat_a is >= 1.
# NOTE(review): floating-point rounding can leave perfectly collinear columns just
# below 1, in which case they are not caught — confirm the threshold is intended.
counter = 0
to_remove = []
for feat_a in features:
    for feat_b in features:
        if feat_a != feat_b and feat_a not in to_remove and feat_b not in to_remove:
            # np.corrcoef returns the 2x2 correlation matrix; [0][1] is the pairwise value.
            c = np.corrcoef(reduce_train[feat_a], reduce_train[feat_b])[0][1]
            if c >= 1:
                counter += 1
                to_remove.append(feat_b)
                print('{}: FEAT_A: {} FEAT_B: {} - Correlation: {}'.format(counter, feat_a, feat_b, c))

In [None]:
# Remove the features flagged in `to_remove` and report how many remain.
features = sorted([col for col in features if col not in to_remove])
print(len(features))

In [None]:
# Candidate categorical columns.
# NOTE(review): not referenced by the cells visible in this part of the notebook
# (the models below use `categoricals`) — confirm whether it is still needed.
categorical_features = ['solved_before', 'attempted_before', 'is_last_session', 'is_first_session', 'session_title']

In [None]:
# Preview the first 20 rows of the combined training table.
reduce_train.head(20)

In [None]:
# Regression target: blend the 'accuracy' column (weight 0.6) with 'pseudo_accuracy' (weight 0.4),
reduce_train['prac_accuracy'] = reduce_train['accuracy'] * 0.6 + reduce_train['pseudo_accuracy'] * 0.4
# squash it with 1 - 4^(-x),
reduce_train['target_accuracy'] = reduce_train['prac_accuracy'].apply(lambda x: 1-1/4**(x))
# and rescale so the largest value equals 3.
reduce_train['target_accuracy'] = reduce_train['target_accuracy'] * (3 / reduce_train['target_accuracy'].max())
print(reduce_train['target_accuracy'].describe())
sns.distplot(reduce_train['target_accuracy'])
plt.show()
plt.clf()
# Check: fit rounding thresholds directly on the target and report the quadratic
# weighted kappa they reach against the true accuracy_group.
optR = OptimizedRounder()
optR.fit(reduce_train['target_accuracy'], reduce_train['accuracy_group'])
coefficients = optR.coefficients()
opt_preds = optR.predict(reduce_train['target_accuracy'], coefficients)
print('kappa score:', cohen_kappa_score(reduce_train['accuracy_group'], opt_preds, weights='quadratic'))

In [None]:
# Exclude the session flags and the summed-accuracy columns created in
# secondary_feature_engineering from the model inputs.
cols_to_drop = ['is_first_session', 'is_last_session', 'user_score', 'game_accuracy+mean_previous_accuracy', 'game_accuracy+last_previous_accuracy']
features = sorted([col for col in features if col not in cols_to_drop])

In [None]:
# Categorical inputs: the assessment title plus whichever calendar features
# are present in the current feature list.
time_features = ['hour', 'day', 'dayofweek', 'week', 'month', 'year']
categoricals = ['session_title'] + [col for col in features if col in time_features]
categoricals

# Installation_id as a feature

In [None]:
# For this model family the installation id itself is a categorical input.
# Note: self-referential assignment — re-running this cell appends the entry again.
categoricals = categoricals + ['installation_id']
categoricals

In [None]:
# Add installation_id to the model inputs. Guarded so that re-running the cell
# does not append a duplicate entry to the feature list.
if 'installation_id' not in features:
    features = sorted(features + ['installation_id'])
len(features)

In [None]:
# Rows flagged as both the last and the first session of their installation.
train_no_previous_records_df = reduce_train[(reduce_train['is_last_session'] == 1) & (reduce_train['is_first_session'] == 1)]
# Rows of the combined table whose installation appears in the test-history table...
temp_reduce_test_X = reduce_train[reduce_train['installation_id'].isin(reduce_test_X['installation_id'])]
# ...and the remaining rows.
temp_reduce_train = reduce_train[~reduce_train['installation_id'].isin(reduce_test_X['installation_id'])]
# From the remaining rows, keep only installations that have no first-and-last row.
train_records_df = temp_reduce_train[~temp_reduce_train['installation_id'].isin(train_no_previous_records_df['installation_id'])]

# Installation ids that the fold loop samples its validation sets from
# (taken before the test-history rows are added back).
unique_ids_df = pd.DataFrame()
unique_ids_df['installation_id'] = train_records_df['installation_id'].unique()

# Add the test-history rows back and restore the per-installation ordering.
train_records_df = pd.concat([train_records_df, temp_reduce_test_X]).sort_values(['installation_id', 'accumulated_actions']).reset_index(drop=True)

In [None]:
from catboost import CatBoostRegressor, Pool

In [None]:
def catboost_plot_importances(features, model, n_features=30):
    """
    Plot the most important features of a fitted model as a bar chart.

    :param features: feature names, in the order the model was trained on
    :param model: fitted model exposing ``get_feature_importance()``
    :param n_features: number of top features to draw
    :return: frame with ``features`` and ``importance`` columns, sorted by
        descending importance and restricted to importance > 0
    """
    df = pd.DataFrame()
    df['features'] = features
    df['importance'] = model.get_feature_importance()
    df = df.sort_values('importance', ascending=False)

    top = df.head(n_features)
    # x/y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
    sns.barplot(x=top['importance'], y=top['features'])
    plt.show()
    plt.clf()

    return df[df['importance'] > 0]

In [None]:
# Five-fold training of the model family that uses installation_id as a feature.
# Validation installations are drawn without replacement from unique_ids_df.
ids_df = unique_ids_df.copy()
val_ids_list = []
models_list1 = []
for i in range(5):
    print('------ Fold {} starting -------'.format(i))
    # Sampling 1/(5-i) of the ids still unused gives five roughly equal, disjoint folds.
    val_ids_df = ids_df.sample(frac=1/(5-i), random_state=42)
    val_ids_list += val_ids_df['installation_id'].unique().tolist()
    ids_df = ids_df[~ids_df['installation_id'].isin(val_ids_list)]
    
    train_data = train_records_df[~train_records_df['installation_id'].isin(val_ids_df['installation_id'])]
    val_data = train_records_df[train_records_df['installation_id'].isin(val_ids_df['installation_id'])]
    # Only the last session of each validation installation is held out;
    # its earlier sessions are moved back into the training data.
    val_last = val_data[val_data['is_last_session'] == 1]
    val_not_last = val_data[val_data['is_last_session'] == 0]
    train_data = pd.concat([train_data, val_not_last]).reset_index(drop=True)
    val_data = val_last.copy()
    
    x_train = train_data[features]
    y_train = train_data['target_accuracy']
    
    x_val = val_data[features]
    y_val = val_data['target_accuracy']
    y_val_acc_group = val_data['accuracy_group']
    
    train_pool = Pool(x_train, y_train, cat_features=categoricals)
    val_pool = Pool(x_val, y_val, cat_features=categoricals)
    
    # Very large iteration budget; training actually ends through early stopping on val_pool.
    model = CatBoostRegressor(loss_function='RMSE', eval_metric='RMSE', iterations=1e6, learning_rate=0.01,
                                   random_seed=42, use_best_model=True, depth=8, has_time=True)
    
    model.fit(train_pool, eval_set=val_pool, verbose=100, early_stopping_rounds=300)
    models_list1.append(model)
    preds = model.predict(x_val)
    top_features = catboost_plot_importances(features, model)
    # Fit rounding thresholds on this fold's own validation predictions and
    # report the resulting quadratic weighted kappa.
    optR = OptimizedRounder()
    optR.fit(preds.reshape(-1,), y_val_acc_group, method='Powell')
    coefficients = optR.coefficients()
    print('optimized coefficients:', coefficients)
    opt_preds = optR.predict(preds.reshape(-1, ), coefficients)
    print('Fold kappa score:', cohen_kappa_score(y_val_acc_group, opt_preds, weights='quadratic'))   
    print('------ Fold {} finished ------'.format(i))

In [None]:
# Split the test rows by whether their installation also appears in the
# test-history table (reduce_test_X). Explicit copies are taken because later
# cells add prediction columns to both frames; assigning into a boolean-mask
# slice triggers SettingWithCopyWarning.
has_history_mask = reduce_test['installation_id'].isin(reduce_test_X['installation_id'])
previous_records_df = reduce_test[has_history_mask].copy()
no_records_df = reduce_test[~has_history_mask].copy()
print(previous_records_df.shape, no_records_df.shape)

In [None]:
# Ensemble the installation_id models on train_records_df: average the raw
# predictions across models, and separately average each model's rank of its
# own predictions (scipy rankdata).
preds_list = []
rank_preds_list = []
for model in models_list1:
    preds = model.predict(train_records_df[features])
    preds_list.append(preds)
    preds = rankdata(preds)
    rank_preds_list.append(preds)
train_records_df['raw_preds'] = np.mean(preds_list, axis=0)
train_records_df['rank'] = np.mean(rank_preds_list, axis=0)
sns.distplot(train_records_df['raw_preds'])

In [None]:
# Distribution of the model-averaged prediction rank over train_records_df.
sns.distplot(train_records_df['rank'])

In [None]:
# Same ensembling for the test rows that have history: mean raw prediction and
# mean per-model rank across the installation_id models.
preds_list = []
rank_preds_list = []
for model in models_list1:
    preds = model.predict(previous_records_df[features])
    preds_list.append(preds)
    preds = rankdata(preds)
    rank_preds_list.append(preds)
previous_records_df['raw_preds'] = np.mean(preds_list, axis=0)
previous_records_df['rank'] = np.mean(rank_preds_list, axis=0)
sns.distplot(previous_records_df['raw_preds'])

In [None]:
# Distribution of the model-averaged prediction rank over the test rows with history.
sns.distplot(previous_records_df['rank'])

In [None]:
# Overlay train vs. test raw-prediction distributions to eyeball any shift.
sns.distplot(train_records_df['raw_preds'])
sns.distplot(previous_records_df['raw_preds'])

In [None]:
# Training rows flagged as the last session of their installation.
last_df = train_records_df[train_records_df['is_last_session'] == 1]
print(last_df.shape)

In [None]:
# Overlay raw predictions of last-session training rows and of the test rows with history.
sns.distplot(last_df['raw_preds'])
sns.distplot(previous_records_df['raw_preds'])

In [None]:
# Assessment titles present among the test rows with history (used by the per-title loops below).
unique_titles = previous_records_df['session_title'].unique() 

In [None]:
# One figure per title: raw predictions of last-session training rows vs. test rows with history.
for title in unique_titles:
    train_title_df = last_df[last_df['session_title'] == title]
    test_title_df = previous_records_df[previous_records_df['session_title'] == title]
    
    sns.distplot(train_title_df['raw_preds'])
    sns.distplot(test_title_df['raw_preds'])
    plt.show()
    plt.clf()

In [None]:
# Rows of train_records_df that belong to installations found in the test-history table.
temp_test_X = train_records_df[train_records_df['installation_id'].isin(reduce_test_X['installation_id'])]
print(temp_test_X.shape)

In [None]:
# Overlay raw predictions of the test-history rows and of the test rows to be submitted.
sns.distplot(temp_test_X['raw_preds'])
sns.distplot(previous_records_df['raw_preds'])

In [None]:
# One figure per title: raw predictions of test-history rows vs. test rows with history.
for title in unique_titles:
    train_title_df = temp_test_X[temp_test_X['session_title'] == title]
    test_title_df = previous_records_df[previous_records_df['session_title'] == title]
    
    sns.distplot(train_title_df['raw_preds'])
    sns.distplot(test_title_df['raw_preds'])
    plt.show()
    plt.clf()

In [None]:
# Stack the last-session training rows and the test-history rows.
# The original indexes are kept, so index labels may repeat.
concat_df = pd.concat([last_df, temp_test_X])
print(concat_df.shape)

In [None]:
# Overlay raw predictions of the stacked reference rows and of the test rows with history.
sns.distplot(concat_df['raw_preds'])
sns.distplot(previous_records_df['raw_preds'])

In [None]:
# One figure per title: raw predictions of the stacked reference rows vs. test rows with history.
for title in unique_titles:
    train_title_df = concat_df[concat_df['session_title'] == title]
    test_title_df = previous_records_df[previous_records_df['session_title'] == title]
    
    sns.distplot(train_title_df['raw_preds'])
    sns.distplot(test_title_df['raw_preds'])
    plt.show()
    plt.clf()

In [None]:
# Expected accuracy_group distribution for the test rows with history: the
# per-title distribution observed in temp_test_X, weighted by each title's share
# of previous_records_df.
title_ratio = previous_records_df['session_title'].value_counts(normalize=True)
acc_group_dist = {0:0, 1:0, 2:0, 3:0}
for title in unique_titles:
    title_df = temp_test_X[temp_test_X['session_title'] == title]
    title_acc_group_dist = title_df['accuracy_group'].value_counts(normalize=True)
    # value_counts is indexed by the accuracy_group labels themselves, so iterate
    # over (label, share) pairs. Using range(len(...)) as labels fails with KeyError,
    # or skips a group, whenever a title is missing one of the groups.
    for group, share in title_acc_group_dist.items():
        acc_group_dist[group] += share * title_ratio[title]
acc_group_dist

In [None]:
# Turn the expected class shares into three cut points on the averaged rank:
# bound[i] is the percentile of `rank` at the cumulative share of groups 0..i.
acum = 0
bound = {}
for i in range(3):
    acum += acc_group_dist[i]
    bound[i] = np.percentile(previous_records_df['rank'], acum * 100)
print(bound)

# `classify` is defined outside this part of the notebook; presumably it maps a
# rank to a class 0-3 using the global `bound` — TODO confirm.
previous_records_df['dist_preds'] = np.array(list(map(classify, previous_records_df['rank']))).astype('int')

In [None]:
# Share of each predicted class; should roughly match acc_group_dist.
previous_records_df['dist_preds'].value_counts(normalize=True)

# No installation_id as a feature

In [None]:
# Rebuild the categorical list from scratch, so installation_id is no longer in it.
time_features = ['hour', 'day', 'dayofweek', 'week', 'month', 'year']
categoricals = ['session_title'] + [col for col in features if col in time_features]
categoricals

In [None]:
# Drop installation_id from the model inputs. Filtering instead of list.remove
# keeps the cell safe to re-run: no ValueError when the entry is already gone,
# and every occurrence is removed.
features = sorted(col for col in features if col != 'installation_id')
len(features)

In [None]:
def run_catboost_regression(reduce_train, usefull_features, cat_features=None):
    """
    Train one CatBoost regressor per GroupKFold split (grouped by installation).

    For early stopping, each fold is validated only on the rows flagged
    ``is_first_session == 1``; out-of-fold predictions are produced for every
    validation row.

    :param reduce_train: training frame with ``target_accuracy``, ``accuracy_group``,
        ``installation_id``, ``is_first_session`` and the feature columns
    :param usefull_features: list of feature column names fed to the model
    :param cat_features: names of the categorical features; when ``None`` the
        notebook-level ``categoricals`` list is used (the previous behaviour)
    :return: ``(models_list, top_features_list, coefficients, optR, oof)`` — the fitted
        models, per-fold importance tables, the rounding thresholds fitted on the
        out-of-fold predictions, the fitted rounder, and the out-of-fold predictions
    """
    if cat_features is None:
        # Backward-compatible default. Pass the list explicitly to avoid depending on
        # whichever cell last reassigned the global.
        cat_features = categoricals

    kf = GroupKFold(n_splits=5)
    target = 'target_accuracy'
    models_list = []
    top_features_list = []
    oof = np.zeros(len(reduce_train))
    for fold, (train_idx, val_idx) in enumerate(kf.split(reduce_train, reduce_train[target], reduce_train['installation_id'])):
        print('Fold {}'.format(fold + 1))
        x_train = reduce_train[usefull_features].iloc[train_idx]
        y_train = reduce_train[target].iloc[train_idx]

        # Early-stopping set: first sessions of the validation installations only.
        val_data = reduce_train.iloc[val_idx]
        val_data = val_data[val_data['is_first_session'] == 1]
        x_val = val_data[usefull_features]
        y_val = val_data[target]

        # Out-of-fold scoring uses every validation row.
        x_val_for_oof = reduce_train[usefull_features].iloc[val_idx]
        y_acc_group_for_oof = reduce_train['accuracy_group'].iloc[val_idx]

        train_pool = Pool(x_train, y_train, cat_features=cat_features)
        val_pool = Pool(x_val, y_val, cat_features=cat_features)

        # Very large iteration budget; training actually ends through early stopping.
        model = CatBoostRegressor(loss_function='RMSE', eval_metric='RMSE', iterations=1e6, learning_rate=0.01,
                                   random_seed=42, use_best_model=True, depth=8, has_time=True)
        model.fit(train_pool, eval_set=val_pool, verbose=100, early_stopping_rounds=300)
        preds = model.predict(x_val_for_oof)
        models_list.append(model)
        top_features = catboost_plot_importances(usefull_features, model)
        top_features_list.append(top_features)
        oof[val_idx] = preds

        # Per-fold diagnostic: thresholds fitted on this fold's own predictions.
        optR = OptimizedRounder()
        optR.fit(preds.reshape(-1,), y_acc_group_for_oof, method='Powell')
        coefficients = optR.coefficients()
        print('optimized coefficients:', coefficients)
        opt_preds = optR.predict(preds.reshape(-1, ), coefficients)
        print('Fold kappa score:', cohen_kappa_score(y_acc_group_for_oof, opt_preds, weights='quadratic'))

    # Final thresholds are fitted on all out-of-fold predictions at once.
    optR = OptimizedRounder()
    optR.fit(oof.reshape(-1,), reduce_train['accuracy_group'])
    coefficients = optR.coefficients()
    print('optimized coefficients:', coefficients)
    opt_preds = optR.predict(oof.reshape(-1,), coefficients)
    print(pd.Series(opt_preds).value_counts(normalize=True))
    print('oof kappa score:', cohen_kappa_score(reduce_train['accuracy_group'], opt_preds, weights='quadratic'))
    print('\n')

    return models_list, top_features_list, coefficients, optR, oof

In [None]:
# Train the second model family (feature list without installation_id) on the full
# combined training table and keep its out-of-fold predictions alongside the data.
models_list2, top_features_list, coefficients, optR, oof = run_catboost_regression(reduce_train, features)
reduce_train['oof'] = oof

In [None]:
# Ensemble the second model family on reduce_train: mean raw prediction and
# mean per-model rank (scipy rankdata) across models.
preds_list = []
rank_preds_list = []
for model in models_list2:
    preds = model.predict(reduce_train[features])
    preds_list.append(preds)
    preds = rankdata(preds)
    rank_preds_list.append(preds)
reduce_train['raw_preds'] = np.mean(preds_list, axis=0)
reduce_train['rank'] = np.mean(rank_preds_list, axis=0)
sns.distplot(reduce_train['raw_preds'])

In [None]:
# Distribution of the model-averaged prediction rank over reduce_train.
sns.distplot(reduce_train['rank'])

In [None]:
# Same ensembling for the test rows without history: mean raw prediction and
# mean per-model rank across the second model family.
preds_list = []
rank_preds_list = []
for model in models_list2:
    preds = model.predict(no_records_df[features])
    preds_list.append(preds)
    preds = rankdata(preds)
    rank_preds_list.append(preds)
no_records_df['raw_preds'] = np.mean(preds_list, axis=0)
no_records_df['rank'] = np.mean(rank_preds_list, axis=0)
sns.distplot(no_records_df['raw_preds'])

In [None]:
# Distribution of the model-averaged prediction rank over the test rows without history.
sns.distplot(no_records_df['rank'])

In [None]:
# Overlay raw predictions on the training table and on the test rows without history.
sns.distplot(reduce_train['raw_preds'])
sns.distplot(no_records_df['raw_preds'])

In [None]:
# Training rows flagged as the first session of their installation.
first_df = reduce_train[reduce_train['is_first_session'] == 1]
print(first_df.shape)

In [None]:
# Overlay raw predictions of first-session training rows and of the test rows without history.
sns.distplot(first_df['raw_preds'])
sns.distplot(no_records_df['raw_preds'])

In [None]:
# Ranks here were computed over all of reduce_train, so this shows where the
# first-session rows fall within that overall ranking.
sns.distplot(first_df['rank'])
# not really sure what this graph means

In [None]:
# Distribution of the model-averaged prediction rank over the test rows without history.
sns.distplot(no_records_df['rank'])

In [None]:
# Rebind unique_titles to the titles present among the test rows without history.
unique_titles = no_records_df['session_title'].unique()

In [None]:
# One figure per title: raw predictions of first-session training rows vs. test rows without history.
for title in unique_titles:
    train_title_df = first_df[first_df['session_title'] == title]
    test_title_df = no_records_df[no_records_df['session_title'] == title]
    
    sns.distplot(train_title_df['raw_preds'])
    sns.distplot(test_title_df['raw_preds'])
    plt.show()
    plt.clf()

In [None]:
# Expected accuracy_group distribution for the test rows without history: the
# per-title distribution observed on first-session training rows, weighted by
# each title's share of no_records_df.
title_ratio = no_records_df['session_title'].value_counts(normalize=True)
acc_group_dist = {0:0, 1:0, 2:0, 3:0}
for title in unique_titles:
    title_df = first_df[first_df['session_title'] == title]
    title_acc_group_dist = title_df['accuracy_group'].value_counts(normalize=True)
    # value_counts is indexed by the accuracy_group labels themselves, so iterate
    # over (label, share) pairs. Using range(len(...)) as labels fails with KeyError,
    # or skips a group, whenever a title is missing one of the groups.
    for group, share in title_acc_group_dist.items():
        acc_group_dist[group] += share * title_ratio[title]
acc_group_dist

In [None]:
# Turn the expected class shares into three cut points on the averaged rank:
# bound[i] is the percentile of `rank` at the cumulative share of groups 0..i.
# This rebinds the global `bound` used for the previous test subset.
acum = 0
bound = {}
for i in range(3):
    acum += acc_group_dist[i]
    bound[i] = np.percentile(no_records_df['rank'], acum * 100)
print(bound)

# `classify` is defined outside this part of the notebook; presumably it maps a
# rank to a class 0-3 using the global `bound` — TODO confirm.
no_records_df['dist_preds'] = np.array(list(map(classify, no_records_df['rank']))).astype('int')

In [None]:
# Share of each predicted class; should roughly match acc_group_dist.
no_records_df['dist_preds'].value_counts(normalize=True)

In [None]:
# Recombine the two test subsets, each carrying its own `dist_preds` column.
test_df = pd.concat([previous_records_df, no_records_df]).reset_index(drop=True)
print(test_df.shape)

In [None]:
# Attach the predicted class to each installation_id of the sample submission.
# NOTE(review): merge defaults to an inner join, so an installation_id missing
# from test_df would silently drop out of the submission — confirm the row count
# matches sample_submission.
submission = sample_submission[['installation_id']].merge(test_df[['installation_id', 'dist_preds']], on=['installation_id'])
submission.columns = ['installation_id', 'accuracy_group']
submission.to_csv('submission.csv', index=False)
submission['accuracy_group'].value_counts(normalize=True)