In [32]:
import numpy as np
import pandas as pd
import datetime
from catboost import CatBoostClassifier
from time import time
from tqdm import tqdm_notebook as tqdm
import os
import random
from sklearn.model_selection import KFold


In [23]:
def seed_everything(seed=0):
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

In [24]:
seed_everything(78)

In [2]:
from sklearn.metrics import confusion_matrix
def qwk(act,pred,n=4,hist_range=(0,3)):
    
    O = confusion_matrix(act,pred)
    O = np.divide(O,np.sum(O))
    
    W = np.zeros((n,n))
    for i in range(n):
        for j in range(n):
            W[i][j] = ((i-j)**2)/((n-1)**2)
            
    act_hist = np.histogram(act,bins=n,range=hist_range)[0]
    prd_hist = np.histogram(pred,bins=n,range=hist_range)[0]
    
    E = np.outer(act_hist,prd_hist)
    E = np.divide(E,np.sum(E))
    
    num = np.sum(np.multiply(W,O))
    den = np.sum(np.multiply(W,E))
        
    return 1-np.divide(num,den)
    

In [3]:
train = pd.read_csv('/kaggle/input/data-science-bowl-2019/train.csv')
train_labels = pd.read_csv('/kaggle/input/data-science-bowl-2019/train_labels.csv')
specs = pd.read_csv('/kaggle/input/data-science-bowl-2019/specs.csv')
test = pd.read_csv('/kaggle/input/data-science-bowl-2019/test.csv')
submission = pd.read_csv('/kaggle/input/data-science-bowl-2019/sample_submission.csv')

In [4]:
list_of_user_activities = list(set(train['title'].value_counts().index).union(set(test['title'].value_counts().index)))
activities_map = dict(zip(list_of_user_activities, np.arange(len(list_of_user_activities))))

train['title'] = train['title'].map(activities_map)
test['title'] = test['title'].map(activities_map)
train_labels['title'] = train_labels['title'].map(activities_map)

In [5]:
win_code = dict(zip(activities_map.values(), (4100*np.ones(len(activities_map))).astype('int')))
win_code[activities_map['Bird Measurer (Assessment)']] = 4110

In [6]:
train['timestamp'] = pd.to_datetime(train['timestamp'])
test['timestamp'] = pd.to_datetime(test['timestamp'])

In [7]:
def get_data(user_sample, test_set=False):
    last_activity = 0
    user_activities_count = {'Clip':0, 'Activity': 0, 'Assessment': 0, 'Game':0}
    accuracy_groups = {0:0, 1:0, 2:0, 3:0}
    all_assessments = []
    accumulated_accuracy_group = 0
    accumulated_accuracy=0
    accumulated_correct_attempts = 0 
    accumulated_uncorrect_attempts = 0 
    accumulated_actions = 0
    counter = 0
    durations = []
    for i, session in user_sample.groupby('game_session', sort=False):
        session_type = session['type'].iloc[0]
        session_title = session['title'].iloc[0]
        if test_set == True:
            second_condition = True
        else:
            if len(session)>1:
                second_condition = True
            else:
                second_condition= False
            
        if (session_type == 'Assessment') & (second_condition):
            all_attempts = session.query(f'event_code == {win_code[session_title]}')
            true_attempts = all_attempts['event_data'].str.contains('true').sum()
            false_attempts = all_attempts['event_data'].str.contains('false').sum()
            features = user_activities_count.copy()
            features['session_title'] = session['title'].iloc[0] 
            features['accumulated_correct_attempts'] = accumulated_correct_attempts
            features['accumulated_uncorrect_attempts'] = accumulated_uncorrect_attempts
            accumulated_correct_attempts += true_attempts 
            accumulated_uncorrect_attempts += false_attempts
            if durations == []:
                features['duration_mean'] = 0
            else:
                features['duration_mean'] = np.mean(durations)
            durations.append((session.iloc[-1, 2] - session.iloc[0, 2] ).seconds)
            features['accumulated_accuracy'] = accumulated_accuracy/counter if counter > 0 else 0
            accuracy = true_attempts/(true_attempts+false_attempts) if (true_attempts+false_attempts) != 0 else 0
            accumulated_accuracy += accuracy
            if accuracy == 0:
                features['accuracy_group'] = 0
            elif accuracy == 1:
                features['accuracy_group'] = 3
            elif accuracy == 0.5:
                features['accuracy_group'] = 2
            else:
                features['accuracy_group'] = 1

            features.update(accuracy_groups)
            features['accumulated_accuracy_group'] = accumulated_accuracy_group/counter if counter > 0 else 0
            features['accumulated_actions'] = accumulated_actions
            accumulated_accuracy_group += features['accuracy_group']
            accuracy_groups[features['accuracy_group']] += 1
            if test_set == True:
                all_assessments.append(features)
            else:
                if true_attempts+false_attempts > 0:
                    all_assessments.append(features)
                
            counter += 1

    #         break

        accumulated_actions += len(session)
        if last_activity != session_type:
            user_activities_count[session_type] += 1
            last_activitiy = session_type

    if test_set:
        return all_assessments[-1] 
    return all_assessments

In [8]:
train = get_data(train)

In [11]:
train_data = pd.DataFrame.from_dict(train)
train_data.head()

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,19,7,0,7,10,0,0,0.0,0.0,3,0,0,0,0,0.0,2185
1,22,8,1,9,11,1,0,39.0,1.0,0,0,0,0,1,3.0,2681
2,22,8,2,9,10,1,11,65.5,0.5,3,1,0,0,1,1.5,2768
3,32,13,4,13,10,2,11,41.25,0.5,2,2,0,0,2,1.5,3697
4,36,14,5,16,11,3,12,39.2,0.5,3,2,0,1,2,1.6,4124


In [13]:
train = train_data.copy()
del train_data

In [19]:
x = train.drop(columns=['accuracy_group'],axis=1)
y = train['accuracy_group']

In [35]:
def make_classifier():
    clf = CatBoostClassifier(
                               loss_function='MultiClass',
                               task_type="CPU",
                               learning_rate=0.01,
                               iterations=100,
                               od_type="Iter",
                               early_stopping_rounds=50,
                               random_seed=2019,
                               colsample_bylevel=0.87,
                               eval_metric='Kappa',
                              )
        
    return clf
oof = np.zeros(len(x))

In [None]:
params = {
       'loss_function':'MultiClass',
       'task_type':'CPU',
       'learning_rate':0.01,
       'iterations':20,
       'early_stopping_rounds':5,
       'random_seed':89,
       'colsample_bylevel':0.87,
       'eval_metric':'Kappa',
}

In [36]:
oof = np.zeros(len(x))
NFOLDS = 5
folds = KFold(n_splits=NFOLDS, shuffle=True, random_state=2019)

training_start_time = time()
for fold, (trn_idx, test_idx) in enumerate(folds.split(x, y)):
    start_time = time()
    print(f'Training on fold {fold+1}')
    clf = make_classifier()
    clf.fit(x.loc[trn_idx, all_features], y.loc[trn_idx], eval_set=(x.loc[test_idx, all_features], y.loc[test_idx]),
                          use_best_model=True, verbose=500, cat_features=cat_features)
    oof[test_idx] = clf.predict(x.loc[test_idx, all_features]).reshape(len(test_idx))
    
    print('Fold {} finished in {}'.format(fold + 1, str(datetime.timedelta(seconds=time() - start_time))))
    
print('-' * 30)
print('OOF QWK:', qwk(y, oof))
print('-' * 30)

Training on fold 1
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 89.3ms	remaining: 8.84s
99:	learn: 0.2786716	test: 0.2890752	best: 0.2890752 (99)	total: 2.22s	remaining: 0us

bestTest = 0.2890752386
bestIteration = 99

Fold 1 finished in 0:00:02.500966
Training on fold 2
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 24.4ms	remaining: 2.42s
99:	learn: 0.2887208	test: 0.2619001	best: 0.2647638 (54)	total: 2.17s	remaining: 0us

bestTest = 0.2647637675
bestIteration = 54

Shrink model to first 55 iterations.
Fold 2 finished in 0:00:02.289294
Training on fold 3
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 24.7ms	remaining: 2.44s
99:	learn: 0.2778267	test: 0.2906851	best: 0.2906851 (68)	total: 2.07s	remaining: 0us

bestTest = 0.2906851356
bestIteration = 68

Shrink model to first 69 iterations.
Fold 3 finished in 0:00:02.193022
Training on fold 4
0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)	total: 25ms	remaining: 2.48s
99:	learn

Note that Cross validation is only for the feature engineering part and you don't actually need it if you want to submit the results. You can safely comment it out. 

In [39]:
# process test set
new_test = []
for ins_id, user_sample in tqdm(test.groupby('installation_id', sort=False), total=1000):
    a = get_data(user_sample, test_set=True)
    new_test.append(a)
    
test = pd.DataFrame(new_test)
del new_test

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [40]:
test.head()

Unnamed: 0,Clip,Activity,Assessment,Game,session_title,accumulated_correct_attempts,accumulated_uncorrect_attempts,duration_mean,accumulated_accuracy,accuracy_group,0,1,2,3,accumulated_accuracy_group,accumulated_actions
0,14,7,1,3,22,1,0,30.0,1.0,0,0,0,0,1,3.0,867
1,29,11,5,12,37,4,7,49.6,0.466667,0,1,1,2,1,1.6,2718
2,6,2,0,0,10,0,0,0.0,0.0,0,0,0,0,0,0.0,149
3,10,2,0,1,10,0,0,0.0,0.0,0,0,0,0,0,0.0,233
4,17,1,0,6,37,0,0,0.0,0.0,0,0,0,0,0,0.0,951


In [42]:
# make predictions on test set once
preds = clf.predict(test)
del test

## Make submission

In [44]:
submission['accuracy_group'] = np.round(preds).astype('int')
submission.to_csv('submission.csv', index=None)
submission.head()

Unnamed: 0,installation_id,accuracy_group
0,00abaee7,3
1,01242218,3
2,017c5718,3
3,01a44906,3
4,01bc6cb6,3
