# In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import json
import ast
from tqdm import tqdm
from collections import Counter
from matplotlib import pyplot as plt
import time


# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

# Load the competition tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/data-science-bowl-2019/train.csv")
train_labels = pd.read_csv("/kaggle/input/data-science-bowl-2019/train_labels.csv")
test = pd.read_csv("/kaggle/input/data-science-bowl-2019/test.csv")
# The submission code at the end of the script fills and writes
# `sample_submission`; the template must be bound to exactly that name
# (it was previously misspelled, which made the final step raise NameError).
sample_submission = pd.read_csv("/kaggle/input/data-science-bowl-2019/sample_submission.csv")




# Exact `event_data` payload strings that carry no useful signal.
# data_preprocessing() drops every row whose `event_data` equals one of
# these strings (exact string match, not a JSON-level comparison).
stop_events = ['{"event_code": 2000, "event_count": 1}',
 '{"version":"1.0","event_count":1,"game_time":0,"event_code":2000}',
 '{"version":"1.0","level":0,"round":0,"event_count":1,"game_time":0,"event_code":2000}',
 '{"version":"1.0","round":0,"event_count":1,"game_time":0,"event_code":2000}',
 '{"description":"Let\'s set off these fireworks. We can drag them to any height we want!","identifier":"Dot_LetsFireworks,Dot_DragAnyHeight","media_type":"audio","total_duration":3885,"event_count":2,"game_time":0,"event_code":3010}',
 '{"version":"1","round":0,"event_count":1,"game_time":0,"event_code":2000}',
              '{"event_count":2,"game_time":77,"event_code":2025}']


def get_data_from(data):
    """Summarise the event rows of one game session into a flat feature dict.

    Parameters
    ----------
    data : pandas.DataFrame
        Event rows of a single session. Must provide the columns ``type``,
        ``title``, ``world`` and a datetime-typed ``timestamp``. The JSON
        payload is read positionally from column 3 and the value stored as a
        description's id is read from column 0 (presumably ``event_data`` and
        ``event_id`` -- confirm against the loaded CSV layout).

    Returns
    -------
    dict
        Feature name -> value. Contains game-time statistics (sum and the
        0/25/50/75/100th percentiles), correct/incorrect counters, the three
        most frequent ``description`` texts with their count and the id of
        the last row that carried them (``None`` when absent), the modal
        ``type``/``title``/``world``, and wall-clock start/end information.
        ``duration`` is ``end_time - start_time`` in nanoseconds
        (difference of ``Timestamp.value``).

    Raises
    ------
    KeyError
        If a payload has no ``"game_time"`` key.
    """
    features = {}

    # Placeholders are inserted first only to pin the key order: the caller
    # turns these dicts into DataFrame columns in insertion order.
    for placeholder in (
        "total_time", "time0", "time25", "time50", "time75", "time100",
        "start_time", "end_time", "day_of_start", "hour_of_start",
        "day_of_end", "hour_of_end", "num_false", "num_cor",
    ):
        features[placeholder] = 0
    features["num_events"] = len(data)
    features["type"] = data["type"].mode().values[0]
    features["title"] = data["title"].mode().values[0]
    features["world"] = data["world"].mode().values[0]

    game_times = []
    descriptions = []
    dict2id = {}  # description text -> column-0 value of the last row using it
    for row in data.values:
        payload = dict(json.loads(row[3]))

        if "description" in payload:
            descriptions.append(payload["description"])
            dict2id[payload["description"]] = row[0]
        if "correct" in payload:
            if payload["correct"]:
                features["num_cor"] += 1
            else:
                features["num_false"] += 1
        game_times.append(payload["game_time"])
    times = np.array(game_times)

    # Descriptions ordered by frequency; ties keep first-seen order.
    ranked = Counter(descriptions).most_common()
    features["cnt_descr"] = len(ranked)
    for rank, suffix in enumerate(("", "2", "3")):
        base = "most_popular_dscr" + suffix
        if rank < len(ranked):
            text, count = ranked[rank]
            features[base] = text
            features[base + "_id"] = dict2id[text]
            features[base + "_num"] = count
        else:
            features[base] = None
            features[base + "_id"] = None
            features[base + "_num"] = None

    features["total_time"] = times.sum()
    # np.percentile takes q on a 0-100 scale; the previous
    # np.linspace(0, 1, 5) asked for the 0th..1st percentiles, so
    # time25..time100 were all (almost) the minimum.
    (
        features["time0"],
        features["time25"],
        features["time50"],
        features["time75"],
        features["time100"],
    ) = np.percentile(times, [0, 25, 50, 75, 100])

    features["start_time"] = data["timestamp"].min()
    features["end_time"] = data["timestamp"].max()
    features["day_of_start"] = features["start_time"].day_name()
    features["hour_of_start"] = features["start_time"].hour
    features["day_of_end"] = features["end_time"].day_name()
    features["hour_of_end"] = features["end_time"].hour
    features["duration"] = (features["end_time"].value - features["start_time"].value)
    return features



def data_preprocessing(data):
    """Build one feature row per (installation_id, game_session) pair.

    Rows whose ``event_data`` string is listed in ``stop_events`` are dropped,
    ``timestamp`` is parsed to datetimes, and each session's rows are
    summarised with ``get_data_from``. The resulting frame is sorted by
    ``start_time`` then ``end_time`` and re-indexed from 0.

    :param data: raw event DataFrame with at least ``event_data``,
        ``timestamp``, ``installation_id`` and ``game_session`` columns, plus
        whatever ``get_data_from`` reads.
    :return: DataFrame with ``installation_id``, ``game_session`` and one
        column per feature returned by ``get_data_from``.
    """
    keep_mask = data["event_data"].apply(lambda payload: payload not in stop_events)
    data = data[keep_mask].reset_index(drop=True)
    data["timestamp"] = pd.to_datetime(data["timestamp"])

    # After reset_index the group labels double as positional row numbers,
    # which is what the .iloc lookup below relies on.
    session_rows = data.groupby(by=["installation_id", "game_session"]).groups
    session_keys = list(session_rows.keys())

    columns = {}
    for session_key in tqdm(session_keys):
        session_features = get_data_from(data.iloc[session_rows[session_key]])

        installation_id, game_session = session_key[0], session_key[1]
        columns.setdefault("installation_id", []).append(installation_id)
        columns.setdefault("game_session", []).append(game_session)
        for name, value in session_features.items():
            columns.setdefault(name, []).append(value)

    features_data = pd.DataFrame(columns)
    ordered = features_data.sort_values(by=["start_time", "end_time"])
    return ordered.reset_index(drop=True)


def merge_train(prep_data, train_labels):
    """Left-join session features onto the label rows, oldest session first.

    :param prep_data: per-session feature frame holding ``game_session``,
        ``installation_id``, ``start_time`` and ``end_time``.
    :param train_labels: label frame keyed by ``game_session`` and
        ``installation_id``; every one of its rows is kept.
    :return: merged frame sorted by ``start_time`` then ``end_time`` with a
        fresh 0..n-1 index (rows without features sort last).
    """
    join_keys = ["game_session", "installation_id"]
    labelled = train_labels.merge(prep_data, how="left", on=join_keys)
    chronological = labelled.sort_values(by=["start_time", "end_time"])
    return chronological.reset_index(drop=True)
    
    
    
# Build per-session feature tables for the train and test event logs.
prep_data = data_preprocessing(train)
# prep_data.to_csv("prep_data.csv")

prep_test = data_preprocessing(test)
# prep_test.to_csv("prep_test.csv")


# Attach the session features to the labelled assessment rows.
for_train = merge_train(prep_data, train_labels)

# Features passed to CatBoost as categorical.
# NOTE(review): 'title_y' presumably appears because train_labels also has a
# `title` column, so the merge suffixes the feature-side title with `_y` -- confirm.
cat_columns = ['title_y', 'world', 'most_popular_dscr', 'most_popular_dscr_id', 'most_popular_dscr2', 'most_popular_dscr2_id', 
              'most_popular_dscr3', 'most_popular_dscr3_id', 'day_of_start', 'day_of_end']

# Features passed to CatBoost as numeric.
# NOTE(review): 'start_time' and 'end_time' are pandas Timestamps produced by
# get_data_from (min/max of `timestamp`) -- confirm they are usable as numeric inputs.
numeric_columns = ['total_time', 'time0',
       'time25', 'time50', 'time75', 'time100', 'start_time', 'end_time',
        'hour_of_start', 'hour_of_end',
       'num_false', 'num_cor', 'num_events','most_popular_dscr_num',
               'most_popular_dscr2_num',  'most_popular_dscr3_num', 'duration'    
                  ]

# Names of the two label columns that are modelled separately below.
target1 = 'num_correct'
target2 = 'num_incorrect'


from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, CatBoostClassifier

# Replace missing values: the string "Na" for categorical features, 0 for numeric ones.
for_train[cat_columns] = for_train[cat_columns].fillna("Na")
for_train[numeric_columns] = for_train[numeric_columns].fillna(0)

# Hold out the last 30% of rows for validation. shuffle=False keeps the
# start_time/end_time ordering produced by merge_train, so the split is
# chronological. Both splits contain the same rows and differ only in target.
# The X_* frames still hold every column; features are selected at fit time.
X_train1, X_test1, y_train1, y_test1 = train_test_split(for_train,for_train[target1], test_size = 0.3, shuffle = False)

X_train2, X_test2, y_train2, y_test2 = train_test_split(for_train,for_train[target2], test_size = 0.3, shuffle = False)

# Hyper-parameters for the `num_correct` classifier.
# NOTE(review): the values look like the output of a hyper-parameter search
# that is not part of this file. "has_time" presumably tells CatBoost to
# respect the row order of the input -- confirm against the CatBoost docs.
params1= {'bagging_temperature': 0.05363349091949243,
   'depth': 4,
   'iterations': 960,
   'l2_leaf_reg': 1.4404906764735232,
   'learning_rate': 0.04306412229499572,
   'max_ctr_complexity': 6,
   'model_size_reg': 9.33853472553047,
   'random_strength': 0.5477062496599889,
          "has_time" : True}

# Model 1: classifier for target1 (`num_correct`), validated on the held-out rows.
model = CatBoostClassifier(**params1)

model.fit(X_train1[cat_columns + numeric_columns], y_train1,
          eval_set = (X_test1[cat_columns + numeric_columns], y_test1), 
          cat_features=cat_columns, plot = True)

# Predicted labels for the held-out rows; consumed by make_pred further down.
pred1 = model.predict(X_test1[cat_columns + numeric_columns])

# Hyper-parameters for the `num_incorrect` regressor (same caveats as params1).
params2 ={'bagging_temperature': 8.322400566687106,
'depth': 4,
'iterations': 5000,
'l2_leaf_reg': 30.306334651665228,
'learning_rate': 0.002511844524894725,
'max_ctr_complexity': 7,
'model_size_reg': 5.6099373073302345,
'random_strength': 2.320658986302457,
      "has_time" : True}

# Model 2: regressor for target2 (`num_incorrect`).
model2 = CatBoostRegressor(**params2)
model2.fit(X_train2[cat_columns + numeric_columns], y_train2,
          eval_set = (X_test2[cat_columns + numeric_columns], y_test2), 
          cat_features=cat_columns, plot = True)

pred2 = model2.predict(X_test2[cat_columns + numeric_columns])

# Scatter of predicted vs. true `num_incorrect`, with the identity line for reference.
plt.plot(y_test2, pred2, ".")
plt.plot(y_test2, y_test2)


from sklearn.base import BaseEstimator, TransformerMixin

def qwk(a1, a2, max_rat=3):
    """
    Quadratic weighted kappa between two sequences of integer ratings.

    Source: https://www.kaggle.com/c/data-science-bowl-2019/discussion/114133#latest-660168

    :param a1: first sequence of ratings; cast to int, values in ``0..max_rat``
    :param a2: second sequence of ratings, same length as ``a1``
    :param max_rat: largest possible rating (default 3, i.e. ratings 0..3)
    :return: ``1 - o / e`` where ``o`` is the sum of squared rating
        differences and ``e`` the same quantity expected from the two rating
        histograms; NaN when it is undefined (empty input, or ``e == 0``
        because both sequences use one and the same single rating)
    :raises ValueError: if the two sequences differ in shape
    :raises IndexError: if a rating lies outside ``0..max_rat``
    """
    a1 = np.asarray(a1, dtype=int)
    a2 = np.asarray(a2, dtype=int)
    if a1.shape != a2.shape:
        raise ValueError("a1 and a2 must have the same shape")

    n = a1.shape[0]
    if n == 0:
        return np.nan

    # Reject out-of-range ratings explicitly: negative values would otherwise
    # be counted in the wrong histogram bin instead of failing.
    lowest = min(a1.min(), a2.min())
    highest = max(a1.max(), a2.max())
    if lowest < 0 or highest > max_rat:
        raise IndexError("ratings must lie in the range 0..max_rat")

    # Observed disagreement: sum of squared rating differences.
    diff = a1 - a2
    o = np.sum(diff * diff)

    # Expected disagreement if the two ratings were independent draws from
    # their own histograms.
    hist1 = np.bincount(a1, minlength=max_rat + 1)
    hist2 = np.bincount(a2, minlength=max_rat + 1)
    ratings = np.arange(max_rat + 1)
    weights = (ratings[:, None] - ratings[None, :]) ** 2
    e = np.sum(np.outer(hist1, hist2) * weights) / n

    if e == 0:
        # 0 / 0: kappa is undefined; return NaN without a RuntimeWarning.
        return np.nan

    return 1 - o / e


def eval_qwk_lgb(y_true, y_pred):
    """
    Wrap ``qwk`` as an lgb-style evaluation result.

    :param y_true: true ratings
    :param y_pred: predicted ratings, already discrete (no argmax is applied here)
    :return: the tuple ``('cappa', score, True)``
    """
    kappa = qwk(y_true, y_pred)
    return 'cappa', kappa, True

def make_pred(pred1, pred2, a, b):
    """Combine the two model outputs into a discrete 0..3 label per row.

    Rows where ``pred1`` is 0 stay 0. For the other rows the label depends on
    ``pred2``: ``>= a`` gives 1, ``[b, a)`` gives 2 and ``< b`` gives 3.

    :param pred1: array-like; rows equal to 0 are always labelled 0
    :param pred2: array-like of numbers compared against the thresholds
    :param a: upper threshold
    :param b: lower threshold
    :return: 1-D float array of length ``len(pred1)``
    """
    labels = np.zeros(len(pred1))
    nonzero = pred1 != 0
    # Bands are applied in this order; if the thresholds overlap (a < b),
    # the later assignment wins.
    bands = (
        (1, pred2 >= a),
        (2, (pred2 >= b) & (pred2 < a)),
        (3, pred2 < b),
    )
    for label, in_band in bands:
        labels[nonzero & in_band] = label
    return labels

# Combine the two hold-out predictions into discrete labels using the
# thresholds a=1.5 and b=0.6.
y_pred1 = make_pred(pred1, pred2, 1.5, 0.6)

# Hold-out quadratic weighted kappa against the true `accuracy_group`.
# As a bare expression, the result is only displayed when run as a notebook cell.
eval_qwk_lgb(X_test1["accuracy_group"].values, make_pred(pred1, pred2, 1.5, 0.6))

# The models were trained on a column named 'title_y'; mirror it on the test features.
prep_test['title_y'] = prep_test["title"]

# Same missing-value treatment as for the training frame.
prep_test[numeric_columns] = prep_test[numeric_columns].fillna(0)
prep_test[cat_columns] = prep_test[cat_columns].fillna("Na")

# Predict only for sessions whose modal `type` is "Assessment".
final_pred1 = model.predict(prep_test[prep_test["type"] == "Assessment"][cat_columns + numeric_columns])
final_pred2 = model2.predict(prep_test[prep_test["type"] == "Assessment"][cat_columns + numeric_columns])
final_pred = make_pred(final_pred1, final_pred2, 1.5, 0.6)
# installation_id of each predicted row, aligned positionally with final_pred.
ids = prep_test[prep_test["type"] == "Assessment"]["installation_id"]
# One value per installation: the mean label over all of its assessment sessions.
# Expects `sample_submission` to be the DataFrame loaded from sample_submission.csv.
# NOTE(review): .loc[i, ...] assumes the default 0..n-1 index -- confirm.
for i, inst in enumerate(sample_submission["installation_id"].values):
    sample_submission.loc[i, "accuracy_group"] = final_pred[ids == inst].mean()
    
# Installations without any assessment session yield NaN (mean of an empty
# selection) and fall back to 0.
sample_submission = sample_submission.fillna(0)
# NOTE(review): astype(int) truncates the mean toward zero (e.g. 2.9 -> 2) rather
# than rounding -- confirm this is intended.
sample_submission["accuracy_group"] = sample_submission["accuracy_group"].astype(int)
sample_submission.to_csv('submission.csv', index=False)

# Notebook cell output (files listed under /kaggle/input):
# /kaggle/input/data-science-bowl-2019/sample_submission.csv
# /kaggle/input/data-science-bowl-2019/specs.csv
# /kaggle/input/data-science-bowl-2019/train_labels.csv
# /kaggle/input/data-science-bowl-2019/test.csv
# /kaggle/input/data-science-bowl-2019/train.csv
#
# The recorded run of this cell ended with:
# KeyboardInterrupt: