# GoToHack Kaggle
Решение от Никиты Петрова, г.Казань

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
import datetime

In [2]:
location = './kaggle/'
events = pd.DataFrame.from_csv(location+"user_activity.csv",index_col=None)
structure = pd.DataFrame.from_csv(location+"structure.csv",index_col=None)
targets = pd.DataFrame.from_csv(location+"targets.csv",index_col=None)
events_test = pd.DataFrame.from_csv(location+"user_activity_test.csv",index_col=None)

### Идея - выделим все степы-задания и отметим 1 если пользователь проходил степ-задание, 0 - если нет

In [3]:
def getTaskSteps(events):
    z = events.groupby(['step_id', 'step_type']).count()
    z_tasks = z.iloc[(z.index.get_level_values(1) != 'video') & (z.index.get_level_values(1) != 'text')]
    task_steps = z_tasks.index.get_level_values(0)
    return task_steps

In [4]:
def getPivot(events, task_steps):
    passed_tasks = events[events['step_id'].isin(task_steps) & (events['action']=='passed')][['user_id', 'step_id']]
    passed_tasks['indicator'] = 1
    tasks = pd.DataFrame(events[events['step_id'].isin(task_steps)].groupby(['user_id', 'step_id']).sum().index.values)
    passed_tasks[0] = list(zip(passed_tasks.user_id, passed_tasks.step_id))
    ft = tasks.merge(passed_tasks, on=0, how='left')
    ft.indicator.fillna(0, inplace=True)
    ft['user_id'] = ft[0].apply(lambda x: x[0]) 
    ft['step_id'] = ft[0].apply(lambda x: x[1])
    del(ft[0])
    users = pd.DataFrame()
    users['user_id'] = events.user_id.unique()
    ft = ft.merge(users, on='user_id', how='right')
    ft.step_id.fillna(ft.step_id.max(), inplace=True)
    ft.indicator.fillna(0, inplace=True)
    features = ft.pivot_table(index='user_id', columns='step_id', aggfunc=np.min).fillna(0)
    return features

In [6]:
task_steps = getTaskSteps(events)
pivot_train = getPivot(events, task_steps)
pivot_test = getPivot(events_test, task_steps)

In [15]:
pivot_train.head()

Unnamed: 0_level_0,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator,indicator
step_id,2824.0,2825.0,2829.0,2918.0,2919.0,3139.0,4287.0,4289.0,4952.0,4955.0,...,42436.0,42514.0,43605.0,43606.0,43607.0,44097.0,44160.0,44167.0,44174.0,44313.0
user_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


### Идея - посчитаем сколько времени всего пользователь участвовал в курсе, а также суммарное количество сдач и просмотров

In [7]:
def get_duration(ev):
    start_time = ev.groupby(['user_id'])['time'].agg(np.min)
    finish_time = ev.groupby(['user_id'])['time'].agg(np.max)
    duration = finish_time - start_time
    return (duration.index, duration.values)

In [8]:
events.time = events.time.apply(lambda x : datetime.datetime.fromtimestamp(int(x)))
events_test.time = events_test.time.apply(lambda x : datetime.datetime.fromtimestamp(int(x)))

In [10]:
counts = events.groupby(['user_id', 'action'])['time'].count()
X_train = pd.DataFrame()
X_train['user_id'], X_train['duration']  = get_duration(events)
X_train['duration'] = X_train['duration'].apply(lambda x : x.days + (x.seconds // 3600) / 24 )

In [17]:
counts_test = events.groupby(['user_id', 'action'])['time'].count()
X_test = pd.DataFrame()
X_test['user_id'], X_test['duration']  = get_duration(events_test)
X_test['duration'] = X_test['duration'].apply(lambda x : x.days + (x.seconds // 3600) / 24 )

In [12]:
passed_counts = counts.iloc[counts.index.get_level_values(1) == 'passed']
passed_counts.index = passed_counts.index.droplevel(1)
X_train = X_train.join(passed_counts, on='user_id')
X_train = X_train.rename(index=str, columns={'time' : 'passed_count'})

In [18]:
passed_counts_test = counts_test.iloc[counts_test.index.get_level_values(1) == 'passed']
passed_counts_test.index = passed_counts_test.index.droplevel(1)
X_test = X_test.join(passed_counts_test, on='user_id')
X_test = X_test.rename(index=str, columns={'time' : 'passed_count'})

In [13]:
viewed_counts = counts.iloc[counts.index.get_level_values(1) == 'viewed']
viewed_counts.index = viewed_counts.index.droplevel(1)
X_train = X_train.join(viewed_counts, on='user_id')
X_train = X_train.rename(index=str, columns={'time' : 'viewed_count'})

In [19]:
viewed_counts_test = counts_test.iloc[counts_test.index.get_level_values(1) == 'viewed']
viewed_counts_test.index = viewed_counts_test.index.droplevel(1)
X_test = X_test.join(viewed_counts, on='user_id')
X_test = X_test.rename(index=str, columns={'time' : 'viewed_count'})

In [20]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [21]:
X_train.head()

Unnamed: 0,user_id,duration,passed_count,viewed_count
0,1,5.625,254.0,1371.0
1,2,0.0,9.0,11.0
2,3,4.75,31.0,46.0
3,4,4.583333,66.0,105.0
4,6,2.291667,106.0,198.0


In [22]:
pivot_train['duration'] = X_train.duration.values
pivot_train['passed_count'] = X_train.passed_count.values
pivot_train['viewed_count'] = X_train.viewed_count.values

In [23]:
pivot_test['duration'] = X_test.duration.values
pivot_test['passed_count'] = X_test.passed_count.values
pivot_test['viewed_count'] = X_test.viewed_count.values

In [24]:
from sklearn.ensemble import RandomForestClassifier

In [25]:
modul = RandomForestClassifier(n_estimators=10000)

In [26]:
y = targets.passed.values

### Компенсируем несбалансированные классы с помощью class_weight

In [29]:
gsv = GridSearchCV(modul, {'class_weight': [{1: 2}, {1: 5}, {1: 10}, {1: 15}, {1: 25}]}, 
                    n_jobs=1, scoring='f1')

In [30]:
gsv.fit(trainset,y)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10000, n_jobs=1, oob_score=False,
            random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'class_weight': [{1: 2}, {1: 5}, {1: 10}, {1: 15}, {1: 25}]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='f1', verbose=0)

In [31]:
gsv.best_score_

0.32938114197603102

In [32]:
gsv.best_params_

{'class_weight': {1: 2}}

In [55]:
ans = gsv.predict(pivot_test) 

In [28]:
sample_submission = pd.read_csv(location+'ones_only.csv', index_col=None)

In [57]:
result = pd.DataFrame(sample_submission.user_id)
result['passed'] = ans
result.head()

Unnamed: 0,user_id,passed
0,5,0
1,9,0
2,21,0
3,26,0
4,29,0


In [58]:
result.to_csv('submission.csv', index = False)

In [59]:
result.describe()

Unnamed: 0,user_id,passed
count,4255.0,4255.0
mean,10566.669095,0.009166
std,6017.481505,0.095309
min,5.0,0.0
25%,5304.0,0.0
50%,10627.0,0.0
75%,15788.0,0.0
max,20876.0,1.0


## Результат 0.12766

### Идея - так как классы несбалансированные, лучше будет предсказывать баллы которые пользователь наберет и потом по порогу выбирать к какому классу его отнести

In [29]:
train_X = pd.DataFrame()

In [30]:
def change(trainset):
    tr = pd.DataFrame()
    for col in trainset.columns:
        if col[0] == 'indicator':
            tr[col[1]] = trainset[col]
        else:
            tr[col[0]]= trainset[col]
    return tr

In [31]:
train_X = change(pivot_train)

In [32]:
train_X.head()

Unnamed: 0_level_0,2824.0,2825.0,2829.0,2918.0,2919.0,3139.0,4287.0,4289.0,4952.0,4955.0,...,43606.0,43607.0,44097.0,44160.0,44167.0,44174.0,44313.0,duration,passed_count,viewed_count
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.625,254.0,1371.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,11.0
3,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.75,31.0,46.0
4,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.583333,66.0,105.0
6,1.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,2.291667,106.0,198.0


In [33]:
test_X = change(pivot_test)

In [34]:
from sklearn.ensemble import RandomForestRegressor

In [35]:
y_r = targets.score.values

In [36]:
rfr = RandomForestRegressor(n_estimators=10000)

In [37]:
param = {'max_depth':[None,10,20], 'max_features':['auto',5, 10,25,50]} 

In [24]:
gcv2 = GridSearchCV(rfr, param,  n_jobs=2)

In [25]:
gcv2.fit(train_X, y_r)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10000, n_jobs=1, oob_score=False,
           random_state=None, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=2,
       param_grid={'max_features': ['auto', 5, 10, 25, 50], 'max_depth': [None, 10, 20]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [30]:
gcv2.best_score_ 

0.39619278334641644

In [31]:
gcv2.best_params_

{'max_depth': 10, 'max_features': 5}

In [37]:
#сохраним лучшую модель
with open('regression.pickle','wb+') as f:
    pickle.dump(obj=gcv2, file=f)

In [42]:
result = pd.DataFrame(sample_submission.user_id.values)

In [49]:
result['passed'] = gcv2.predict(test_X)

In [50]:
#Стандартный порог узнаем из тренировочного набора
targets[targets.passed == 1].score.min()

91

In [53]:
result['passed2'] = result.passed.apply(lambda x: 1 if x > 90.4 else 0)

In [46]:
result.head()

Unnamed: 0,0,passed
0,5,0
1,9,0
2,21,0
3,26,0
4,29,1


# Проверка смещения регрессии по распределениям предсказания и тестовой выборки

In [50]:
result.describe()

Unnamed: 0,0,passed
count,4255.0,4255.0
mean,10566.669095,11.204049
std,6017.481505,17.750898
min,5.0,1.622058
25%,5304.0,1.794377
50%,10627.0,2.96289
75%,15788.0,12.355861
max,20876.0,110.826534


In [102]:
result['r'] = result.passed.apply(lambda x: 1 if x > 64.82 else 0)

In [103]:
result.describe()

Unnamed: 0,0,passed,r,passed2
count,4255.0,4255.0,4255.0,4255.0
mean,10566.669095,11.204049,0.039718,0.00329
std,6017.481505,17.750898,0.195319,0.057273
min,5.0,1.622058,0.0,0.0
25%,5304.0,1.794377,0.0,0.0
50%,10627.0,2.96289,0.0,0.0
75%,15788.0,12.355861,0.0,0.0
max,20876.0,110.826534,1.0,1.0


In [104]:
targets.describe()

Unnamed: 0,user_id,score,passed,distinction
count,16625.0,16625.0,16625.0,16625.0
mean,10408.208301,11.727459,0.039639,0.028451
std,6030.045326,25.757249,0.195116,0.166263
min,1.0,0.0,0.0,0.0
25%,5188.0,0.0,0.0,0.0
50%,10411.0,1.0,0.0,0.0
75%,15626.0,8.0,0.0,0.0
max,20880.0,119.0,1.0,1.0


In [105]:
result2 = pd.DataFrame()
result2['user_id'] = result[0]
result2['passed'] = result.r

In [106]:
result2.head()

Unnamed: 0,user_id,passed
0,5,0
1,9,0
2,21,0
3,26,0
4,29,1


In [107]:
result2.to_csv('submission_regression_unbiased.csv', index=False)

## Результат 0.41808