#### Подключаем требуемые библиотеки

In [1]:
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import roc_auc_score

#### Функции для предобратки данных

In [2]:
# Добавляет в новый датафрейм колоник с датой и днем
def add_day(df):
    df_day = df.copy()
    df_day['date'] = pd.to_datetime(df_day.timestamp, unit='s')
    df_day['day'] = df_day.date.dt.date
    return df_day

In [3]:
# Создает датафрейм из номера пользователя и того, завершил ли он курс
def is_finished(sub_df):
    correct_steps = sub_df.query('submission_status=="correct"').groupby('user_id', as_index=False)\
    .agg({'step_id':'count'}).rename(columns={'step_id':'corrects'})
    passed_course = correct_steps
    passed_course.corrects = pd.Series(np.where(passed_course.corrects>=40, 1, 0))
    passed_course=passed_course.rename(columns={'corrects':'finished'})
    return passed_course

In [4]:
# Фильтрует данные, оставляя действия за первые два дня активности пользователя
def filter_by_days(df, timedelta=(2*24*60*60)):
    filtered_df = df.copy()
    min_timestamp = filtered_df.groupby('user_id').agg({'timestamp':'min'}).rename(columns={'timestamp':'min_t'})
    filtered_df = filtered_df.merge(min_timestamp, on='user_id', how='outer')
    filtered_df = filtered_df.query('timestamp < min_t + @timedelta')
    return filtered_df

In [5]:
# Создает датафрейм из номера пользователя и числа шагов, которые он попытался решить
def steps_tried(sub_df):
    submission_df = sub_df.copy()
    steps_tried = submission_df.groupby('user_id', as_index=False).agg({'step_id':'count'})\
    .rename(columns = {'step_id':'steps_tried'})
    return steps_tried

In [6]:
# Разворачивает номинативные переменые в определенной колонке
def expand_features(df, expand_col):
    exp_df = df.copy()
    exp_df = exp_df.pivot_table(index='user_id',
                        columns=expand_col,
                        values='step_id',
                        aggfunc='count',
                        fill_value=0).reset_index().rename_axis('', axis=1)
    return exp_df

In [7]:
# Загружаем данные
event_train = pd.read_csv(r'C:\Users\Вова\Downloads\event_data_train\event_data_train.csv')
submission_train=pd.read_csv(r'C:\Users\Вова\Downloads\submissions_data_train\submissions_data_train.csv')
submission_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/submission_data_test.csv')
event_test = pd.read_csv('https://stepik.org/media/attachments/course/4852/events_data_test.csv')

In [8]:
# Добавляем дни
event_train = add_day(event_train)
submission_train = add_day(submission_train)
event_test = add_day(event_test)
submission_test = add_day(submission_test)

In [9]:
# Определяем, кто закончил курс, а кто нет
passed_train = is_finished(submission_train)
passed_test = is_finished(submission_test)

In [10]:
passed_test

Unnamed: 0,user_id,finished
0,12,0
1,13,0
2,15,0
3,21,0
4,35,0
...,...,...
2758,26775,1
2759,26780,0
2760,26785,0
2761,26796,0


In [11]:
# Отираем данные за первые два дня
event_train_filtered = filter_by_days(event_train)
submission_train_filtered = filter_by_days(submission_train)
event_test_filtered = filter_by_days(event_test)
submission_test_filtered = filter_by_days(submission_test)

In [12]:
event_test_filtered

Unnamed: 0,step_id,timestamp,action,user_id,date,day,min_t
0,30456,1526893787,viewed,24417,2018-05-21 09:09:47,2018-05-21,1526893439
1,30456,1526893797,viewed,24417,2018-05-21 09:09:57,2018-05-21,1526893439
2,30456,1526893954,viewed,24417,2018-05-21 09:12:34,2018-05-21,1526893439
3,30456,1526895780,viewed,24417,2018-05-21 09:43:00,2018-05-21,1526893439
4,30456,1526893787,discovered,24417,2018-05-21 09:09:47,2018-05-21,1526893439
...,...,...,...,...,...,...,...
274763,158433,1545846175,discovered,3649,2018-12-26 17:42:55,2018-12-26,1545846175
274764,158433,1545846175,passed,3649,2018-12-26 17:42:55,2018-12-26,1545846175
274765,158433,1545846283,viewed,20953,2018-12-26 17:44:43,2018-12-26,1545846283
274766,158433,1545846283,discovered,20953,2018-12-26 17:44:43,2018-12-26,1545846283


In [13]:
# Находим число попыток решить задачу
steps_train = steps_tried(submission_train_filtered)
steps_test = steps_tried(submission_test_filtered)

In [14]:
# Разворачиваем номинативные переменные в колонки
event_train_filtered_expanded = expand_features(event_train_filtered, 'action')
submission_train_filtered_expanded = expand_features(submission_train_filtered, 'submission_status')
event_test_filtered_expanded = expand_features(event_test_filtered, 'action')
submission_test_filtered_expanded = expand_features(submission_test_filtered, 'submission_status')

In [28]:
event_train_filtered_expanded

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed
0,1,1,0,0,1
1,2,9,9,2,9
2,3,15,15,4,20
3,5,1,1,0,1
4,7,1,1,0,1
...,...,...,...,...,...
19229,26790,2,2,0,2
19230,26793,1,0,1,1
19231,26794,50,50,24,90
19232,26797,10,10,2,10


In [15]:
# Добавляем фичу отношения верных ответов ко всем
submission_train_filtered_expanded['rate'] = submission_train_filtered_expanded.correct /\
(submission_train_filtered_expanded.correct + submission_train_filtered_expanded.wrong)
submission_test_filtered_expanded['rate'] = submission_test_filtered_expanded.correct /\
(submission_test_filtered_expanded.correct + submission_test_filtered_expanded.wrong)

In [16]:
# Объединяем датафреймы
user_train = pd.merge(event_train_filtered_expanded, 
                      submission_train_filtered_expanded, 
                      on='user_id', how='outer').fillna(0)
user_train = user_train.merge(steps_train, on='user_id', how='outer').fillna(0)
user_train = user_train.merge(passed_train, on='user_id', how='outer').fillna(0)

In [17]:
user_train

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong,rate,steps_tried,finished
0,1,1,0,0,1,0.0,0.0,0.000000,0.0,0.0
1,2,9,9,2,9,2.0,0.0,1.000000,2.0,0.0
2,3,15,15,4,20,4.0,4.0,0.500000,8.0,0.0
3,5,1,1,0,1,2.0,2.0,0.500000,4.0,0.0
4,7,1,1,0,1,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
19229,26790,2,2,0,2,1.0,0.0,1.000000,1.0,0.0
19230,26793,1,0,1,1,0.0,0.0,0.000000,0.0,0.0
19231,26794,50,50,24,90,24.0,7.0,0.774194,31.0,0.0
19232,26797,10,10,2,10,2.0,0.0,1.000000,2.0,0.0


In [18]:
user_test = pd.merge(event_test_filtered_expanded, 
                      submission_test_filtered_expanded, 
                      on='user_id', how='outer').fillna(0)
user_test = user_test.merge(steps_test, on='user_id', how='outer').fillna(0)
user_test = user_test.merge(passed_test, on='user_id', how='outer').fillna(0)

In [19]:
user_test

Unnamed: 0,user_id,discovered,passed,started_attempt,viewed,correct,wrong,rate,steps_tried,finished
0,4,1,1,0,1,0.0,0.0,0.000000,0.0,0.0
1,6,1,1,0,1,0.0,0.0,0.000000,0.0,0.0
2,10,2,2,0,6,0.0,0.0,0.000000,0.0,0.0
3,12,11,9,4,14,1.0,0.0,1.000000,1.0,0.0
4,13,70,70,35,105,29.0,36.0,0.446154,65.0,0.0
...,...,...,...,...,...,...,...,...,...,...
6179,26791,1,1,0,1,0.0,0.0,0.000000,0.0,0.0
6180,26795,1,1,0,1,0.0,0.0,0.000000,0.0,0.0
6181,26796,6,4,2,12,2.0,3.0,0.400000,5.0,0.0
6182,26799,6,6,2,6,2.0,0.0,1.000000,2.0,0.0


In [20]:
# Формируем данные для обучения, валидации и теста
y = user_train.finished
x = user_train.drop('finished', axis=1)
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.33, random_state=42)
y_test = user_test.finished
x_test = user_test.drop('finished', axis=1)

#### Обучение модели

In [61]:
# Создаем случаный лес
rf = RandomForestClassifier()
# Задаем широкий список параметров
params = {'n_estimators':range(10,50,3), 
          'criterion':['gini', 'entropy'], 
          'max_depth':range(1,10,2), 
          'min_samples_split':range(2,10), 
          'min_samples_leaf':range(1,10,2)}
# Для увеличения скорости обучения используем RandomizedSearchCV
rand_search = RandomizedSearchCV(rf, params, cv=5)
# Обучаем модель
rand_search.fit(x_train, y_train)
# Смотрим на полученные лучшие параметры
rand_search.best_params_

{'n_estimators': 37,
 'min_samples_split': 8,
 'min_samples_leaf': 9,
 'max_depth': 7,
 'criterion': 'entropy'}

In [21]:
# Задаем узкий список параметров исходя из результатов предыдущего шага
parametres = {
    'n_estimators':[37,39], 
    'criterion':['entropy'], 
    'max_depth':[6,7], 
    'min_samples_split':[8,9], 
    'min_samples_leaf':[6,9]}

In [22]:
random_forest = RandomForestClassifier()
# Перебираем все комбинации параметров
grid_search = GridSearchCV(random_forest, parametres, cv=5)
# Обучаем модель
grid_search.fit(x_train, y_train)
grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 9,
 'min_samples_split': 9,
 'n_estimators': 37}

In [23]:
# Модель с лучшими параметрами
best_clf = grid_search.best_estimator_
# Предасказываем вероятность прохождения курса по валидационным данным
y_pred_proba = best_clf.predict_proba(x_valid)[:,1]

In [24]:
# Считаем валидационный ROC Score
roc = roc_auc_score(y_valid, y_pred_proba)
roc

0.8815477158841412

In [25]:
# Создаем датасет на тестовых данных, где предсказана вероятность прохождения курса каждым пользователем
test_csv = pd.DataFrame({'is_gone':best_clf.predict_proba(x_test)[:,1]}, index=x_test.user_id)
test_csv

Unnamed: 0_level_0,is_gone
user_id,Unnamed: 1_level_1
4,0.000000
6,0.000000
10,0.000000
12,0.052449
13,0.666451
...,...
26791,0.000000
26795,0.000000
26796,0.074042
26799,0.125455


In [26]:
# Создаем файл, который потом загружается на проверку
test_csv.to_csv()

#### Результат, при загрузке файла на проверку: Your ROC score is 0.8895367080048218