In [2]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import scipy
import datetime
import tqdm
from utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


**Данные о просмотрах каналов**

In [4]:
df = pd.read_csv('dataset11-30.csv')
df.start_time = pd.to_datetime(df.start_time)
df.stop_time = pd.to_datetime(df.stop_time)

**Данные о тв-передачах**

In [3]:
df_info = pd.read_csv('export_arh_11-20-final.csv').append(pd.read_csv('export_arh_21-30-final.csv'))
df_info.start_time = pd.to_datetime(df_info.start_time, format='%d.%m.%Y %H:%M:%S')
df_info['stop_time'] = df_info['start_time'] + pd.to_timedelta(df_info['duration'], unit='s')
df_info = df_info[df_info.tv_show_id != 0].copy()
df_info_future = pd.read_csv('export_arh_31-42-final.csv', low_memory=False)

**Данные о просмотрах тв-передач**

In [12]:
df_show = pd.read_csv('dataset_with_shows.csv')
for c in ['start_time', 'show_start_time', 'stop_time','show_stop_time']:
    df_show[c] = pd.to_datetime(df_show[c])
df_show['start_show_user_time'] = df_show[['show_start_time','start_time']].max(axis=1)
df_show['stop_show_user_time'] = df_show[['stop_time','show_stop_time']].min(axis=1)
df_show['user_watch_time'] = (df_show['stop_show_user_time'] - df_show['start_show_user_time']).dt.total_seconds()
df_show['show_duration'] = (df_show['show_stop_time'] - df_show['show_start_time']).dt.total_seconds()
df_show['user_watch_perc'] = df_show['user_watch_time'] / df_show['show_duration']

**Holdout по топ-просмотрам**

In [5]:
# разбиение на holdout по времени
train_start_date = df_show.start_time.min()# + datetime.timedelta(days = 4 * 7)
split_date = df_show.start_time.max() - datetime.timedelta(days = 9 * 7)
train = df_show[(df_show.start_time <= split_date) & (df_show.start_time >= train_start_date)].copy()
val = df_show[df_show.start_time > split_date].copy()

# сбор признаков и таргета по просмотрам
train_top_views = get_features(train[train['tv_show_id'].isin(val['tv_show_id'].unique())])
val_top_views = get_target(val)
overall_top_movies = train_top_views.groupby(['tv_show_id'], as_index = False)['user_show_freq'].sum().sort_values(['user_show_freq'])['tv_show_id'][-5:].values
overall_top_movies = overall_top_movies[::-1]

In [375]:
# проверка точности решения
preds = []
trues = []
for user in train_top_views.user_id.unique():
    
    predict_n_movies = train_top_views[train_top_views.user_id == user]['tv_show_id'].values[:5]
    actual_n_movies = val_top_views[val_top_views.user_id == user]['tv_show_id'].values[:5]
    
    if len(predict_n_movies) < 5:
        predict_n_movies = list(predict_n_movies[:len(predict_n_movies)]) + list(overall_top_movies[:5 - len(predict_n_movies)])
    
    
    preds.append(list(predict_n_movies))
    trues.append(list(actual_n_movies))
    
score = np.round(mapk(trues, preds, k = 5), 5)
print(f'MAP@{5} = {score}')

MAP@5 = 0.32881


**Предсказание по топ-всегда просмотрам**

In [380]:
# данные о просмотрах
df_top_views = get_features(df_show[df_show['tv_show_id'].isin(df_info_future['tv_show_id'].unique())])
overall_top_movies = df_top_views.groupby(['tv_show_id'], as_index = False)['user_show_freq'].sum().sort_values(['user_show_freq'])['tv_show_id'][-5:].values[::-1]

# прогноз на будущее
submission_df = pd.read_csv('submission.csv')
for index_row, row in submission_df.iterrows():
    pred_n_movies = list(df_top_views[df_top_views.user_id == row['user_id']]['tv_show_id'].values[:5])
    
    if len(pred_n_movies) < 5:
        pred_n_movies = list(pred_n_movies[:len(pred_n_movies)]) + list(overall_top_movies[:5 - len(pred_n_movies)])
    
    pred = ' '.join([str(int(x)) for x in pred_n_movies])
    submission_df.loc[index_row, 'tv_show_id'] = pred
    
submission_df.to_csv('baseline_submission.csv', index = False)