In [1]:
#!c1.8
%pip install -U pip icecream seaborn scikit-learn

Defaulting to user installation because normal site-packages is not writeable


In [40]:
#!c1.8
import pandas as pd
import numpy as np

import os
import pickle

import matplotlib.pyplot as plt
import seaborn as sns

from icecream import ic
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

random_state = 42

In [42]:
#!c1.8

def prepare_df(df):
    
    # Преобразуем время создания к типу datetime
    df.created = pd.to_datetime(df.created)
    # Выделим дату
    df['date'] = df.created.dt.date
    # Выделим номер месяца
    df['month'] = df.created.dt.month
    # Выделим номер дня недели
    df['weekdayid'] = pd.to_datetime(df.date).dt.weekday
    # Убираем пустые даты
    df = df[df.created!='1970-01-01 03:00:00']
    df.created = pd.to_datetime(df.created)
    df.fillna('', inplace=True)
    # Добавляем в подкатегирии категирии, в города области, в версии ос наименования ос во избежание неверной идентификации
    df.subgamecategory = df.gamecategory + '/' + df.subgamecategory
    df.city = df.oblast + '/' + df.city
    df.osv = df.os + '/' + df.osv
    df.os = df.os.str.lower()
    
    dfr = pd.DataFrame()

    if 'Segment' in df.columns:
        dfr['segment'] = df['Segment']

    # Разобьем время на 5-минутные интервалы (условно-минималная сессия без прерываний видео)
    # если в рамках одной сессии показано несколько видео, то фиксируем нескольких пользователей
    dfr['5min'] = (df.created.sub(pd.to_datetime('2021-01-01')).dt.total_seconds() // 300).astype(int)
    dfr['weekdayid'] = df.weekdayid
    dfr['month'] = df.month
    gamecategory_list = df['gamecategory'].unique()
    gamecategory_to_gcid = dict(zip(gamecategory_list, np.arange(len(gamecategory_list))))
    dfr['gcid'] = df['gamecategory'].map(gamecategory_to_gcid)
    subgamecategory_list = df['subgamecategory'].sort_values().unique()
    subgamecategory_to_sgcid = dict(zip(subgamecategory_list, np.arange(len(subgamecategory_list))))
    dfr['sgcid'] = df['subgamecategory'].map(subgamecategory_to_sgcid)
    bundle_list = df['bundle'].unique()
    bundle_to_bid = dict(zip(bundle_list, np.arange(len(bundle_list))))
    dfr['bid'] = df['bundle'].map(bundle_to_bid)
    shift_list = df['shift'].unique()
    shift_to_tsid = dict(zip(shift_list, np.arange(len(shift_list))))
    dfr['tsid'] = df['shift'].map(shift_to_tsid)
    oblast_list = df['oblast'].unique()
    oblast_to_oid = dict(zip(oblast_list, np.arange(len(oblast_list))))
    dfr['oid'] = df['oblast'].map(oblast_to_oid)
    city_list = df['city'].unique()
    city_to_cid = dict(zip(city_list, np.arange(len(city_list))))
    dfr['cid'] = df['city'].map(city_to_cid)
    os_list = df['os'].unique()
    os_to_osid = dict(zip(os_list, np.arange(len(os_list))))
    dfr['osid'] = df['os'].map(os_to_osid)
    osv_list = df['osv'].unique()
    osv_to_osvid = dict(zip(osv_list, np.arange(len(osv_list))))
    dfr['osvid'] = df['osv'].map(osv_to_osvid)
    return dfr


dates = [
    '2021-07-05',
    '2021-07-06',
    '2021-07-07',
    '2021-07-08',
    '2021-07-09',
    '2021-07-10',
    '2021-07-11',
    '2021-08-02',
    '2021-08-03',
    '2021-08-04',
    '2021-08-05',
    '2021-08-06',
    '2021-08-07',
    '2021-08-08',
    '2021-09-15',
    '2021-09-16',
    '2021-09-17',
    '2021-09-18',
    '2021-09-19',
    '2021-09-20',
]

for d in dates:
    ic()
    df_train = pd.read_csv(f"data/train_{d}.csv.zip")
    df_prep = prepare_df(df_train)

    X = df_prep.iloc[:,1:]
    y = df_prep.iloc[:,0]
    model = RandomForestClassifier(n_estimators=25, max_depth=15, random_state=random_state, n_jobs=8)
    model.fit(X, y)

    df_test = pd.read_csv(f"data/test_{d}.csv.zip", index_col=0)
    
    X_test = prepare_df(df_test)
    
    y_pred=model.predict(X_test)
    y_score=model.predict_proba(X_test)
    pd.DataFrame(y_score, columns=model.classes_, index = df_test.index).to_csv(f"results_{d}.csv", index=True, compression='zip')
    os.rename(f"results_{d}.csv", f"data/results_{d}.zip")

ic| <ipython-input-17-29664266f362>:82 in <module> at 18:55:34.032
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:56:23.097
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:57:01.660
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:57:33.964
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:58:09.194
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:58:49.142
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:59:25.977
ic| <ipython-input-17-29664266f362>:82 in <module> at 18:59:59.585
  mask |= (ar1 == a)
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:02:27.746
  mask |= (ar1 == a)
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:04:51.458
  mask |= (ar1 == a)
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:06:58.474
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:07:49.255
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:08:31.487
ic| <ipython-input-17-29664266f362>:82 in <module> at 19:09:10.362

In [48]:
#!c1.8
with open('rfc.pkl', 'wb') as f:
    pickle.dump(model, f)

In [26]:
#!c1.8
df_test.index

Int64Index([       3,       22,       33,       56,       69,       77,
                  91,      103,      108,      158,
            ...
            11213474, 11213475, 11213479, 11213514, 11213531, 11213566,
            11213596, 11213601, 11213623, 11213627],
           dtype='int64', length=463223)

In [30]:
#!c1.8
p = pd.DataFrame(y_score)

In [36]:
#!c1.8
y_score

array([[0.17777778, 0.57122222, 0.251     ],
       [0.04551774, 0.3245384 , 0.62994385],
       [0.06927304, 0.62321844, 0.30750852],
       ...,
       [0.05105888, 0.31769927, 0.63124185],
       [0.03903878, 0.4452879 , 0.51567331],
       [0.09594484, 0.17828158, 0.72577358]])

In [38]:
#!c1.8
model.classes_

array([1, 3, 4])

In [25]:
#!c1.8
pd.DataFrame(y_score, columns=[1,2,3,4,5], index = df_test.index)

ValueError: Shape of passed values is (463223, 288), indices imply (463223, 5)

In [17]:
#!c1.8
df_train

Unnamed: 0,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv
2,,,1387897651,2021-09-17 15:54:00,MSK,Москва,Москва,ios,14.4.0
9,,,1493199883,2021-09-17 11:59:23,MSK+2,Свердловская область,Екатеринбург,ios,14.7.1
19,Games,Casino,com.murka.slotsera,2021-09-17 12:51:20,MSK,Санкт-Петербург,Санкт-Петербург,android,8.0.0
24,,,ru.ok.android,2021-09-17 09:37:36,MSK,Москва,Москва,Android,10
25,Games,Casual,com.sukhavati.gotoplaying.bubble.BubbleShooter...,2021-09-17 14:34:09,MSK,Кабардино-Балкария,Нальчик,android,10.0
...,...,...,...,...,...,...,...,...,...
11213528,Games,Word,es.socialpoint.wordlife,2021-09-17 18:36:14,MSK,Тамбовская область,Тамбов,android,4.4.4
11213534,Games,Casual,bling.crush.match3.free.android,2021-09-17 21:44:10,MSK,Москва,Москва,android,11.0
11213551,Games,Simulation,com.paisa.flying.horse,2021-09-17 05:56:30,MSK+7,Хабаровский край,Хабаровск,android,8.1.0
11213562,,,com.rrpopstar.union,2021-09-17 17:33:29,MSK,Краснодарский край,Краснодар,android,9.0.0


In [3]:
#!c1.8
ic()
df = pd.read_csv('data/train_2021-09-15.csv.zip')
ic()
# Преобразуем время создания к типу datetime
df.created = pd.to_datetime(df.created)
# Выделим дату
df['date'] = df.created.dt.date
# Выделим номер месяца
df['month'] = df.created.dt.month
# Выделим номер дня недели
df['weekdayid'] = pd.to_datetime(df.date).dt.weekday
ic()

ic| <ipython-input-3-eacb13f4ab40>:1 in <module> at 13:36:10.622
ic| <ipython-input-3-eacb13f4ab40>:3 in <module> at 13:36:15.231
ic| <ipython-input-3-eacb13f4ab40>:12 in <module> at 13:36:17.050


In [4]:
#!c1.8
ic()

# Убираем путсые даты
df = df[df.created!='1970-01-01 03:00:00']
df.created = pd.to_datetime(df.created)
df.fillna('', inplace=True)

# Добавляем в подкатегирии категирии, в города области, в версии ос наименования ос во избежание неверной идентификации
df.subgamecategory = df.gamecategory + '/' + df.subgamecategory
df.city = df.oblast + '/' + df.city
df.osv = df.os + '/' + df.osv
df.os = df.os.str.lower()
ic()

ic| <ipython-input-4-4468538cfdbb>:1 in <module> at 13:36:20.083
ic| <ipython-input-4-4468538cfdbb>:13 in <module> at 13:36:23.696


In [5]:
#!c1.8
ic()
display(df.describe(include='all'))
ic()

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv,date,month,weekdayid
count,2274243.0,2274243,2274243,2274243,2274243,2274243,2274243,2274243,2274243,2274243,2274243,2274243.0,2274243.0
unique,,24,75,24889,86058,12,84,1868,2,317,1,,
top,,Games,/,com.merge.cube.winner,2021-09-15 12:33:30,MSK,Москва,Москва/Москва,android,android/10.0,2021-09-15,,
freq,,1298469,745925,53253,142,1326632,400446,400188,1964815,473907,2274243,,
first,,,,,2021-09-15 00:00:10,,,,,,,,
last,,,,,2021-09-15 23:59:59,,,,,,,,
mean,4.150158,,,,,,,,,,,9.0,2.0
std,1.046951,,,,,,,,,,,0.0,0.0
min,1.0,,,,,,,,,,,9.0,2.0
25%,3.0,,,,,,,,,,,9.0,2.0


ic| <ipython-input-5-83fcf1c675ca>:1 in <module> at 13:40:40.082
ic| <ipython-input-5-83fcf1c675ca>:3 in <module> at 13:40:43.155


In [6]:
#!c1.8
ic()
display(df)
ic()

Unnamed: 0,Segment,gamecategory,subgamecategory,bundle,created,shift,oblast,city,os,osv,date,month,weekdayid
0,2,,/,ru.ok.android,2021-09-15 17:27:53,,,/,android,Android/10,2021-09-15,9,2
1,2,Games,Games/Simulation,com.paurau.MakeHer,2021-09-15 13:56:23,MSK,Орловская область,Орловская область/Ливны,android,android/10.0,2021-09-15,9,2
2,3,Games,Games/Casual,sandbox.pixel.number.coloring.book.page.art.free,2021-09-15 17:55:38,MSK+2,Пермский край,Пермский край/Пермь,android,android/11.0,2021-09-15,9,2
3,5,Games,Games/Role Playing,com.sonypicturestelevision.zombieland,2021-09-15 11:43:56,MSK+3,Омская область,Омская область/Омск,android,android/11.0,2021-09-15,9,2
4,2,Games,Games/Action,com.glu.zbs,2021-09-15 13:24:05,,,/,android,android/11.0,2021-09-15,9,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2274238,5,Games,Games/Board,1061530248,2021-09-15 08:31:38,MSK,Ставрополье,Ставрополье/Пятигорск,ios,ios/12.5.4,2021-09-15,9,2
2274239,5,,/,566537830,2021-09-15 16:59:01,MSK,Краснодарский край,Краснодарский край/Краснодар,ios,ios/14.4.0,2021-09-15,9,2
2274240,3,Games,Games/Puzzle,1369521645,2021-09-15 12:21:49,MSK,Санкт-Петербург,Санкт-Петербург/Санкт-Петербург,ios,ios/14.4,2021-09-15,9,2
2274241,3,Games,Games/Strategy,net.wargaming.wot.blitz,2021-09-15 10:27:29,MSK,Краснодарский край,Краснодарский край/Краснодар,android,android/11.0,2021-09-15,9,2


ic| <ipython-input-6-aa7d96ba2c26>:1 in <module> at 13:40:45.450
ic| <ipython-input-6-aa7d96ba2c26>:3 in <module> at 13:40:45.491


In [7]:
#!c1.8
ic()
dfr = pd.DataFrame(columns = ['segment', '5min', 'weekdayid', 'month', 'gcid', 'sgcid', 'bid', 'tsid', 'oid', 'cid', 'osid', 'osvid'])

# Сформируем для каждой записи порядковый номер на временнй шкале
# dfr['dtid'] = df.index.map(dict(zip(df.sort_values('created').index, np.arange(len(df)))))
# ic()

# Разобьем время на 5-минутные интервалы (условно-минималная сессия без прерываний видео)
# если в рамках одной сессии показано несколько видео, то фиксируем нескольких пользователей
dfr['5min'] = (df.created.sub(pd.to_datetime('2021-01-01')).dt.total_seconds() // 300).astype(int)
ic()

dfr['weekdayid'] = df.weekdayid
ic()

dfr['month'] = df.month
ic()

dfr['segment'] = df['Segment']
ic()
gamecategory_list = df['gamecategory'].unique()
gamecategory_to_gcid = dict(zip(gamecategory_list, np.arange(len(gamecategory_list))))
dfr['gcid'] = df['gamecategory'].map(gamecategory_to_gcid)
ic()
subgamecategory_list = df['subgamecategory'].sort_values().unique()
subgamecategory_to_sgcid = dict(zip(subgamecategory_list, np.arange(len(subgamecategory_list))))
dfr['sgcid'] = df['subgamecategory'].map(subgamecategory_to_sgcid)
ic()
bundle_list = df['bundle'].unique()
bundle_to_bid = dict(zip(bundle_list, np.arange(len(bundle_list))))
dfr['bid'] = df['bundle'].map(bundle_to_bid)
ic()
shift_list = df['shift'].unique()
shift_to_tsid = dict(zip(shift_list, np.arange(len(shift_list))))
dfr['tsid'] = df['shift'].map(shift_to_tsid)
ic()
oblast_list = df['oblast'].unique()
oblast_to_oid = dict(zip(oblast_list, np.arange(len(oblast_list))))
dfr['oid'] = df['oblast'].map(oblast_to_oid)
ic()
city_list = df['city'].unique()
city_to_cid = dict(zip(city_list, np.arange(len(city_list))))
dfr['cid'] = df['city'].map(city_to_cid)
ic()
os_list = df['os'].unique()
os_to_osid = dict(zip(os_list, np.arange(len(os_list))))
dfr['osid'] = df['os'].map(os_to_osid)
ic()
osv_list = df['osv'].unique()
osv_to_osvid = dict(zip(osv_list, np.arange(len(osv_list))))
dfr['osvid'] = df['osv'].map(osv_to_osvid)
ic()

ic| <ipython-input-7-064cdd9d57fd>:1 in <module> at 13:40:47.687
ic| <ipython-input-7-064cdd9d57fd>:11 in <module> at 13:40:48.707
ic| <ipython-input-7-064cdd9d57fd>:14 in <module> at 13:40:48.970
ic| <ipython-input-7-064cdd9d57fd>:17 in <module> at 13:40:49.209
ic| <ipython-input-7-064cdd9d57fd>:20 in <module> at 13:40:49.428
ic| <ipython-input-7-064cdd9d57fd>:24 in <module> at 13:40:49.850
ic| <ipython-input-7-064cdd9d57fd>:28 in <module> at 13:40:53.612
ic| <ipython-input-7-064cdd9d57fd>:32 in <module> at 13:40:54.157
ic| <ipython-input-7-064cdd9d57fd>:36 in <module> at 13:40:54.512
ic| <ipython-input-7-064cdd9d57fd>:40 in <module> at 13:40:54.908
ic| <ipython-input-7-064cdd9d57fd>:44 in <module> at 13:40:55.447
ic| <ipython-input-7-064cdd9d57fd>:48 in <module> at 13:40:55.770
ic| <ipython-input-7-064cdd9d57fd>:52 in <module> at 13:40:56.062


In [8]:
#!c1.8
from itertools import cycle

from sklearn import svm, datasets
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from scipy import interp
from sklearn.metrics import roc_auc_score
random_state = 42

In [9]:
#!c1.8
ic()
X = dfr.iloc[:,1:]
y = dfr.iloc[:,0]
ic()

ic| <ipython-input-9-54cbcc555221>:1 in <module> at 13:41:01.375
ic| <ipython-input-9-54cbcc555221>:4 in <module> at 13:41:01.399


In [10]:
#!c1.8
pd.set_option('display.max_rows', 10)
display(X)

Unnamed: 0,5min,weekdayid,month,gcid,sgcid,bid,tsid,oid,cid,osid,osvid
0,74225,2,9,0,0,0,0,0,0,0,0
1,74183,2,9,1,53,1,1,1,1,0,1
2,74231,2,9,1,45,2,2,2,2,0,2
3,74156,2,9,1,52,3,3,3,3,0,2
4,74176,2,9,1,39,4,0,0,0,0,2
...,...,...,...,...,...,...,...,...,...,...,...
2274238,74118,2,9,1,42,3083,1,8,204,1,14
2274239,74219,2,9,0,0,363,1,6,6,1,76
2274240,74164,2,9,1,50,170,1,15,16,1,26
2274241,74141,2,9,1,55,162,1,6,6,0,2


In [11]:
#!c1.8
ic()
pd.set_option('display.max_rows', 10)
display(y)
ic()

0          2
1          2
2          3
3          5
4          2
          ..
2274238    5
2274239    5
2274240    3
2274241    3
2274242    2
Name: segment, Length: 2274243, dtype: int64

ic| <ipython-input-11-855bfa74c11c>:1 in <module> at 13:41:03.168
ic| <ipython-input-11-855bfa74c11c>:4 in <module> at 13:41:03.193


In [12]:
#!c1.8
ic()
y = label_binarize(y, classes=[1, 2, 3, 4, 5])
n_classes = y.shape[1]
display(y.shape)
ic()

(2274243, 5)

ic| <ipython-input-12-d65addeac226>:1 in <module> at 13:41:03.817
ic| <ipython-input-12-d65addeac226>:5 in <module> at 13:41:04.030


In [13]:
#!c1.8
ic()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=random_state)
ic()

ic| <ipython-input-13-cba075d7eb41>:1 in <module> at 13:41:04.854
ic| <ipython-input-13-cba075d7eb41>:3 in <module> at 13:41:05.720


In [14]:
#!c1.8
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((1137121, 11), (1137122, 11), (1137121, 5), (1137122, 5))

In [None]:
#!c1.8
