## V8

In [1]:
%matplotlib inline
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime, timedelta

# Render floats with five decimal places in DataFrame/Series output.
pd.set_option('display.float_format', lambda x: '%.5f' % x)

# Requests 3 decimal places. NOTE(review): the describe() outputs in this notebook
# show five decimals, so the float_format above appears to take precedence - confirm.
pd.set_option("display.precision", 3)

# Frequency threshold: a label is kept as a feature only if it is found on more
# than this many items (see the cell that builds labels_categories).
CATEGORY_FREQ = 50

In [2]:
# Load the shop table; describe() summarises its numeric column (shop_id).
df_shops = pd.read_csv("shops.csv")
df_shops.describe()

Unnamed: 0,shop_id
count,60.0
mean,29.5
std,17.46425
min,0.0
25%,14.75
50%,29.5
75%,44.25
max,59.0


In [3]:
# Map a shop id onto the id that should be used in its place.
def valid_shop_id(id):
    """Return the replacement shop id for ``id``, or ``id`` itself if none applies.

    Ids 0, 1, 11 and 40 are replaced by 57, 58, 10 and 39 respectively;
    every other value is returned unchanged.
    """
    # (alias, replacement) pairs, checked in order.
    # A 23 -> 24 remap existed previously and is intentionally disabled.
    remap = ((0, 57), (1, 58), (11, 10), (40, 39))
    for alias, replacement in remap:
        if id == alias:
            return replacement
    return id

def shop_type(shop):
    """Classify a shop name into a shop-type label.

    Returns the first run of 'Т' followed by one or more of 'Р', 'К', 'Ц'
    found in the name (e.g. 'ТЦ', 'ТРЦ'); otherwise 'Онлайн' when the name
    contains 'нлайн' or 'нтернет'; otherwise 'Магазин'.
    """
    mall = re.search(r"(Т[РКЦ]+)", shop)
    if mall is not None:
        return mall.group(1)
    # The first letter is omitted from the markers so that either case matches.
    if any(marker in shop for marker in ('нлайн', 'нтернет')):
        return 'Онлайн'
    return 'Магазин'

# get city
def shop_city(shop):
    """Return the city token at the start of a shop name.

    The token is the leading run of Cyrillic letters (а-я, А-Я) and dots,
    e.g. 'Н.Новгород' for 'Н.Новгород ТРЦ "РИО"'.

    Returns 'Unknown' when the name does not start with such a character.
    The quantifier is ``+`` rather than ``*``: with ``*`` the pattern always
    matched (possibly the empty string), so the 'Unknown' fallback could
    never be reached and such names produced ''.
    """
    match = re.match(r"([а-яА-Я\.]+)", shop)
    return match.group(1) if match is not None else 'Unknown'

#df_shops['valid_id'] = df_shops['shop_id'] 

In [4]:
# Remove the rows labelled 0 and 1 (sales for shop ids 0 and 1 are remapped by
# valid_shop_id). errors="ignore" keeps the cell safe to re-run: the in-place
# drop raised KeyError once the rows were already gone.
df_shops = df_shops.drop(index=[0, 1], errors="ignore")

In [5]:
# Derive shop features from the free-text shop name.
df_shops['city'] = df_shops['shop_name'].map(shop_city)
df_shops['type'] = df_shops['shop_name'].map(shop_type)
# 1 when the name mentions 'мега' in any letter case, else 0.
df_shops['mega'] = df_shops['shop_name'].str.contains('мега', case=False).astype(int)

In [6]:
# Index shops by shop_id so that df_shops.loc[shop_id] resolves a shop directly
# (the get_shop_* helpers rely on this).
df_shops = df_shops.set_index('shop_id')
df_shops.tail()

Unnamed: 0_level_0,shop_name,city,type,mega
shop_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
55,Цифровой склад 1С-Онлайн,Цифровой,Онлайн,0
56,"Чехов ТРЦ ""Карнавал""",Чехов,ТРЦ,0
57,"Якутск Орджоникидзе, 56",Якутск,Магазин,0
58,"Якутск ТЦ ""Центральный""",Якутск,ТЦ,0
59,"Ярославль ТЦ ""Альтаир""",Ярославль,ТЦ,0


In [7]:
 df_shops['type'].iloc[25]

'ТЦ'

In [8]:
# Load the item categories, indexed by item_category_id.
df_categories = pd.read_csv("item_categories.csv", index_col='item_category_id')
df_categories.describe()

Unnamed: 0,item_category_name
count,84
unique,84
top,Подарки - Фигурки
freq,1


In [9]:
df_categories = df_categories.assign(
    # Top-level category: the text before the first '-' or '(' in the name.
    category=lambda d: d['item_category_name'].str.split('[-(]', n=0).str[0].str.strip(),
    # 1 when the name mentions 'цифра' in any letter case, else 0.
    digital=lambda d: d['item_category_name'].str.contains('цифра', case=False).astype(int),
)

In [10]:
# Shorter index name for the category table.
df_categories.index.name = 'category_id'
df_categories.head()

Unnamed: 0_level_0,item_category_name,category,digital
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,PC - Гарнитуры/Наушники,PC,0
1,Аксессуары - PS2,Аксессуары,0
2,Аксессуары - PS3,Аксессуары,0
3,Аксессуары - PS4,Аксессуары,0
4,Аксессуары - PSP,Аксессуары,0


In [11]:
# Load the item catalogue, indexed by item_id.
df_items = pd.read_csv("items.csv", index_col='item_id')
df_items.describe()

Unnamed: 0,item_category_id
count,22170.0
mean,46.29075
std,15.94149
min,0.0
25%,37.0
50%,40.0
75%,58.0
max,83.0


In [12]:
df_items.head()

Unnamed: 0_level_0,item_name,item_category_id
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,! ВО ВЛАСТИ НАВАЖДЕНИЯ (ПЛАСТ.) D,40
1,!ABBYY FineReader 12 Professional Edition Full...,76
2,***В ЛУЧАХ СЛАВЫ (UNV) D,40
3,***ГОЛУБАЯ ВОЛНА (Univ) D,40
4,***КОРОБКА (СТЕКЛО) D,40


In [13]:
def _bracketed_labels(names, pattern):
    """Collect, per item, the comma-separated labels captured by ``pattern``.

    ``names`` is a Series of item names; ``pattern`` is a regex with one
    capture group. Names are lower-cased, every match is split on ',' and the
    stripped pieces of all matches of one item are gathered into one list.
    Items without any match are absent from the returned Series.
    """
    captured = names.str.lower().str.extractall(pattern)[0]
    pieces = captured.str.split(',')
    return pieces.groupby(level=0).apply(
        lambda lists: [label.strip() for sublist in lists for label in sublist]
    )

# label1: text inside (...) ; label2: text inside [...]
df_items["label1"] = _bracketed_labels(df_items["item_name"], r"\((.*?)\)")
df_items["label2"] = _bracketed_labels(df_items["item_name"], r"\[(.*?)\]")
df_items[["label1", "label2"]].head(20)

Unnamed: 0_level_0,label1,label2
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,[пласт.],
1,,"[pc, цифровая версия]"
2,[unv],
3,[univ],
4,[стекло],
5,[uni],
6,[uni],
7,[uni],
8,,
9,[uni],


In [14]:
label_corrections = {'univ': 'unv'
                      ,'uni': 'unv'
                      ,'mp3-cd': ['mp3', 'cd']
                      ,'mp3-cd+dvd': ['mp3', 'cd', 'dvd']
                      ,'ps3/xbox 360/pc/mac': ['xbox 360', 'ps3', 'pc', 'mac']
                      ,'x360/ps3/pc/mac': ['xbox 360', 'ps3', 'pc', 'mac']
                      ,'xbox 360/pc/mac': ['xbox 360', 'pc', 'mac']
                      ,'pc/ xbox 360': ['xbox 360', 'pc']
                      ,'ps3/pc/mac': ['ps3', 'pc', 'mac']
                      ,'ps3/pc': ['ps3', 'pc']
                      ,'с поддержкой ps4': 'ps4'
                      ,'с поддержкой ps3 и ps vita': ['ps3', 'ps vita']
                      ,'cd-digipack': ['cd','digipack']
                      ,'mp3-dvd': ['mp3', 'dvd']
                      ,'dvd-book': ['dvd', 'book']
                      ,'dvd+книга': ['dvd', 'book']
                      ,'dvd-digipack': ['dvd', 'digipack']
                      ,'dvd box': ['dvd', 'box']
                      ,'dvd-box': ['dvd', 'box']
                      ,'dvd- box': ['dvd', 'box']
                      ,'dvd+ bd': ['dvd', 'bd']
                      ,'cd-box': ['cd', 'box']
                      ,'dvd-pack': 'dvd'
                      ,'dvdpack': 'dvd'
                      ,'1dvd+5 в подарок': 'dvd'
                      ,'+cd': 'cd'
                      ,'+ cd': 'cd'
                      ,'+ dvd': 'dvd'
                      ,'8 cd': 'cd'
                      ,'2dvd': 'dvd'
                      ,'2 dvd': 'dvd'
                      ,'2 изд-е': '2-е издание'
                      ,'2-е изд.': '2-е издание'
                      ,'2-е изд': '2-е издание'
                      ,'2 издание': '2-е издание'
                      ,'изд.2': '2-е издание'
                      ,'изд.3': '3-е издание'
                      ,'2-е изд. перераб и доп': ['2-е издание','переработано','дополнено']
                      ,'3 изд-е': '3-е издание'
                      ,'3-е изд-е': '3-е издание'
                      ,'переработанное': 'переработано'
                      ,'испр и доп': ['исправлено', 'дополнено']
                      ,'исправ. и доп.': ['исправлено', 'дополнено']
                      ,'перераб и доп': ['переработано', 'дополнено']
                      ,'испр и доп.': ['исправлено', 'дополнено']
                      ,'2bd': 'bd'
                      ,'bd 4k': 'bd'
                      ,'3d bd+bd': ['3d', 'bd']
                      ,'dvd+bd+3d открытка': ['3d', 'bd', 'dvd', 'коллекционная открыка']
                      ,'3d bd+bd+коллекционная открыка': ['3d', 'bd', 'коллекционная открыка']
                      ,'bd+коллекционная открытка': ['bd', 'коллекционная открыка']
                      ,'3d bd': ['3d','bd']
                      ,'3 диска 3d bd': ['3d','bd']
                      ,'pc-dvd': ['pc', 'dvd']
                      ,'dvd+bd': ['dvd', 'bd']
                      ,'коробка': 'box'
                      ,'фирм.': 'фирм'
                      ,'только для ms kinect': 'kinect'
                      ,'толькоя для ms kinect': 'kinect'
                      ,'только для kinect': 'kinect'
                      ,'только для ps move': 'pc move'
                      ,'с поддержкой ms kinect': 'kinect'
                      ,'с поддержкой ps move': 'pc move'
                      ,'с поддержкой move': 'pc move'
                      ,'c поддержкой ps move': 'pc move'
                      ,'требуется ps move': 'pc move'
                      ,'с поддержкой 3d': '3d'
                      ,'рус.в.': 'русская версия'
                      ,'рус.в': 'русская версия'
                      ,'рус': 'русская версия'
                      ,'предзаказ': 'preorder'
                      ,'по предзаказам': 'preorder'
                      ,'digpack': 'digipack'
                      ,'1 устройство/1год': ['1 устройство','1 год']
                      ,'1 устройство/2года': ['1 устройство','2 года']
                      ,'лицензия на 1 год на 1 пк': ['1 устройство','1 год']
                      ,'лицензия на 1 год на 2 пк': ['2 устройства','1 год']
                      ,'лицензия на 1 год на 3 пк': ['3 устройства','1 год']
                      ,'лицензия на 1 год на 5 пк': ['5 устройств','1 год']
                      ,'лицензия на 2 года на 1 пк': ['1 устройство','2 года']
                      ,'лицензия на 2 года на 2 пк': ['2 устройства','2 года']
                      ,'лицензия на 6 месяцев на 1 пк': ['1 устройство','6 месяцев']
                      ,'3 устройства / 1 год': ['3 устройства','1 год']
                      ,'лицензия на 1 год 2 пк': ['2 устройства','1 год']
                      ,'лицензия на 2 года 1 пк': ['1 устройство','2 года']
                      ,'лицензия на 2 года 2 пк': ['2 устройства','2 года']
                      ,'лицензия на 6 месяцев 1 пк': ['1 устройство','6 месяцев']
                      ,'2пк / 1 год': ['2 устройства','1 год']
                      ,'англ. в.': 'английская версия'
                      ,'англ.в': 'английская версия'
                      ,'англ.в.': 'английская версия'
                      ,'англ': 'английская версия'
                      ,'1пк / 1 год': ['1 устройство','1 год']
                      ,'3пк / 1 год': ['3 устройства','1 год']
                      ,'предзаказ1': 'preorder'
                      ,'предзаказ2': 'preorder'
                      ,'предзаказ3': 'preorder'
                      ,'2пк / 1год': ['2 устройства','1 год']
                      ,'2 пк/1 год': ['2 устройства','1 год']
                      ,'2 пк / 1 год': ['2 устройства','1 год']
                      ,'1 смартфон / 1 год': ['1 устройство','1 год']
                      ,'3 пк / 1 год': ['3 устройства','1 год']
                      ,'full eng': 'английская версия'
                      ,'eng': 'английская версия'
                      ,'англ.в.рус.с.': ['английская версия', 'русские субтитры']
                      ,'пожизненная лицензия на 3пк': '3 устройства'
                      ,'срок действия лицензии 1 год': '1 год'
                      ,'рукоятка для ps move controller в виде пистолета для стрельбы': 'pc move'
                      ,'контроллер движений ps move : cech-zcm1r bx: scee': 'pc move'
                      ,'зарядная станция/подставка для контроллера ps move': 'pc move'
                      ,'рукоятка для ps move controller в виде автомата для стрельбы и навигации': 'pc move'
                      ,'камера ps eye + контроллер движений ps move + демо-диск': 'pc move'
                      ,'подписка на 1 год': '1 год'
                      ,'1 пк': '1 устройство'
                      ,'на 1 год': '1 год'
                      ,'лицензия на 1 год': '1 год'
                      ,'3 года': '3 года'
                      ,'2 пк': '2 устройства'
                      ,'3 пк': '3 устройства'
                      ,'подписка на 6 месяцев': '6 месяцев'
                      ,'russian': 'русская версия'
                      ,'англ.в.рус.д.': ['английская версия','русская документация']
                      ,'книга + cd': ['book','cd']
                      ,'english': 'английская версия'
                      ,'4 dvd': 'dvd'
                      ,'3d bd+bd+dvd': ['3d', 'bd', 'dvd']
                      ,'3dvd': 'dvd'
                      ,'5dvd': 'dvd'
                      ,'48 dvd': 'dvd'
                      ,'мистерия регион': 'регион'
                      ,'13 dvd': 'dvd'
                      ,'21 dvd': 'dvd'
                      ,'11dvd': 'dvd'
                      ,'12dvd': 'dvd'
                      ,'18 dvd': 'dvd'
                      ,'колл. 9 dvd': ['dvd', 'колл']
                      ,'колл. 7 dvd': ['dvd', 'колл']
                      ,'колл. 5 dvd': ['dvd', 'колл']
                      ,'колл 9 dvd': ['dvd', 'колл']
                      ,'колл. 10 dvd': ['dvd', 'колл']
                      ,'коллекц. 16 dvd': ['dvd', 'колл']
                      ,'3 dvd': 'dvd'
                      ,'4dvd': 'dvd'
                      ,'6 dvd': 'dvd'
                      ,'колл. 4 dvd': ['dvd', 'колл']
                      ,'3bd': 'bd'
                      ,'23 bd': 'bd'
                      ,'24 bd': 'bd'
                      ,'4bd': 'bd'
                      ,'2 bd': 'bd'
                      ,'3 bd': 'bd'
                      ,'подар': 'подарочный'
                      ,'подар.': 'подарочный'
                      ,'5 dvd': 'dvd'
                      ,'колл.': 'колл'
                      ,'bd+cd': ['bd', 'cd']
                      ,'3 bd + 3 dvd + кольцо': ['bd', 'dvd']
                      ,'3 bd + 3 dvd': ['bd', 'dvd']
                      ,'6dvd': 'dvd'
                      ,'bd + dvd': ['bd', 'dvd']
                      ,'dvd+3d bd': ['dvd', '3d', 'bd']
                      ,'11 bd': 'bd'
                      ,'11 bd+закладки 2': 'bd'
                      ,'11 bd+закладки': 'bd'
                      ,'11 bd+значки': 'bd'
                      ,'3d+2d': '3d'
                      ,'коллекционное издание': 'колл'
                      ,'8dvd': 'dvd'
                      ,'uni регион.': ['unv', 'регион']
                      ,'dvd+ 3d bd': ['dvd', '3d', 'bd']
                      ,'вольга регион': 'регион'
                      ,'регион.': 'регион'
                      ,'dvd + bd': ['dvd', 'bd']
                      ,'3d bd + 2dvd': ['3d', 'bd', 'dvd']
                      ,'9bd': 'bd'
                      ,'14 dvd': 'dvd'
                      ,'15 dvd + трон+7 открыток': 'dvd'
                      ,'5 bd': 'bd'
                      ,'5 bd+яйцо+открытки': 'bd'
                      ,'4 bd': 'bd'
                      ,'3 сезона. колл': 'колл'
                      ,'колл. 4dvd': ['dvd', 'колл']
                      ,'3d-открытка в подарок': '3d'
                      ,'24 dvd': 'dvd'
                      ,'4 dvd-15 фильмов': 'dvd'
                      ,'8bd': 'bd'
                      ,'7 bd': 'bd'
                      ,'7dvd': 'dvd'
                      ,'5 3d bd': ['3d', 'bd']
                      ,'5bd': 'bd'
                      ,'dvd+cd': ['dvd', 'cd']
                      ,'коллекц.издание': 'колл'
                      ,'коллекц.': 'колл'
                      ,'2dvd+bd': ['dvd', 'bd']
                      ,'4bd+dvd': ['dvd', 'bd']
                      ,'рег.': 'регион'
                      ,'bd+dvd+шайба+автофлаг': ['bd', 'dvd']
                      ,'dvd+автофлаг': 'dvd'
                      ,'3dbd+bd+коллекционная открытка': ['3d', 'bd', 'коллекционная открыка']
                      ,'bd+dvd+коллекционная открытка': ['dvd', 'bd', 'коллекционная открыка']
                      ,'коллекция': 'колл'
                      ,'3d bd+bd+ dvd': ['3d','bd', 'dvd']
                      ,'8 dvd': 'dvd'
                      ,'7bd': 'bd'
                      ,'союз регион': 'регион'
                      ,'коллекц.изд.': 'колл'
                      ,'колл. 8 bd': ['колл', 'bd']
                      ,'колл. 8 dvd': ['колл', 'dvd']
                      ,'6 bd': 'bd'
                      ,'м.т.регион': 'регион'
                      ,'10 dvd': 'dvd'
                      ,'bd+dvd с доп. материалами+буклет': ['bd', 'dvd']
                      ,'5 bd+артбук': ['bd', 'book']
                      ,'6 dvd+артбук': ['dvd', 'book']
                      ,'3d bd+bd+фотоальбом': ['3d', 'bd']
                      ,'3d bd+2 диска bd': ['3d', 'bd']
                      ,'реоион': 'регион'
                      ,'6bd': 'bd'
                      ,'mp3-cd+ audio-cd': ['mp3', 'cd']
                      ,'mp3-сd': ['mp3', 'cd']
                      ,'+сд': 'cd'
                      ,'pc-cd': ['pc', 'cd']
                      ,'bd+dvd': ['bd','dvd']
                      ,'обитель зла bd в подарок': 'bd'
                      ,'коллекционное': 'колл'
                      ,'14 bd': 'bd'
                      ,'6 real 3d bd+6 bd': ['3d', 'bd']
                      ,'2 диска 3d bd+bd': ['3d', 'bd']
                      ,'2bd + "гендальф" ручка+закладка': 'bd'
                      ,'2bd +"торрин" ручка+закладка': 'bd'
                      ,'2 диска 3d bd+3 bd': ['3d', 'bd']
                      ,'2 3d bd+2bd': ['3d', 'bd']
                      ,'2bd + "бильбо" ручка+закладка': 'bd'
                      ,'2dvd+коллекционная открытка': ['dvd', 'коллекционная открыка']
                      ,'2диска 3d bd+2bd+3д-открытка': ['3d', 'bd', 'коллекционная открыка']
                      ,'+колл. открытка': 'коллекционная открыка'
                      ,'3d bd+ bd': ['3d', 'bd']
                      ,'+3d-открытка': ['3d', 'коллекционная открыка']
                      ,'+3d открытка': ['3d', 'коллекционная открыка']
                      ,'pс': 'pc'
                      ,'русские субтитры': 'русские субтитры'
                      ,'x360': 'xbox 360'
                      ,'xbox360': 'xbox 360'
                      ,'x-box 360': 'xbox 360'
                      ,'xbox 360. английская версия': ['xbox 360','английская версия']
                      ,'русские субт': 'русские субтитры'
                      ,'русские субти': 'русские субтитры'
                      ,'рс': 'pc'
                      ,'цифровая версия epay': ['цифровая версия', 'epay']
                      ,'jewel русская версия': ['jewel', 'русская версия']
                      ,'цифр. версия': 'цифровая версия'
                      ,'англ.в.': 'английская версия'
                      ,'jewel рус.в.': ['jewel','русская версия']
                      ,'рус. суб.': 'русские субтитры'
                      ,'рус. субтитры': 'русские субтитры'
                      ,'[pc': 'pc'
                      ,'mac цифровая версия': ['mac','цифровая версия']
                      ,'русcкие субтитры': 'русские субтитры'
                      ,'pc/mac': ['pc', 'mac']
                      ,'рус.в.': 'русская версия'
                      ,'pyc.в.': 'русская версия'
                      ,'rus': 'русская версия'
                      ,'сил. чехол': 'чехол'
                      ,'rem.': 'rem'
                      ,'c поддержкой 3d': '3d'
                      ,'ps vita travel kit: scee': 'ps vita'
                      ,'ps vita memory card 16gb - pch-z161: scee': 'ps vita'
                      ,'ps vita memory card 32gb - pch-z321: scee': 'ps vita'
                      ,'ps vita memory card 4 gb - pch-z041: scee': 'ps vita'
                      ,'ps vita memory card 8 gb - pch-z081: scee': 'ps vita'
                      ,'ps vita portable battery charger: scee': 'ps vita'
                      ,'ps vita in-ear headset - pch-zhs1e: scee': 'ps vita'
                      ,'ps vita starter kit: scee': 'ps vita'
                      ,'вd': 'bd'}

def correct_labels(label_list, label_correction):
    """Normalise raw labels through a correction table.

    Parameters
    ----------
    label_list : iterable of str
        Raw labels of one item.
    label_correction : dict
        Maps a raw label to its replacement: either a single string or a
        list of strings (one raw label may expand into several).

    Returns
    -------
    numpy.ndarray
        String array with the corrected labels in input order. Labels missing
        from the table pass through unchanged. An empty input yields an empty
        array (previously an empty list was returned in that case only).
    """
    corrected = []
    for label in label_list:
        replacement = label_correction.get(label, label)
        if isinstance(replacement, list):
            corrected.extend(replacement)
        else:
            corrected.append(replacement)
    # Build the array once at the end: growing an array with np.concatenate /
    # np.append copied it on every step and relied on promoting an empty float
    # array to a string dtype.
    return np.array(corrected, dtype=str)

In [15]:
# Normalise both label columns; items without labels are kept as None.
for label_col in ('label1', 'label2'):
    raw_labels = df_items[label_col].where(df_items[label_col].notnull(), None)
    df_items[label_col] = raw_labels.apply(
        lambda cell: cell if cell is None else correct_labels(cell, label_corrections)
    )
df_items[['label1', 'label2']].head(20)

Unnamed: 0_level_0,label1,label2
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,[пласт.],
1,,"[pc, цифровая версия]"
2,[unv],
3,[unv],
4,[стекло],
5,[unv],
6,[unv],
7,[unv],
8,,
9,[unv],


In [16]:
def concat2arrays(arr1, arr2):
    """Merge two optional label sequences into sorted, de-duplicated labels.

    Either argument may be None and is then skipped. Returns the result of
    ``np.unique`` over the combined labels, or ``[None]`` when there are no
    labels at all.
    """
    present = [arr for arr in (arr1, arr2) if arr is not None]
    # The leading empty list keeps the call valid when nothing is present.
    merged = np.concatenate([[], *present])
    if len(merged) == 0:
        return [None]
    return np.unique(merged)
        
# Combine both label columns per item. The row is addressed by column label:
# integer keys (row[0], row[1]) on a label-indexed Series are positional
# fallbacks that recent pandas versions deprecate.
df_items['label12'] = df_items[['label1', 'label2']].apply(
    lambda row: concat2arrays(row['label1'], row['label2']), axis=1
)
df_items[['label1', 'label2','label12']].head(20)

Unnamed: 0_level_0,label1,label2,label12
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,[пласт.],,[пласт.]
1,,"[pc, цифровая версия]","[pc, цифровая версия]"
2,[unv],,[unv]
3,[unv],,[unv]
4,[стекло],,[стекло]
5,[unv],,[unv]
6,[unv],,[unv]
7,[unv],,[unv]
8,,,[None]
9,[unv],,[unv]


In [17]:
#df_items['label12'].unique()

In [18]:
# Merged labels per item with missing entries dropped.
# NOTE(review): concat2arrays yields [None] (not NaN) for items without labels,
# so dropna() may not remove anything here - confirm.
labels = df_items['label12'].dropna()
#labels.describe()

In [19]:
# Distinct labels in first-seen order, skipping the None placeholder and
# one-character labels. dict.fromkeys de-duplicates while preserving order,
# instead of scanning the growing result list for every single label.
item_categories = list(dict.fromkeys(
    item
    for cl in labels.values
    for item in cl
    if item is not None and len(item) > 1
))
len(item_categories)

1069

In [20]:
item_categories

['пласт.',
 'pc',
 'цифровая версия',
 'unv',
 'стекло',
 'регион',
 'сер.3-4',
 'ps3',
 'русская версия',
 'jewel',
 'xbox 360',
 'bd',
 'rem',
 'cd',
 'digipack',
 'mp3',
 'подар. уп.',
 'book',
 'dvd',
 'ср',
 'bitrix',
 'by jules verne',
 'best funny stories',
 'beginner',
 'elementary',
 'pre-intermediate',
 'сборник',
 'для средней школы',
 'вторая часть',
 'первая часть',
 'mp3-аудиоспектакль',
 'бандл',
 'box',
 'с участием в.гафта',
 'спектакль',
 'в главной роли а. джигарханян',
 'с участием в. гафта',
 'радиоспектакль',
 'ред.3.0',
 'usb',
 'издание 6',
 'по ред. 2.0',
 'steam версия',
 'вкл версию для порт.устр-в',
 '4 класс',
 '5–9 классы',
 '1–6 классы',
 '1-6 классы',
 'лучшие игры для родителей с детьми',
 'английский язык',
 'с прил.на cd-rom',
 '2-е издание',
 'дополнено',
 'исправлено',
 '2013',
 '2014',
 '2011',
 '3-е издание',
 'переработано',
 'английская версия',
 'ххкат',
 'фирм',
 '3d',
 'коллекционная открыка',
 'наш сад 10',
 'красный',
 'матовый',
 'собранны

In [None]:
# Keep the labels that occur on more than CATEGORY_FREQ items.
# All labels are counted in one pass (explode + value_counts) instead of
# testing membership in every item's label array once per candidate label.
# Each label12 entry comes from np.unique, so a label appears at most once per
# item and its occurrence count equals the number of items carrying it.
# The [None] placeholders become missing values, which value_counts ignores.
label_counts = labels.explode().value_counts()
labels_categories = [
    cat for cat in item_categories
    if label_counts.get(cat, 0) > CATEGORY_FREQ
]
labels_categories

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarizer restricted to the frequent labels selected in labels_categories.
mlb = MultiLabelBinarizer(classes=labels_categories)

In [None]:
# One indicator column per frequent label, prefixed with "l0_".
# Rows keep the default positional index (get_items_features reads them with .iloc).
item_labels = pd.DataFrame(
    mlb.fit_transform(df_items['label12']),
    columns=mlb.classes_,
).add_prefix("l0_")

In [None]:
item_labels.describe()

In [None]:
#items = pd.DataFrame(items)
item_labels.head()

In [None]:
# Cache: item_id -> row of item_labels.
item_features = {}
def get_items_features(item_id):
    """Return the label-indicator row for ``item_id``, caching it on first use.

    The row is fetched by position (``item_labels.iloc[item_id]``).
    """
    cached = item_features.get(item_id)
    if cached is None:
        cached = item_features[item_id] = item_labels.iloc[item_id]
    return cached

In [None]:
get_items_features(2)

In [None]:
import datetime as dt
import holidays

start_date = dt.datetime(2013, 1, 1)
end_date = dt.datetime(2015, 11, 30)

# One entry per calendar day, both end points included.
n_days = (end_date - start_date).days + 1
dates = [start_date + dt.timedelta(days=offset) for offset in range(n_days)]

ru_holidays = holidays.Russia()

calendar = pd.DataFrame({"date": dates})

# Holiday name for the date, or None when the date is not a listed holiday.
calendar["bank_holiday"] = calendar["date"].apply(ru_holidays.get)
# ISO weekday: Monday = 1 ... Sunday = 7.
calendar["weekday"] = calendar["date"].apply(dt.date.isoweekday)
calendar.head()

In [None]:
calendar['bank_holiday'].unique()

In [None]:
# Saturday (6) and Sunday (7) in ISO weekday numbering.
calendar['weekend'] = calendar['weekday'].isin((6, 7)).astype(int)
# notna() treats both None and NaN as "no holiday"; the earlier `is not None`
# test would have flagged every day if missing names were stored as NaN.
calendar['holyday'] = calendar['bank_holiday'].notna().astype(int)
# Consecutive month counter: January 2013 -> 0.
calendar['date_block_num'] = calendar['date'].apply(lambda x: (x.year-2013)*12+x.month-1)
calendar.head(10)

In [None]:
calendar.tail(10)

In [None]:
# Number of weekend days and holiday days in each month block.
dates_features = calendar.groupby('date_block_num')[['weekend', 'holyday']].sum()
dates_dict = dates_features.to_dict()
dates_features.head(10)

In [None]:
def get_holydays(block_num):
    """Return the summed 'holyday' flag for month block ``block_num``."""
    per_block = dates_dict['holyday']
    return per_block[block_num]

def get_weekends(block_num):
    """Return the summed 'weekend' flag for month block ``block_num``."""
    per_block = dates_dict['weekend']
    return per_block[block_num]

In [None]:
dates_features['weekend'].iloc[0]

In [None]:
# Load the test set and summarise its numeric columns.
df_test = pd.read_csv("test.csv")
df_test.describe()

In [None]:
# City tiers consulted by get_city_type.
federal_cities = ['Москва', 'СПб']
milioner_cities = [
    'Воронеж', 'Казань', 'Красноярск', 'Н.Новгород', 'Новосибирск',
    'Омск', 'РостовНаДону', 'Самара', 'Уфа',
]
regional_caps = [
    'Адыгея', 'Калуга', 'Курск', 'Тюмень', 'Якутск', 'Ярославль',
    'Вологда', 'Томск',
]
regional_cities = [
    'Балашиха', 'Волжский', 'Жуковский', 'Коломна', 'Сергиев', 'Сургут',
    'Химки', 'Чехов', 'Мытищи',
]

# Lookup caches, filled lazily by the get_* helper functions.
shop_types = {}
shop_cities = {}
mega_shops = {}
categories = {}
digital_categories = {}
items = {}

def get_city_type(city):
    """Return the tier label of ``city``.

    The tiers are checked in order: 'Federal', 'Milioner', 'RegionalCenter',
    'Regional'. A city found in none of the lists is labelled 'Virtual'.
    """
    tiers = (
        (federal_cities, 'Federal'),
        (milioner_cities, 'Milioner'),
        (regional_caps, 'RegionalCenter'),
        (regional_cities, 'Regional'),
    )
    for members, tier in tiers:
        if city in members:
            return tier
    return 'Virtual'

def _item_category_id(item_id):
    """Return the category id of ``item_id``, memoised in ``items``."""
    if item_id not in items:
        # Scalar .loc[row, column] access avoids building a full row first.
        items[item_id] = df_items.loc[item_id, 'item_category_id']
    return items[item_id]


def get_category(item_id):
    """Return the category name of ``item_id`` (cached per category id).

    The full 'item_category_name' is used; df_categories also carries a
    coarser 'category' column that could be substituted here.
    """
    category_id = _item_category_id(item_id)
    if category_id not in categories:
        categories[category_id] = df_categories.loc[category_id, 'item_category_name']
    return categories[category_id]


def get_category_digitality(item_id):
    """Return the 'digital' flag of the category of ``item_id`` (cached per category id)."""
    category_id = _item_category_id(item_id)
    if category_id not in digital_categories:
        digital_categories[category_id] = df_categories.loc[category_id, 'digital']
    return digital_categories[category_id]

def _cached_shop_value(shop_id, column, cache):
    """Return ``df_shops.loc[shop_id, column]``, memoised in ``cache``."""
    if shop_id not in cache:
        cache[shop_id] = df_shops.loc[shop_id, column]
    return cache[shop_id]

def get_shop_type(shop_id):
    """Return the 'type' value of the shop (cached in ``shop_types``)."""
    return _cached_shop_value(shop_id, 'type', shop_types)

def get_shop_megality(shop_id):
    """Return the 'mega' flag of the shop (cached in ``mega_shops``)."""
    return _cached_shop_value(shop_id, 'mega', mega_shops)

def get_shop_city(shop_id):
    """Return the 'city' value of the shop (cached in ``shop_cities``)."""
    return _cached_shop_value(shop_id, 'city', shop_cities)

In [None]:
# Load the daily sales records and summarise the numeric columns.
df_sales = pd.read_csv("sales_train.csv")
df_sales.describe()

In [None]:
%%time
df_sales = df_sales.loc[df_sales['item_cnt_day']>0]
df_sales['shop_id'] = df_sales['shop_id'].apply(lambda x: valid_shop_id(x))
df_sales['revenue'] = df_sales['item_cnt_day'] * df_sales['item_price']
df_sales = df_sales.groupby(['date_block_num','shop_id','item_id'])[['item_cnt_day', 'revenue']].sum().reset_index()
df_sales.head()

In [None]:
#df_sales.loc[(df_sales['date_block_num']==0) & (df_sales['item_id'] == 2552) & (df_sales['shop_id'] == 25)]
df_shops.loc[59]

In [None]:
%%time
df_sales['shop_type'] = df_sales['shop_id'].apply(lambda x: get_shop_type(x))

In [None]:
%%time
df_sales['category'] = df_sales['item_id'].apply(lambda x: get_category(x))

In [None]:
%%time
df_sales['city'] = df_sales['shop_id'].apply(lambda x: get_shop_city(x))

In [None]:
#Clip
#df_sales['item_cnt_day'] = np.clip(df_sales['item_cnt_day'], 0, 20)

In [None]:
# Calendar month (1-12) from the consecutive month counter.
df_sales['month'] = df_sales['date_block_num'].mod(12) + 1

In [None]:
df_sales.head()

## Cluster Analysis

- shops
- cities
- categories

### Cluster analysis for shops

In [None]:
# Total items sold per month block and shop, plus the calendar month (1-12).
df_sales_items = (
    df_sales
    .groupby(['date_block_num', 'shop_id'], as_index=False)['item_cnt_day']
    .sum()
    .assign(month=lambda d: d['date_block_num'] % 12 + 1)
)
df_sales_items.head()

In [None]:
# For cluster analysis: mean monthly total per calendar month and shop.
df_cluster_sales = (
    df_sales_items
    .groupby(['month', 'shop_id'])['item_cnt_day']
    .mean()
    .reset_index(name='item_cnt_month')
)
df_cluster_sales.head(10)

In [None]:
# For cluster analysis: one row per shop, one column per calendar month.
df_cluster_shops = df_cluster_sales.pivot_table(
    index=['shop_id'],
    columns=['month'],
    values=['item_cnt_month'],
    fill_value=0,
).reset_index()
# Flatten the two-level column labels, e.g. ('item_cnt_month', 1) -> 'item_cnt_month1'.
df_cluster_shops.columns = [f"{name}{month}" for name, month in df_cluster_shops.columns.values]
df_cluster_shops.head()

In [None]:
df_cluster_shops.shape

In [None]:
#cluster analysis
#test_shops = np.sort(df_test['shop_id'].unique())
#df_cluster_shops = df_cluster_shops.loc[df_cluster_shops['shop_id'].isin(test_shops)]
#df_cluster_shops.shape

In [None]:
# Cluster analysis: standardise the monthly columns (shop_id is not a feature).
from sklearn.preprocessing import StandardScaler
df_cluster_without_shops = df_cluster_shops.drop(columns='shop_id')
scaled_sales = StandardScaler().fit_transform(df_cluster_without_shops.to_numpy())
scaled_sales.shape

In [None]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

SEED = 579

# Elbow scan: fit k-means for k = 2 .. Ks-1 and record the inertia of each fit.
Ks = 16
inertia = np.zeros((Ks-2))
for n in range(2,Ks):
    # The `algorithm="full"` argument is omitted: that name was deprecated and
    # later removed from scikit-learn, and the library default is an exact
    # k-means solver. n_init is pinned to 10 (the long-standing default) so the
    # result does not change with the scikit-learn version.
    kmeans = KMeans(init='k-means++', n_clusters=n, n_init=10, random_state=SEED).fit(scaled_sales)
    inertia[n-2] = kmeans.inertia_
inertia

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(2,Ks),inertia,'gx-')
plt.ylabel('Average distance ')
plt.xlabel('Number of clusters (K)')
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals
ax.xaxis.set_major_locator(loc)
#plt.scatter(8, inertia[7], marker='o', color='g', s=100)
#bbox_props = dict(boxstyle="larrow,pad=0.6", fc="white", ec="g", lw=2)
#t = ax.text(11, 20, "Elbow point (K = 8)", ha="center", va="center", rotation=37,
#            size=15,
#            bbox=bbox_props)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# set number of clusters
kclusters = 4

# run k-means clustering
# n_init is pinned to 10: its default differs between scikit-learn versions,
# which would otherwise change the cluster labels for the same seed.
kmeans = KMeans(n_clusters=kclusters, n_init=10, random_state=SEED).fit(scaled_sales)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Reduce the table to shop_id -> shop_cluster.
df_cluster_shops = (
    df_cluster_shops
    .assign(shop_cluster=kmeans.labels_)
    .set_index('shop_id')[['shop_cluster']]
)
df_cluster_shops.head()

In [None]:
# Cache: shop_id -> cluster label.
shop_clusters = {}

def get_shop_cluster(shop_id):
    """Return the cluster label of ``shop_id``, caching it on first use."""
    cluster = shop_clusters.get(shop_id)
    if cluster is None:
        cluster = shop_clusters[shop_id] = df_cluster_shops.loc[shop_id]['shop_cluster']
    return cluster

### Cluster analysis for cities

In [None]:
# df_sales_items (totals per month block and shop) is reused from the shop
# clustering section; only the shop's city is added here.
df_sales_items['city'] = df_sales_items['shop_id'].map(get_shop_city)
df_sales_items.head()

In [None]:
# For cluster analysis: mean monthly total per calendar month and city.
df_cluster_sales = (
    df_sales_items
    .groupby(['month', 'city'])['item_cnt_day']
    .mean()
    .reset_index(name='item_cnt_month')
)
df_cluster_sales.head(10)

In [None]:
# For cluster analysis: one row per city, one column per calendar month.
df_cluster_cities = df_cluster_sales.pivot_table(
    index=['city'],
    columns=['month'],
    values=['item_cnt_month'],
    fill_value=0,
).reset_index()
# Flatten the two-level column labels, e.g. ('item_cnt_month', 1) -> 'item_cnt_month1'.
df_cluster_cities.columns = [f"{name}{month}" for name, month in df_cluster_cities.columns.values]
df_cluster_cities.head()

In [None]:
# Cluster analysis: standardise the monthly columns (city is not a feature).
from sklearn.preprocessing import StandardScaler
df_cluster_without_cities = df_cluster_cities.drop(columns='city')
scaled_sales = StandardScaler().fit_transform(df_cluster_without_cities.to_numpy())
scaled_sales.shape

In [None]:
# import k-means from clustering stage
from sklearn.cluster import KMeans

SEED = 579

# Elbow scan: fit k-means for k = 2 .. Ks-1 and record the inertia of each fit.
Ks = 16
inertia = np.zeros((Ks-2))
for n in range(2,Ks):
    # The `algorithm="full"` argument is omitted: that name was deprecated and
    # later removed from scikit-learn, and the library default is an exact
    # k-means solver. n_init is pinned to 10 (the long-standing default) so the
    # result does not change with the scikit-learn version.
    kmeans = KMeans(init='k-means++', n_clusters=n, n_init=10, random_state=SEED).fit(scaled_sales)
    inertia[n-2] = kmeans.inertia_
inertia

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(2,Ks),inertia,'gx-')
plt.ylabel('Average distance ')
plt.xlabel('Number of clusters (K)')
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals
ax.xaxis.set_major_locator(loc)
#plt.scatter(8, inertia[7], marker='o', color='g', s=100)
#bbox_props = dict(boxstyle="larrow,pad=0.6", fc="white", ec="g", lw=2)
#t = ax.text(11, 20, "Elbow point (K = 8)", ha="center", va="center", rotation=37,
#            size=15,
#            bbox=bbox_props)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# set number of clusters
kclusters = 6

# run k-means clustering 
#kmeans = KMeans(n_clusters=kclusters, random_state=SEED).fit(neighborhood_clustering)
kmeans = KMeans(n_clusters=kclusters, random_state=SEED).fit(scaled_sales)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Attach the KMeans label to each city row, then reduce the frame to a
# city -> city_cluster lookup table.
# NOTE(review): the cell overwrites df_cluster_cities, so it cannot be run twice without
# rebuilding the frame first.
df_cluster_cities.insert(0, 'city_cluster', kmeans.labels_)
df_cluster_cities = df_cluster_cities[['city', 'city_cluster']].set_index('city')
df_cluster_cities.head(10)

In [None]:
# Memo of already resolved lookups: city name -> cluster label
city_clusters = {}

def get_city_cluster(city):
    """Return the cluster label stored for ``city`` in ``df_cluster_cities``.

    The first lookup of a city reads the ``city_cluster`` column of
    ``df_cluster_cities`` and stores the value in ``city_clusters``; later
    calls are answered from that dict.
    """
    if city in city_clusters:
        return city_clusters[city]
    cluster = df_cluster_cities.loc[city]['city_cluster']
    city_clusters[city] = cluster
    return cluster

## Cluster analysis for categories

In [None]:
df_sales_items = df_sales.groupby(['date_block_num','shop_id', 'category'])['item_cnt_day'].sum().reset_index()
df_sales_items['month'] = df_sales_items['date_block_num'] % 12 + 1
df_sales_items.head()

In [None]:
# For cluster analysis: average monthly sales per (calendar month, category).
# The average is taken over df_sales_items - the per-shop monthly totals built in the
# previous cell - so that 'item_cnt_month' really is a monthly quantity, in the same way
# the city profiles are built. Grouping the raw df_sales rows would average individual
# sales records instead.
df_cluster_sales = df_sales_items.groupby(['month','category'])['item_cnt_day'].mean().to_frame(name='item_cnt_month').reset_index()
df_cluster_sales.head(10)

In [None]:
# for clustrer analysis
df_cluster_categories = df_cluster_sales.pivot_table(index=['category'], columns=['month'], values=['item_cnt_month'], fill_value=0)
df_cluster_categories.reset_index(inplace=True)
df_cluster_categories.columns = [ ''.join((name, str(date))) for (name, date) in df_cluster_categories.columns.values]
df_cluster_categories.head()

In [None]:
#cluster analysis
from sklearn.preprocessing import StandardScaler
df_cluster_without_categories = df_cluster_categories.drop('category', axis=1)
scaled_sales = StandardScaler().fit_transform(df_cluster_without_categories.values)
scaled_sales.shape

In [None]:
# Elbow search: fit KMeans on scaled_sales for every cluster count in [2, Ks)
# and record the inertia of each fit.
from sklearn.cluster import KMeans

SEED = 579  # random_state passed to every KMeans fit

# Ks is the exclusive upper bound of the cluster counts that are tried
Ks = 16
inertia = np.zeros((Ks-2))  # slot n-2 holds the inertia for n clusters
for n in range(2,Ks):
    
    # NOTE(review): algorithm="full" is reported as deprecated in newer scikit-learn
    # releases in favour of "lloyd" - confirm against the installed version.
    kmeans = KMeans(init='k-means++', n_clusters=n, random_state=SEED, algorithm="full").fit(scaled_sales)
    inertia[n-2] = kmeans.inertia_
inertia

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as plticker
%matplotlib inline

fig, ax = plt.subplots(figsize=(10, 6))
plt.plot(range(2,Ks),inertia,'gx-')
plt.ylabel('Average distance ')
plt.xlabel('Number of clusters (K)')
loc = plticker.MultipleLocator(base=1.0) # this locator puts ticks at regular intervals
ax.xaxis.set_major_locator(loc)
#plt.scatter(8, inertia[7], marker='o', color='g', s=100)
#bbox_props = dict(boxstyle="larrow,pad=0.6", fc="white", ec="g", lw=2)
#t = ax.text(11, 20, "Elbow point (K = 8)", ha="center", va="center", rotation=37,
#            size=15,
#            bbox=bbox_props)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# set number of clusters
kclusters = 5

# run k-means clustering 
#kmeans = KMeans(n_clusters=kclusters, random_state=SEED).fit(neighborhood_clustering)
kmeans = KMeans(n_clusters=kclusters, random_state=SEED).fit(scaled_sales)

# check cluster labels generated for each row in the dataframe
kmeans.labels_[0:10]

In [None]:
# Attach the KMeans label to each category row, then reduce the frame to a
# category -> category_cluster lookup table.
# NOTE(review): the cell overwrites df_cluster_categories, so it cannot be run twice
# without rebuilding the frame first.
df_cluster_categories.insert(0, 'category_cluster', kmeans.labels_)
df_cluster_categories = df_cluster_categories[['category', 'category_cluster']].set_index('category')
df_cluster_categories.head()

In [None]:
# Memo of already resolved lookups: category -> cluster label
category_clusters = {}

def get_category_cluster(category):
    """Return the cluster label stored for ``category`` in ``df_cluster_categories``.

    The first lookup of a category reads the ``category_cluster`` column of
    ``df_cluster_categories`` and stores the value in ``category_clusters``;
    later calls are answered from that dict.
    """
    if category in category_clusters:
        return category_clusters[category]
    cluster = df_cluster_categories.loc[category]['category_cluster']
    category_clusters[category] = cluster
    return cluster

## Model

In [None]:
#df_monthly_sales = df_sales.groupby(['date_block_num','shop_id','item_id'])[['item_cnt_day', 'revenue']].sum().reset_index()#.to_frame(name='item_cnt_month').reset_index()
#df_monthly_sales = df_sales.groupby(['date_block_num','shop_id','item_id'])['item_cnt_day'].sum().to_frame(name='item_cnt_month').reset_index()
df_monthly_sales = df_sales.copy()
#df_monthly_sales['shop_id'] = df_monthly_sales['shop_id'].apply(lambda x: valid_shop_id(x))
df_monthly_sales = df_monthly_sales.loc[df_monthly_sales['item_cnt_day'] > 0]
df_monthly_sales.head(10)

In [None]:
df_monthly_sales.loc[(df_monthly_sales['date_block_num']==0) & (df_monthly_sales['item_id'] == 2552) & (df_monthly_sales['shop_id'] == 25)]

In [None]:
# clip all cnt
#df_monthly_sales['item_cnt_day'] = np.clip(df_monthly_sales['item_cnt_day'], 0, 20)

In [None]:
df_monthly_sales.describe()

In [None]:
# One row per (shop_id, item_id) and one column per date_block_num holding the number of
# units sold in that month; pairs without sales in a month get 0.
# aggfunc is spelled out because pivot_table defaults to 'mean': if df_monthly_sales has
# several rows for the same shop/item/month, the default would store their average rather
# than the monthly total. With a single row per shop/item/month both give the same value.
df = df_monthly_sales.pivot_table(
    index=['shop_id', 'item_id'],
    columns=['date_block_num'],
    values='item_cnt_day',
    aggfunc='sum',
    fill_value=0,
).reset_index()
df.head()

In [None]:
#df.columns = [ ''.join((name, str(date))) for (name, date) in df.columns.values]
#df['item_cnt_day34'] = np.nan
#df['revenue34'] = np.nan
df[34] = np.nan
df.describe()

In [None]:
# Lag features, one set of columns per month block 0..34:
#   prev<m>      - sales of month m-1
#   prev_diff<m> - prev<m> minus prev<m-1>
#   mean3_<m>    - mean of the three months before m (NaN while m <= 2)
#   mean6_<m>    - mean of the six months before m   (NaN while m <= 5)

# Month 0 has no history at all.
for feature in ('prev', 'prev_diff', 'mean3_', 'mean6_'):
    df[feature + '0'] = np.nan

for month in range(1, 35):
    tag = str(month)

    df['prev' + tag] = df[month - 1]
    df['prev_diff' + tag] = df['prev' + tag] - df['prev' + str(month - 1)]

    # trailing averages, summed from the most recent month backwards
    df['mean3_' + tag] = (sum(df[month - back] for back in range(1, 4)) / 3) if month > 2 else np.nan
    df['mean6_' + tag] = (sum(df[month - back] for back in range(1, 7)) / 6) if month > 5 else np.nan

df.head()

In [None]:
## calc quarts
#for col in range(2, 35):
#    if (col+1)%3 == 0:
#        q = (col+1)//3
#        df[''.join(('Q', str(q)))] = df[col] + df[col-1] + df[col-2]
#        #df_test[''.join(('revenueQ', str(q)))] = df_test[''.join(('revenue', str(col)))] + df_test[''.join(('revenue', str(col-1)))] + df_test[''.join(('revenue', str(col-2)))]
#df.head()

In [None]:
#df_test[[''.join(('itemQ', str(q))) for q in range(1,12)]].describe()
df[[''.join(('prev_diff', str(q))) for q in range(35)]].describe()

In [None]:
df.columns.values

In [None]:
df.describe()

In [None]:
%%time
df['shop_type'] = df['shop_id'].apply(lambda x: get_shop_type(x))

In [None]:
%%time
df['shop_mega'] = df['shop_id'].apply(lambda x: get_shop_megality(x))

In [None]:
%%time
df['shop_cluster'] = df['shop_id'].apply(lambda x: get_shop_cluster(x))

In [None]:
%%time
df['subcategory'] = df['item_id'].apply(lambda x: get_category(x))

In [None]:
%%time
df['digital'] = df['item_id'].apply(lambda x: get_category_digitality(x))

In [None]:
%%time
df['city'] = df['shop_id'].apply(lambda x: get_shop_city(x))

In [None]:
%%time
df['city_cluster'] = df['city'].apply(lambda x: get_city_cluster(x))

In [None]:
%%time
df['category_cluster'] = df['subcategory'].apply(lambda x: get_category_cluster(x))

In [None]:
df['category'] = df['subcategory'].str.split('[-(]', n=0).str[0].str.strip()

In [None]:
%%time
df[item_labels.columns.values] = df['item_id'].apply(lambda x: get_items_features(x))

In [None]:
#%%time
#df_sells_in_month['month'] = df_sells_in_month['date_block_num']%12 + 1

In [None]:
# clip all
#df_sells_in_month['item_cnt_prev_month'] = np.clip(df_sells_in_month['item_cnt_prev_month'], 0, 20)
#df_sells_in_month['item_cnt_month'] = np.clip(df_sells_in_month['item_cnt_month'], 0, 20)

#df_sells_in_month['prev_itemQ'].describe()

In [None]:
df.tail()

In [None]:
df.loc[df['digital'] == 1, 33].describe()

In [None]:
#df[item_labels.columns.values] = df['item_id'].apply(lambda x: get_items_features(x))

In [None]:
df.columns.values

In [None]:
#df_test = pd.merge(df_test, df, on=['shop_id','item_id'], how='left')
#df_test = df_test.fillna(0)
#df_test.drop(['ID'], axis=1, inplace=True)
#df_test.head()

In [None]:
#%%time
#df_test['shop_type'] = df_test['shop_id'].apply(lambda x: get_shop_type(x))
#df_test['shop_mega'] = df_test['shop_id'].apply(lambda x: get_shop_megality(x))
#df_test['shop_cluster'] = df_test['shop_id'].apply(lambda x: get_shop_cluster(x))
#df_test['subcategory'] = df_test['item_id'].apply(lambda x: get_category(x))
#df_test['digital'] = df_test['item_id'].apply(lambda x: get_category_digitality(x))
#df_test['city'] = df_test['shop_id'].apply(lambda x: get_shop_city(x))
#df_test['city_cluster'] = df_test['city'].apply(lambda x: get_city_cluster(x))
#df_test['category_cluster'] = df_test['subcategory'].apply(lambda x: get_category_cluster(x))
#df_test['category'] = df_test['subcategory'].str.split('[-(]', n=0).str[0].str.strip()

In [None]:
def _lag_columns(first, last):
    """Names of the four lag features for every month in ``range(first, last)``.

    The result is ordered month by month, and within a month as
    prev, prev_diff, mean3_, mean6_.
    """
    prefixes = ('prev', 'prev_diff', 'mean3_', 'mean6_')
    return [prefix + str(q) for q in range(first, last) for prefix in prefixes]

# Three equally long windows, each shifted by one month relative to the previous one.
train_cols = _lag_columns(6, 33)
test_cols = _lag_columns(7, 34)
pred_cols = _lag_columns(8, 35)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import r2_score

date_ix = 0

class DataFrameSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that extracts the given columns of a DataFrame as a NumPy array."""

    def __init__(self, attributes_names):
        self.attributes_names = attributes_names

    def fit(self, X, y=None):
        # nothing is learned from the data
        return self

    def transform(self, X):
        selected = X[self.attributes_names]
        return selected.values

class CycleTransformator(BaseEstimator, TransformerMixin):
    """Encode cyclic columns (e.g. month) as a sine/cosine pair.

    ``fit`` records the minimum and maximum of every column in
    ``cycle_columns``. ``transform`` replaces missing values by ``min - 1``
    and maps each value ``v`` to ``sin(2*pi*v / (max + 1))`` and
    ``cos(2*pi*v / (max + 1))``.

    Parameters
    ----------
    cycle_columns : list of str
        Names of the DataFrame columns to encode.
    """

    def __init__(self, cycle_columns):
        # Stored under the constructor-argument name: BaseEstimator.get_params()
        # and clone() look the parameter up as an attribute called `cycle_columns`.
        self.cycle_columns = cycle_columns
        self._cycle_stats = {}

    @property
    def _cycle_columns(self):
        # read-only alias kept for code that used the former private attribute name
        return self.cycle_columns

    def fit(self, X, y=None):
        """Record the min and max of every cyclic column of ``X``; returns ``self``."""
        # rebuilt from scratch so that a refit does not keep stats of earlier columns
        self._cycle_stats = {
            column: {'max': X[column].max(), 'min': X[column].min()}
            for column in self.cycle_columns
        }
        return self

    def transform(self, X, y=None):
        """Return an array with a ``<column>_sin`` and ``<column>_cos`` value per cyclic column.

        The intermediate DataFrame is kept in ``self._df`` so that the generated
        column names can be read back after the call.
        """
        self._df = pd.DataFrame(index=X.index)
        for column in self.cycle_columns:
            stats = self._cycle_stats[column]
            # missing values are mapped to min - 1; the period is max + 1
            angle = 2*np.pi/(stats['max']+1)*X[column].fillna(stats['min']-1)
            self._df[column+'_sin'] = np.sin(angle)
            self._df[column+'_cos'] = np.cos(angle)
        return self._df.values

In [None]:
import sklearn.metrics as metrics
def regression_results(y_true, y_pred):
    """Print a summary of regression metrics for a set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same length as ``y_true``.

    Prints explained variance, mean squared log error, R2, MAE, median
    absolute error, MSE and RMSE, each rounded to 4 decimals. Returns nothing.
    """
    explained_variance = metrics.explained_variance_score(y_true, y_pred)
    mean_absolute_error = metrics.mean_absolute_error(y_true, y_pred)
    mse = metrics.mean_squared_error(y_true, y_pred)
    median_absolute_error = metrics.median_absolute_error(y_true, y_pred)
    r2 = metrics.r2_score(y_true, y_pred)

    print('explained_variance: ', round(explained_variance,4))
    # The log error is only defined for non-negative values and sklearn raises a
    # ValueError otherwise; report it as unavailable instead of aborting the summary.
    if np.min(y_true) < 0 or np.min(y_pred) < 0:
        print('mean_squared_log_error: ', 'n/a (negative values present)')
    else:
        mean_squared_log_error = metrics.mean_squared_log_error(y_true, y_pred)
        print('mean_squared_log_error: ', round(mean_squared_log_error,4))
    print('r2: ', round(r2,4))
    print('MAE: ', round(mean_absolute_error,4))
    print('MedAE: ', round(median_absolute_error,4))
    print('MSE: ', round(mse,4))
    print('RMSE: ', round(np.sqrt(mse),4))

In [None]:
from sklearn.metrics import make_scorer

def rmse(actual, predict):
    """Root mean squared error between ``actual`` and ``predict`` (array-likes of equal shape)."""
    residuals = np.array(predict) - np.array(actual)
    return np.sqrt((residuals ** 2).mean())

rmse_score = make_scorer(rmse, greater_is_better = False)

In [None]:
#num_attribs = ['item_cnt_prev_month','item_cnt_prev_diff','prev_itemQ','item_cnt_prev_year']
#num_attribs = ['prev_month','prev_diff', 'prev_itemQ', 'mean3', 'mean6']
num_attribs = item_labels.columns.values

#num_attribs = ['digital']
#cat_attribs = ['shop_cluster', 'category_cluster']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster']
#cat_attribs = ['shop_type', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['shop_type', 'subcategory', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['shop_type', 'subcategory', 'category', 'city']
cat_attribs = ['city_cluster','shop_cluster', 'category_cluster', 'shop_type', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster', 'shop_type', 'subcategory', 'category', 'city', 'shop_mega', 'digital']
#cat_attribs = ['shop_type', 'category', 'city']
#cat_attribs = ['city_cluster','shop_cluster', 'category_cluster', 'shop_type', 'category']
#cat_attribs = ['city', 'category']
#num_attribs = ['item_id','shop_id','digital']
#cat_attribs = ['category']
date_attribs = ['month']

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
#    ('imputer', SimpleImputer(strategy="median")),
#    ('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', OneHotEncoder(sparse=False, drop='first')),
])
counted_pipeline = Pipeline([
    ('cycle_transformator', CycleTransformator(cycle_columns=date_attribs)),
])

full_pipeline = FeatureUnion(transformer_list=[
    ('num_pipeline', num_pipeline),
    ('cat_pipeline', cat_pipeline),
#    ('counted_pipeline', counted_pipeline),
])

In [None]:
full_pipeline.fit(df)

In [None]:
# Training rows: shop/item pairs with sales in month 32, which is the training target.
X_train = df.loc[df[32] > 0]
X_train_features = full_pipeline.transform(X_train)
X_train_data = X_train[train_cols]
# design matrix: pipeline output first, then the lag columns of the training window
X_train_prep = np.concatenate((X_train_features, X_train_data), axis=1)
X_train_prep.shape

In [None]:
# Evaluation rows: shop/item pairs with sales in month 33, which is the evaluation target.
X_test = df.loc[df[33] > 0]
X_test_features = full_pipeline.transform(X_test)
X_test_data = X_test[test_cols]
# same layout as the training matrix, with the lag window shifted by one month
X_test_prep = np.concatenate((X_test_features, X_test_data), axis=1)
X_test_prep.shape

In [None]:
# Prediction rows: the same shop/item pairs as X_test (sales in month 33), but with the
# lag window shifted once more so that it ends at the features of month 34.
X_pred = df.loc[df[33] > 0]
X_pred_features = full_pipeline.transform(X_pred)
X_pred_data = X_pred[pred_cols]
X_pred_prep = np.concatenate((X_pred_features, X_pred_data), axis=1)
X_pred_prep.shape

In [None]:
Y_train = np.clip(X_train[32], 0, 20)
Y_test = np.clip(X_test[33], 0, 20)

In [None]:
#from sklearn.preprocessing import StandardScaler

#sc = StandardScaler()
##X_train = sc.fit_transform(X_train_data)
#X_train = np.concatenate((X_prepared, X_train_data), axis=1)
##X_train = X_train_data
#X_train.shape

In [None]:
#
#X_test = sc.transform(X_test_data)
#X_test = X_test_data
#X_test = np.concatenate((X_prepared, X_test_data), axis=1)
#X_test.shape

In [None]:
#X_pred = sc.transform(X_pred_data)
#X_pred = X_pred_data
#X_pred = np.concatenate((X_prepared_pred, X_pred_data), axis=1)
#X_pred.shape

In [None]:
#from sklearn.preprocessing import PolynomialFeatures

#poly = PolynomialFeatures(degree=2)
#X_train_prepared = poly.fit_transform(X_train_prepared)
#X_test_prepared = poly.transform(X_test_prepared)
#X_train_prepared.shape

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error

def display_scores(scores):
    """Print an array of scores followed by its mean and standard deviation."""
    print("Scores:", scores)
    average = scores.mean()
    print("Mean:", average)
    spread = scores.std()
    print("Std Deviation:", spread)

In [None]:
%%time

from sklearn.tree import DecisionTreeRegressor

# Decision tree fitted on the training matrix (target: clipped sales of month 32)
tree_reg = DecisionTreeRegressor(random_state=57)
tree_reg.fit(X_train_prep, Y_train)

# in-sample predictions, used only for the training R2 printed below
predictions = tree_reg.predict(X_train_prep)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# evaluation against the clipped sales of month 33
print('Testing...')
Y_pred = tree_reg.predict(X_test_prep)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Random forest fitted on the training matrix (target: clipped sales of month 32).
# forest_reg is also the model used further down to produce the month-34 predictions.
forest_reg = RandomForestRegressor(n_estimators=250, random_state=57, n_jobs=4, verbose=1)
print('Fitting...')
forest_reg.fit(X_train_prep, Y_train)
# in-sample predictions, used only for the training R2 printed below
print('Predicting...')
predictions = forest_reg.predict(X_train_prep)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# evaluation against the clipped sales of month 33
print('Testing...')
Y_pred = forest_reg.predict(X_test_prep)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
#feature_importances = 
#forest_reg.feature_importances_

In [None]:
'''
#cat_encoder = cat_pipeline.named_steps["cat_encoder"]
#cat_one_hot_attribs = list(cat_encoder.categories_[0]) + list(cat_encoder.categories_[1]) + list(cat_encoder.categories_[2])
counted_encoder = counted_pipeline.named_steps["cycle_transformator"]
counted_attribs = list(counted_encoder._df.columns)
#attributes = num_attribs + cat_one_hot_attribs + counted_attribs
attributes = num_attribs + counted_attribs
sorted(zip(feature_importances, attributes), reverse=True)
'''

In [None]:
%%time
# Lasso regression fitted on the training matrix (target: clipped sales of month 32)
from sklearn.linear_model import Lasso

# regularisation strength; the ElasticNet cell below reuses this variable
alpha = 0.1
lasso = Lasso(alpha=alpha)

lasso.fit(X_train_prep, Y_train)
# in-sample predictions, used only for the training R2 printed below
predictions = lasso.predict(X_train_prep)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# evaluation against the clipped sales of month 33
print('Testing...')
Y_pred = lasso.predict(X_test_prep)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
# ElasticNet regression fitted on the training matrix (target: clipped sales of month 32)
from sklearn.linear_model import ElasticNet

# `alpha` is not assigned in this cell: it is the value set in the Lasso cell (0.1),
# so that cell has to be run first.
enet = ElasticNet(alpha=alpha, l1_ratio=0.8)
enet.fit(X_train_prep, Y_train)
# in-sample predictions, used only for the training R2 printed below
predictions = enet.predict(X_train_prep)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# evaluation against the clipped sales of month 33
print('Testing...')
Y_pred = enet.predict(X_test_prep)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
%%time
from xgboost import XGBRegressor

# Gradient-boosted trees fitted on the training matrix (target: clipped sales of month 32)
boost_reg = XGBRegressor(random_state=57, verbosity=1)
print('Fitting...')
boost_reg.fit(X_train_prep, Y_train)
# in-sample predictions, used only for the training R2 printed below
print('Predicting...')
predictions = boost_reg.predict(X_train_prep)

print("R2-score: %.2f" % r2_score(Y_train, predictions) )

# evaluation against the clipped sales of month 33
print('Testing...')
Y_pred = boost_reg.predict(X_test_prep)
print("R2-score: %.2f" % r2_score(Y_test, Y_pred) )
print("MSE: %.6f" % mean_squared_error(Y_test, Y_pred))

In [None]:
#df_test['34_scaled'] = (20*(df_test[34] - np.min(df_test[34]))/np.ptp(df_test[34]))   
#df_test['34'] = np.clip(df_test[34], 0, 20)

In [None]:
#X_prepared = full_pipeline.transform(X_pred)
#
#Y_pred = lasso.predict(X_pred)
Y_pred = forest_reg.predict(X_pred_prep)
#Y_pred = tree_reg.predict(X_pred)

In [None]:
df.loc[df[33]>0, 34] = Y_pred

In [None]:
# Bring every column of df (including the predicted month 34) onto the test pairs.
# Test pairs that have no row in df get 0 in every merged column, i.e. a prediction of 0.
# NOTE(review): df_test is overwritten with the merged frame, so running this cell a second
# time merges df into it again - reload df_test before re-running.
df_test = pd.merge(df_test, df, on=['shop_id','item_id'], how='left')
df_test = df_test.fillna(0)
df_test.head()

In [None]:
df_test['item_cnt_month']= df_test[34]
df_test[['ID', 'item_cnt_month']].to_csv('submission108_2.csv', index=False)

In [None]:
#df_submission = pd.read_csv('sample_submission.csv')
#df_submission['item_cnt_month'] = df_test['34_scaled']
#df_submission['item_cnt_month'] = np.clip(df_prediction['item_cnt_month'], 0, 20)
#df_submission['item_cnt_month'] = Y_pred
#df_submission['item_cnt_month'] = np.clip(Y_pred, 0, 20)
#df_submission['item_cnt_month'] = Y_pred
#df_submission.to_csv('submission104_12.csv', index=False)
#df_submission.head()

In [None]:
# Distribution of the submitted predictions. The values written to the CSV live in
# df_test; df_submission is only assigned in commented-out code.
df_test['item_cnt_month'].describe()

In [None]:
# Number of test pairs with a non-zero prediction. The values written to the CSV live in
# df_test; df_submission is only assigned in commented-out code.
df_test[df_test['item_cnt_month']>0].count()

In [None]:
print(df_test[df_test[32]>0].count())
df_test[32].describe()

submission104_1.csv
a few seconds ago by Andrey Vest

RandomForest, merge test, previous , cluster features
1.22073

submission104_6.csv
21 minutes ago by Andrey Vest

Tree, merge test, previous , all features - cluster features + item features
1.17881

submission104_7.csv
a few seconds ago by Andrey Vest

Lasso, merge test, previous , all features - cluster features + item features + stdScale
1.22322

submission104_11.csv
2 minutes ago by Andrey Vest

Forest, merge test, previous , all features - cluster features + item features, trained on full data
1.15678