In [2]:
# importing libraries

import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
# Load the translated category and item tables used by both exercises.
# NOTE(review): the categories file has an .xls extension but is parsed with
# read_csv — presumably it is CSV text with a misleading name; confirm.

categories = pd.read_csv('../datasets/predict_future_sales/translated-categories.xls')
items = pd.read_csv('../datasets/predict_future_sales/translated-items.csv')

### Exercise 1: Extract the main category, subcategory and sub-subcategory from the item_category_name column (item_categories.csv table)

In [63]:
categories

Unnamed: 0,category_name,category_id
0,PC - Headsets / Headphones,0
1,Accessories - PS2,1
2,Accessories - PS3,2
3,Accessories - PS4,3
4,Accessories - PSP,4
...,...,...
79,Service,79
80,Service - Tickets,80
81,Blank media (spire),81
82,Blank media (piece),82


In [64]:
def main_cat(cat_name):
    """Return the leading (main) part of a category name.

    When the name contains a '-', the text before the first '-' is returned;
    otherwise, when it contains a '(', the text before the first '(' is
    returned. In both cases surrounding whitespace is stripped. A name with
    neither delimiter is returned unchanged.
    """
    # '-' takes precedence over '(' when both are present.
    for delimiter in ('-', '('):
        if delimiter in cat_name:
            head, _, _ = cat_name.partition(delimiter)
            return head.strip()
    return cat_name
    
# Add a 'main_cat' column by applying main_cat to every category name.
categories['main_cat'] = categories['category_name'].apply(main_cat)
    
categories

Unnamed: 0,category_name,category_id,main_cat
0,PC - Headsets / Headphones,0,PC
1,Accessories - PS2,1,Accessories
2,Accessories - PS3,2,Accessories
3,Accessories - PS4,3,Accessories
4,Accessories - PSP,4,Accessories
...,...,...,...
79,Service,79,Service
80,Service - Tickets,80,Service
81,Blank media (spire),81,Blank media
82,Blank media (piece),82,Blank media


In [65]:
def sub_cat(cat_name):
    """Return the subcategory of a category name, or None if it has none.

    The subcategory is the text after the first '-' and before any
    parenthesised qualifier; for example
    'Payment cards - Windows (Digital)' yields 'Windows'.

    Parameters
    ----------
    cat_name : str
        Category name such as 'Accessories - PS2'.

    Returns
    -------
    str or None
        The whitespace-stripped subcategory, or None when the name contains
        no '-' or nothing is left once the qualifier is removed.
    """
    if '-' not in cat_name:
        return None
    # Split on the first hyphen only, so a hyphenated subcategory such as
    # 'Blu-Ray' is kept whole instead of being cut at its own hyphen.
    _, _, remainder = cat_name.partition('-')
    # Drop the parenthesised qualifier but keep every word in front of it
    # ('Audio books (Digital)' -> 'Audio books', not just 'Audio').
    sub = remainder.partition('(')[0].strip()
    # An empty remainder (e.g. 'Foo - (bar)') means there is no subcategory.
    return sub or None

# Add a 'sub_cat' column by applying sub_cat to every category name.
categories['sub_cat'] = categories['category_name'].apply(sub_cat)

categories

Unnamed: 0,category_name,category_id,main_cat,sub_cat
0,PC - Headsets / Headphones,0,PC,Headsets / Headphones
1,Accessories - PS2,1,Accessories,PS2
2,Accessories - PS3,2,Accessories,PS3
3,Accessories - PS4,3,Accessories,PS4
4,Accessories - PSP,4,Accessories,PSP
...,...,...,...,...
79,Service,79,Service,
80,Service - Tickets,80,Service,Tickets
81,Blank media (spire),81,Blank media,
82,Blank media (piece),82,Blank media,


In [66]:
def subsub_cat(cat_name):
    if '(' in cat_name:
        return cat_name.split('(')[1].strip().strip(')')
    
# Add a 'subsub_cat' column by applying subsub_cat to every category name.
categories['subsub_cat'] = categories['category_name'].apply(subsub_cat)

categories

Unnamed: 0,category_name,category_id,main_cat,sub_cat,subsub_cat
0,PC - Headsets / Headphones,0,PC,Headsets / Headphones,
1,Accessories - PS2,1,Accessories,PS2,
2,Accessories - PS3,2,Accessories,PS3,
3,Accessories - PS4,3,Accessories,PS4,
4,Accessories - PSP,4,Accessories,PSP,
...,...,...,...,...,...
79,Service,79,Service,,
80,Service - Tickets,80,Service,Tickets,
81,Blank media (spire),81,Blank media,,spire
82,Blank media (piece),82,Blank media,,piece


In [67]:
categories.iloc[[36]]

Unnamed: 0,category_name,category_id,main_cat,sub_cat,subsub_cat
36,Payment cards - Windows (Digital),36,Payment cards,Windows,Digital


### Exercise 2:
Apply:

    CountVectorizer (1gram)
    CountVectorizer (1gram + 2grams)
    TfidfVectorizer (1gram)
    TfidfVectorizer (1gram + 2grams)

to the item_name column (items.csv table).

In [68]:
items

Unnamed: 0,item_id,category_id,item_name
0,0,40,!! IN THE POWER OF HAPPINESS (PLAST) D
1,1,76,! ABBYY FineReader 12 Professional Edition Ful...
2,2,40,*** IN THE GLORY OF THE GLORY (UNV) D
3,3,40,*** BLUE WAVE (Univ) D
4,4,40,*** BOX (GLASS) D
...,...,...,...
22165,22165,31,"Nuclear Titbit 2 [PC, Digital Version]"
22166,22166,54,Query language 1C: Enterprise [Digital version]
22167,22167,49,The query language is 1C: Enterprise 8 (+ CD)....
22168,22168,62,Egg for Little Inu


In [69]:
def remove_special_char(name):
    """Remove the '!', '*', '/' and '+' characters from an item name.

    Each of these characters is replaced by a space rather than deleted, so
    that words joined by one of them (e.g. 'DVD/CD') remain separate tokens
    for the vectorizers. Runs of whitespace are then collapsed to a single
    space and the result is trimmed.

    Parameters
    ----------
    name : str
        Raw item name.

    Returns
    -------
    str
        The cleaned item name.
    """
    import re  # local import keeps the function self-contained

    # Inside a character class '+' is a literal, so listing it once suffices.
    spaced = re.sub(r'[!*/+]', ' ', name)
    # Collapse the gaps left by the substitution and trim both ends.
    return ' '.join(spaced.split())
    
# Overwrite 'item_name' with the cleaned names produced by remove_special_char.
items['item_name'] = items['item_name'].apply(remove_special_char)

items

Unnamed: 0,item_id,category_id,item_name
0,0,40,IN THE POWER OF HAPPINESS (PLAST) D
1,1,76,ABBYY FineReader 12 Professional Edition Full ...
2,2,40,IN THE GLORY OF THE GLORY (UNV) D
3,3,40,BLUE WAVE (Univ) D
4,4,40,BOX (GLASS) D
...,...,...,...
22165,22165,31,"Nuclear Titbit 2 [PC, Digital Version]"
22166,22166,54,Query language 1C: Enterprise [Digital version]
22167,22167,49,The query language is 1C: Enterprise 8 ( CD). ...
22168,22168,62,Egg for Little Inu


In [70]:
# Plain Python list of item names, used as the corpus for the vectorizers below.
items_list = items['item_name'].tolist()

In [71]:
items[items['item_name'].str.contains('00002')]

Unnamed: 0,item_id,category_id,item_name
7887,7887,6,X360: Wireless Headset - Wireless Headset (P6F...
7893,7893,6,X360: Gamepad Wireless Black - Wireless Contro...
7894,7894,6,X360: Gamepad wired black - Controller BLACK (...
7895,7895,6,X360: Charger Kit for Black Gamepad - BLACK Ch...
7900,7900,6,XBOX 360 HEADSET T XBOX 360 YU HDWR (P5F-00002)
7936,7936,7,"XOne: Stereo Headset for Xbox One ""Green Camou..."
21271,21271,61,T-shirt WOT IS-4-4 male gray L (100002)


In [3]:
# testing regex
# The pattern only matches a '(' that is immediately followed by a digit, so
# this sample, whose parenthesised code starts with 'P', comes back unchanged.

string = "XBOX 360 HEADSET T XBOX 360 YU HDWR (P5F-00002)"
# Raw string: '\(' and '\d' are invalid escape sequences in a normal literal.
re.sub(r'\(\d+.*\)', '', string)

'XBOX 360 HEADSET T XBOX 360 YU HDWR (P5F-00002)'

**CountVectorizer (1gram)**

In [73]:
# Unigram bag-of-words counts for every item name.
bow = CountVectorizer()
items_bow = bow.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
tokens = bow.get_feature_names_out()
# Keep the document-term matrix sparse: converting it with .toarray() would
# allocate one dense cell per (item, token) pair.
df_items_bow_1 = pd.DataFrame.sparse.from_spmatrix(items_bow, index=items['item_id'], columns=tokens)

df_items_bow_1

Unnamed: 0_level_0,00,000,00002,00003,00005,00006,00007,00008,00009h,00010,...,zverushki,zvongo,zykina,zz,çudovi,épilog,šapka,širli,žanna,žel
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**CountVectorizer (1gram + 2grams)**

In [74]:
# Unigram + bigram bag-of-words counts, stored as int8 to limit memory use.
d_bow = CountVectorizer(ngram_range=(1, 2), dtype='int8')
d_items_bow = d_bow.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
d_tokens = d_bow.get_feature_names_out()
# Keep the document-term matrix sparse: the bigram vocabulary is far larger
# than the unigram one, so a dense .toarray() copy would be very large.
df_items_bow_2 = pd.DataFrame.sparse.from_spmatrix(d_items_bow, index=items['item_id'], columns=d_tokens)

df_items_bow_2

Unnamed: 0_level_0,00,00 07,000,000 accessories,000 dawn,000 kill,000 leagues,000 miniatures,000 points,000 rulebook,...,épilog,épilog cd,šapka,šapka nevidimka,širli,širli myrli,žanna,žanna grand,žel,žel boks
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22165,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22166,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22167,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22168,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


**TfidfVectorizer (1gram)**

In [None]:
# Unigram TF-IDF weights for every item name.
tfidf = TfidfVectorizer()
items_tfidf = tfidf.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
tfidf_tokens = tfidf.get_feature_names_out()
# Keep the matrix sparse instead of densifying it with .toarray(), which
# would allocate one float per (item, token) pair.
df_tfidf_1 = pd.DataFrame.sparse.from_spmatrix(items_tfidf, index=items['item_id'], columns=tfidf_tokens)
df_tfidf_1

**TfidfVectorizer (1gram + 2grams)**

In [None]:
# Unigram + bigram TF-IDF weights, stored as float32 to limit memory use.
d_tfidf = TfidfVectorizer(ngram_range=(1, 2), dtype='float32')
d_items_tfidf = d_tfidf.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
d_tfidf_tokens = d_tfidf.get_feature_names_out()
# Keep the matrix sparse instead of densifying it with .toarray(); the
# bigram vocabulary makes a dense float copy very large.
df_tfidf_2 = pd.DataFrame.sparse.from_spmatrix(d_items_tfidf, index=items['item_id'], columns=d_tfidf_tokens)
df_tfidf_2

**Playing with hyperparameters**

In [None]:
# Unigram + bigram counts, ignoring terms that occur in more than 6% of the
# item names (max_df=0.06).
p_bow = CountVectorizer(ngram_range=(1, 2), dtype='int8', max_df=0.06)
p_items_bow = p_bow.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
p_tokens = p_bow.get_feature_names_out()
# Keep the document-term matrix sparse rather than densifying it with
# .toarray().
p_df_bow = pd.DataFrame.sparse.from_spmatrix(p_items_bow, index=items['item_id'], columns=p_tokens)

p_df_bow

In [None]:
# Unigram + bigram term-frequency weights without IDF scaling, keeping only
# terms that occur in at least 9% of the item names (min_df=0.09).
# NOTE(review): fit_transform raises ValueError if no term reaches that
# document frequency — confirm the threshold is intended for short item names.
p_tfidf = TfidfVectorizer(ngram_range=(1, 2), dtype='float32', use_idf=False, min_df=0.09)
p_items_tfidf = p_tfidf.fit_transform(items_list)
# get_feature_names() was removed in scikit-learn 1.2; use its replacement.
p_tfidf_tokens = p_tfidf.get_feature_names_out()
# Keep the matrix sparse rather than densifying it with .toarray().
p_df_tfidf = pd.DataFrame.sparse.from_spmatrix(p_items_tfidf, index=items['item_id'], columns=p_tfidf_tokens)
p_df_tfidf