# Read data from `data/rus2016_cat_wd.csv`

In [2]:
import pandas as pd
data = pd.read_csv('data/rus2016_cat_wd.csv',
                   header=0,
                   names=['cat', 'name'], 
                   dtype={'cat': 'category'},
                  )

# Drop rows with NaN values
data = data.dropna()

# Preprocess labels

In [12]:
import re
from functools import reduce, partial
import numpy as np
from nltk.corpus import stopwords
from Stemmer import Stemmer

def create_replace(pattern, sub):
    p = re.compile(pattern)
    return partial(p.sub, sub)

def create_stem():
    stemmer = Stemmer("russian")
    return lambda s: " ".join(stemmer.stemWords(s.split()))

def create_transform_quoted():
    p = re.compile(r'"([^"]+)"')
    return partial(p.sub, lambda m: m.group(1).replace(" ", ""))

def create_remove_stopwords():
    russian_stopwords = stopwords.words('russian')
    russian_stopwords.extend(['что', 'это', 'так', 'вот', 'быть', 'как', 'в', 'к', 'на', 'с'])
    p = re.compile(r'\b('+r'|'.join(russian_stopwords)+')\b\s*')
    return partial(p.sub, '')

def compose(*functions):
    return reduce(lambda f, g: lambda x: f(g(x)), reversed(functions))
    
preprocessor = compose(
    # lowercase
    lambda s: s.lower(),
    # replace underscores with spaces
    create_replace(r"_", " "),
    # replace dimensions
    create_replace(r"[\d.]+x[\d.]+(x[\d.]+)? ?([мс]?м|[mc]?m)", "dimenisions"),
    # replace lengths
    create_replace(r"[\d.]+ ?([мс]?м|[mc]?m)", "length"),
    # replace weights
    create_replace(r"[\d.]+ ?([мк]?г|\w*грамм|[mk]?g)", "weight"),
    # replace volumes
    create_replace(r"[\d.]+ ?([м]?л|[m]?l)", "volume"),
    # replace quantities
    create_replace(r"[\d.]+ ?шт", "quantity"),
    # replace percentages
    create_replace(r"[\d.]+ ?%", "percentage"),
    # transform quoted expressions into a signle token
#     create_transform_quoted(),
    # remove stop words
    create_remove_stopwords(),
    # stem
    create_stem(),
    # replace numbers
    create_replace(r"[\d.]+", "number"),
    # remove special characters
#     create_replace(r"[^\w ]+", ""),
)

%time data['processed'] = data['name'].apply(preprocessor)

# Preview results
data[data['processed'].notnull()].sample(50)

CPU times: user 1min 3s, sys: 46 ms, total: 1min 3s
Wall time: 1min 3s


Unnamed: 0,cat,name,processed
589236,212,картофель молодой,картофел молод
450949,303,Пакет,пакет
1524111,55,масло 1 шт,масл quantity
1669994,95,"Кефир ""веселый молочник"" снежок 2,5% 475 г","кефир ""весел молочник"" снежок number,percentag..."
280713,213,яблоко,яблок
1280285,184,Грудка цыпленка,грудк цыпленк
484134,50,"ржаной хлеб ""Смак"" бабушкин 1 шт","ржан хлеб ""смак"" бабушкин quantity"
834126,134,"палпи ""добрый"" 0.9 л","палп ""добрый"" volume"
687014,198,"Весна салат, огурцы, редис, зелень","весн салат, огурцы, редис, зелен"
662422,212,Капуста Белокачанная,капуст белокача


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
%time feature_transform = vectorizer.fit(data['name'].values.astype('U'))
print('Number of features before processing: %s' % len(feature_transform.get_feature_names()))

vectorizer = CountVectorizer()
%time feature_transform = vectorizer.fit(data['processed'].values.astype('U'))
print('Number of features after processing: %s' % len(feature_transform.get_feature_names()))

CPU times: user 30.6 s, sys: 694 ms, total: 31.3 s
Wall time: 31.3 s
Number of features before processing: 176158
CPU times: user 30.6 s, sys: 984 ms, total: 31.6 s
Wall time: 31.6 s
Number of features after processing: 162175


In [10]:
feature_transform.get_feature_names()

['_number',
 '_numberd',
 '_говядин',
 '_хемофарм',
 'aa',
 'aaa',
 'aaaaa',
 'aaalrnumber',
 'aad',
 'aadelmadeinkirgyzstan',
 'aadnumber',
 'aahrvolumeengthah',
 'aalrnumber',
 'aalrweight',
 'aanumber',
 'aanumbernumberq',
 'aanumberv',
 'aapollo',
 'aasha',
 'ab',
 'abai',
 'abasin',
 'abb',
 'abba',
 'abbaрус',
 'abbi',
 'abbie',
 'abbot',
 'abbott',
 'abbronzante',
 'abc',
 'abcdesign',
 'abcent',
 'abcp',
 'abcспец',
 'abdicompany',
 'abeona',
 'abf',
 'abilita',
 'abitare',
 'abkhaz',
 'abkt',
 'abnumber',
 'about',
 'abranet',
 'abrasador',
 'abrezza',
 'abro',
 'abromasters',
 'abronumber',
 'abroесnumber',
 'abs',
 'absbar',
 'abshabby',
 'absint',
 'absinth',
 'absnumbernnumber',
 'absolue',
 'absolut',
 'absolutcream',
 'absolute',
 'absolutechram',
 'absolutelygorgeous',
 'absolutelygorgeousorange',
 'absolutex',
 'absoluttopnumber',
 'absorb',
 'abstraction',
 'absпластик',
 'abtoys',
 'abx',
 'ac',
 'acacia',
 'academia',
 'academy',
 'academysstyle',
 'academystile',
 