In [1]:
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords 
import re
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
import pickle
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
from gensim.sklearn_api import D2VTransformer
from sklearn.model_selection import train_test_split

import nltk
import string

from sklearn.metrics import f1_score

In [2]:
def clean_name(name):
    """Normalise a raw item name into a string of stemmed tokens.

    Every non-word character is replaced by a space, then underscores,
    digits and the listed punctuation marks are replaced as well. The
    lower-cased, stripped result is passed through ``tokenize`` and each
    token is emitted with a single leading space.

    Returns
    -------
    str
        ``' tok1 tok2 ...'`` (note the leading space), or ``''`` when the
        name contains no tokens.
    """
    without_symbols = re.sub(r'[^\w]', ' ', name)
    scrubbed = re.sub(r'[,:._\-\[\]\d]', ' ', without_symbols)
    normalised = scrubbed.lower().strip()
    # Each token keeps its own leading space, so the result starts with ' '.
    return ''.join(' ' + token for token in tokenize(normalised))

def tokenize_text(text):
    """Return the lower-cased word tokens of ``text``, skipping short ones.

    The text is split into sentences and then words with NLTK; tokens of
    fewer than three characters are dropped.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 3
    ]

def tokenize(text):
    """Lazily yield the English Snowball stem of each word token in ``text``.

    The text is lower-cased before tokenisation. A token is skipped when
    ``token in string.punctuation`` is true; because that is a substring
    test on a string, it also skips multi-character tokens that happen to
    be a contiguous slice of ``string.punctuation``.
    """
    stemmer = nltk.stem.SnowballStemmer('english')
    lowered = text.lower()

    yield from (
        stemmer.stem(word)
        for word in nltk.word_tokenize(lowered)
        if word not in string.punctuation
    )

def find_outlets(clf, X_train, data=None):
    """Count misclassified rows per category for a fitted classifier.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict``.
    X_train : array-like or sparse matrix
        Features passed to ``clf.predict``.
    data : pandas.DataFrame, optional
        Frame with a ``category_id`` column whose rows correspond one-to-one,
        in order, to the rows of ``X_train``. When omitted, the notebook-level
        ``train`` frame is used (the only behaviour of the earlier version),
        which is correct only if ``X_train`` was built from all of ``train``.

    Returns
    -------
    tuple
        ``(errors_per_category, labelled)``: a Series of misclassification
        counts indexed by ``category_id`` and sorted ascending, and a copy of
        ``data`` with an added ``pred`` column.

    Raises
    ------
    ValueError
        If the number of predictions differs from the number of rows in
        ``data``.
    """
    if data is None:
        # Backward-compatible fallback to the global frame.
        data = train

    pred = clf.predict(X_train)
    if len(pred) != len(data):
        raise ValueError(
            'got %d predictions for %d rows; pass the frame that matches '
            'X_train via `data`' % (len(pred), len(data))
        )

    labelled = data.copy()
    labelled['pred'] = pred

    wrong = labelled[labelled.category_id != labelled.pred]
    # .size() always yields a Series, even when nothing was misclassified.
    errors_per_category = wrong.groupby('category_id').size().sort_values()
    return errors_per_category, labelled

def write_submit(tfd_clf, cnv_clf, tfd_freq, cnv_freq):
    pickle.dump(tfd_clf, open('submit_names/tfd_clf', 'wb'))
    pickle.dump(cnv_clf, open('submit_names/cnv_clf', 'wb'))
    pickle.dump(tfd_freq, open('submit_names/tfd_freq', 'wb'))
    pickle.dump(cnv_freq, open('submit_names/cnv_freq', 'wb'))
    !cd submit_names;zip submit_names.zip *  
    
def read_and_clean_data(path='data_fusion_train.parquet', random_state=42):
    """Load the training parquet file and return cleaned, shuffled rows.

    Steps: drop rows whose ``category_id`` is -1, drop duplicate
    ``item_name`` values, normalise ``item_name`` with ``clean_name``,
    replace empty strings (in any column) with NaN, and drop rows whose
    ``item_name`` is NaN.

    Parameters
    ----------
    path : str, optional
        Parquet file to read. Defaults to the file name that was previously
        hard-coded.
    random_state : int or None, optional
        Seed for the final shuffle, so that downstream splits and scores are
        reproducible across kernel restarts. Pass ``None`` for an unseeded
        shuffle.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows in shuffled order, with their original index labels.
    """
    raw = pd.read_parquet(path)
    labelled = raw[raw.category_id != -1].drop_duplicates('item_name')

    cleaned = (
        labelled
        .assign(item_name=labelled['item_name'].apply(clean_name))
        # clean_name returns '' for names with no tokens; turn those into NaN
        # so that dropna removes them.
        .replace('', np.nan)
        .dropna(subset=['item_name'])
    )

    return cleaned.sample(frac=1, random_state=random_state)

def _cross_validate_and_fit(features, y):
    """Print 3-fold weighted-F1 scores for a new LinearSVC, then fit it on all rows."""
    splitter = KFold(3, shuffle=True, random_state=0)
    classifier = LinearSVC()
    print(cross_val_score(classifier, features, y, cv=splitter, scoring='f1_weighted'))
    classifier.fit(features, y)
    return classifier


def model_train(train, y):
    """Fit two text classifiers on ``train.item_name``.

    The first uses TF-IDF over character 3- to 5-grams; the second uses
    binary word counts with NLTK's Russian stop words removed. For each, the
    cross-validation scores are printed before the final fit.

    NOTE: each vectorizer is fitted on all of ``train`` before
    cross-validation, so the printed scores are computed on features whose
    vocabulary was built from the validation folds as well.

    Returns
    -------
    tuple
        ``(tfd_clf, cnv_clf, tfd_freq, cnv_freq)``: the two fitted
        classifiers followed by their fitted vectorizers.
    """
    russian_stop_words = stopwords.words('russian')

    tfd_freq = TfidfVectorizer(
        ngram_range=(3, 5),
        analyzer="char",
        max_features=300000,
        binary=False,
    )
    char_features = tfd_freq.fit_transform(train.item_name)
    tfd_clf = _cross_validate_and_fit(char_features, y)

    cnv_freq = CountVectorizer(stop_words=russian_stop_words, binary=True)
    word_features = cnv_freq.fit_transform(train.item_name)
    cnv_clf = _cross_validate_and_fit(word_features, y)

    return tfd_clf, cnv_clf, tfd_freq, cnv_freq

def model_predict(test, tfd_clf, cnv_clf, tfd_freq, cnv_freq):
    """Predict one label per row by taking the more confident of two models.

    For every row, each classifier's top probability is compared. The TF-IDF
    model's label is used when its top probability is strictly greater;
    otherwise the count-vectorizer model's label is used.

    Parameters
    ----------
    test : object with an ``item_name`` attribute (e.g. a DataFrame)
        Item names to classify.
    tfd_clf, cnv_clf : fitted classifiers
        Must expose ``_predict_proba_lr`` and ``classes_``.
    tfd_freq, cnv_freq : fitted vectorizers
        Must expose ``transform``.

    Returns
    -------
    list
        One scalar class label per input row. The previous version appended
        the result of indexing ``classes_`` with an ``np.where`` tuple, which
        produced a 1-element array per row and a longer array whenever a
        row's top probability was tied. Ties within a model now resolve to
        the first tied class.
    """
    # NOTE(review): _predict_proba_lr is a private method of the classifier;
    # it is kept because the comparison logic depends on its output.
    p1 = np.asarray(tfd_clf._predict_proba_lr(tfd_freq.transform(test.item_name)))
    p2 = np.asarray(cnv_clf._predict_proba_lr(cnv_freq.transform(test.item_name)))

    # argmax returns exactly one column index per row, even on ties.
    top1 = p1.argmax(axis=1)
    top2 = p2.argmax(axis=1)
    rows = np.arange(len(p1))

    prefer_tfd = p1[rows, top1] > p2[rows, top2]
    labels = np.where(
        prefer_tfd,
        np.asarray(tfd_clf.classes_)[top1],
        np.asarray(cnv_clf.classes_)[top2],
    )
    return labels.tolist()
    

In [3]:
# Load the labelled, de-duplicated, cleaned and shuffled training frame.
train = read_and_clean_data()
# Hold out 20% of the rows for evaluation. The feature side is kept as a
# one-column DataFrame so the helpers can read `.item_name` from it.
X_train, X_test, y_train, y_test = train_test_split(train[['item_name']], train['category_id'], test_size=0.2, random_state=42)

In [4]:
# Fit both vectorizer/classifier pairs on the training split; the
# cross-validation scores of each model are printed as a side effect.
tfd_clf, cnv_clf, tfd_freq, cnv_freq = model_train(X_train, y_train)

[0.81723027 0.81978504 0.82317881]
[0.78982615 0.79137929 0.79508493]


In [5]:
# Combine the two models on the held-out split and report the weighted F1
# score of the combined predictions.
preds = model_predict(X_test, tfd_clf, cnv_clf, tfd_freq, cnv_freq)
f1_score(y_test, preds, average = 'weighted')

0.8306112839959083

In [6]:
# Pickle the fitted classifiers and vectorizers into submit_names/ and zip
# the contents of that directory.
write_submit(tfd_clf, cnv_clf, tfd_freq, cnv_freq)

updating: cnv_clf (deflated 75%)
updating: cnv_freq (deflated 51%)
updating: script.py (deflated 58%)
updating: tfd_clf (deflated 79%)
updating: tfd_freq (deflated 75%)
updating: clf_task1 (deflated 80%)
updating: tfidf (deflated 64%)
