In [None]:
import pandas as pd
import numpy as np
import nltk
import re
import pickle
from nltk.corpus import stopwords
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

import sys
sys.path.append('../../../github/tips-triks/utils/text')
import  txtfeat

In [None]:
def clean_name(name):
    """Normalise a raw item name into lower-case, space-separated word tokens.

    Every non-word character, digit and underscore is replaced by a space,
    the remainder is lower-cased and tokenised with ``nltk.word_tokenize``,
    and the tokens are joined back with single spaces.

    Parameters
    ----------
    name : str or missing value
        Raw item name. Anything that is not a ``str`` (``None``, a float
        ``NaN`` coming from a missing cell, ...) is treated as an empty name.

    Returns
    -------
    str
        The cleaned name; ``''`` when nothing usable is left.
    """
    # Missing cells arrive as None / float('nan'); re.sub would raise
    # TypeError on them, so map them to the empty name instead.
    if not isinstance(name, str):
        return ''

    name = re.sub(r'[^\w]', ' ', name)
    # After the first pass only word characters and spaces remain, so this
    # pass effectively blanks out digits and underscores.
    name = re.sub(r'[,:._\-\[\]\d]', ' ', name)

    # Nothing left to tokenise; skip the tokenizer entirely.
    if not name.strip():
        return ''

    # The literal token 'nan' is swapped for a placeholder.
    # NOTE(review): presumably so it is not parsed back as a missing value
    # when the vocabulary is round-tripped through CSV; confirm.
    tokens = ('krvnan' if word == 'nan' else word
              for word in nltk.word_tokenize(name.lower()))
    return ' '.join(tokens).strip()

def read_and_clean_data(path='data_fusion_train.parquet', random_state=42):
    """Load the training set, clean the item names and shuffle the rows.

    Steps: read ``category_id`` / ``item_name`` from the parquet file, drop
    rows whose ``category_id`` is ``-1``, keep one row per distinct raw
    ``item_name``, clean every name with ``clean_name`` and drop rows whose
    name is missing or cleaned down to an empty string.

    Parameters
    ----------
    path : str, optional
        Parquet file to read. Defaults to the file the notebook always used.
    random_state : int or None, optional
        Seed for the final shuffle. A fixed default keeps the row order (and
        therefore the cross-validation folds built on it) reproducible across
        kernel restarts; pass ``None`` for a different shuffle on every call.

    Returns
    -------
    pandas.DataFrame
        Shuffled frame with columns ``category_id`` and ``item_name``; the
        original index labels are preserved.
    """
    train = pd.read_parquet(path, columns=['category_id', 'item_name'])
    train = train[train.category_id != -1].drop_duplicates('item_name')

    # A missing name cannot be cleaned; drop it up front.
    train = train.dropna(subset=['item_name'])
    train = train.assign(item_name=train['item_name'].apply(clean_name))

    # Names made only of digits/punctuation clean down to '' - drop those too.
    train = train.replace('', np.nan).dropna(subset=['item_name'])

    return train.sample(frac=1, random_state=random_state)

def get_freq_nwords_on_category(train):
    """Build per-category word frequency statistics from the training frame.

    The result has one row per (category_id, item_name) pair produced below,
    with the columns ``category_id``, ``item_name``,
    ``words_count_in_category``, ``all_corpus_words``, ``freq_in_category``,
    ``word_type_in_category`` and ``freq_in_corpus``.

    NOTE(review): ``txtfeat.get_texts`` is a project helper not visible here.
    The code below presumably relies on it yielding one row per word
    occurrence, indexed by ``category_id``, with the word stored in an
    ``item_name`` column - confirm against the helper.
    """
    df = pd.DataFrame(txtfeat.get_texts(train.set_index('category_id'), 'item_name', unique = False, sort = True, reset_index = False))
    # For every index value (category), count the rows per distinct item_name
    # value, most frequent first.
    df1 = df.groupby(df.index).apply(lambda c: c.groupby('item_name').apply(len).sort_values(ascending = False))
    df1 = df1.reset_index()
    df1.columns = ['category_id', 'item_name', 'words_count_in_category']

    # Total number of rows of `df` per category.
    df2 = pd.DataFrame(df.reset_index().groupby('category_id').apply(len), columns = ['all_corpus_words']).reset_index()

    # No `on=` is given, so the merge joins on the shared column (category_id).
    df1 =  df1.merge(df2)
    # Share of the category's rows taken by this item_name value.
    df1['freq_in_category'] = df1.words_count_in_category/df1.all_corpus_words
    # Per-category boolean flag: True when the mean of freq_in_category is
    # below its standard deviation within that category.
    df_type = df1.groupby('category_id').apply(lambda g : g.freq_in_category.mean() < g.freq_in_category.std()).reset_index()
    df_type.columns = ['category_id', 'word_type_in_category']
    
    df1 =  df1.merge(df_type)
    # freq_in_corpus: mean of freq_in_category over the rows (categories) in
    # which the item_name value occurs; joined back on item_name.
    df1 =  df1.merge(pd.Series(df1.groupby('item_name').apply(lambda g: g.freq_in_category.mean()), name = 'freq_in_corpus').reset_index())

    return df1

def split_data_on_uniq_and_mix(data, vocab_uniq_category_words):
    """Split ``data`` by whether a row's name contains a category-unique word.

    A binary ``CountVectorizer`` is fitted on the ``item_name`` column of
    ``vocab_uniq_category_words``; a row of ``data`` counts as "uniq" when its
    ``item_name`` contains at least one token of that fitted vocabulary.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``item_name`` text column.
    vocab_uniq_category_words : pandas.DataFrame
        Frame whose ``item_name`` column holds the category-unique words.

    Returns
    -------
    (data_mix, data_uniq) : tuple of pandas.DataFrame
        ``data_mix`` holds the rows without any vocabulary word, ``data_uniq``
        the rows with at least one. Together they partition ``data``.
    """
    freq_on_uniq_words = CountVectorizer(binary=True)
    freq_on_uniq_words.fit(vocab_uniq_category_words.item_name)

    # Keep the document-term matrix sparse: only the per-row hit count is
    # needed, so there is no reason to densify it or to look up feature
    # names (get_feature_names() no longer exists in scikit-learn >= 1.2).
    hits = freq_on_uniq_words.transform(data.item_name)
    has_uniq_word = np.asarray(hits.sum(axis=1)).ravel() > 0

    # Positional boolean mask, aligned row-for-row with `data`.
    data_mix = data[~has_uniq_word]
    data_uniq = data[has_uniq_word]

    return data_mix, data_uniq

def get_vocab_uniq_words(train):
    """Return the word statistics rows whose in-category frequency equals
    their corpus-level frequency.

    The statistics come from ``get_freq_nwords_on_category(train)``; the
    selected rows are returned with a fresh 0..n-1 index.
    """
    stats = get_freq_nwords_on_category(train)
    same_frequency = stats.freq_in_category == stats.freq_in_corpus
    return stats[same_frequency].reset_index(drop=True)

def train_model(x, y):
    """Cross-validate and then fit a ``LinearSVC`` on ``x`` / ``y``.

    Prints the three weighted-F1 fold scores from 3-fold cross-validation,
    then fits the classifier on the full data and returns it.
    """
    model = LinearSVC(random_state=42)

    fold_scores = cross_val_score(model, x, y, scoring='f1_weighted', cv=3)
    print(fold_scores)

    model.fit(x, y)
    return model

def _write_submit(artifacts, script_source, submit_dir='booster_submit'):
    """Refresh ``submit_dir`` with pickled artifacts and the entry script.

    Parameters
    ----------
    artifacts : dict
        Maps a file name inside ``submit_dir`` to the object pickled into it.
    script_source : str
        Path of the script copied to ``<submit_dir>/script.py``.
    submit_dir : str, optional
        Target directory; created when it does not exist.

    Raises
    ------
    OSError
        If a file cannot be written or ``script_source`` cannot be copied.
    """
    import os
    import shutil

    os.makedirs(submit_dir, exist_ok=True)

    # Remove leftovers of a previous submission. Paths are always built
    # inside submit_dir, so nothing in the working directory is ever touched
    # (a shell `cd dir; rm ...` deletes from the cwd when `cd` fails).
    stale_names = ('submit.zip', 'freq_mix', 'freq_uniq',
                   'clf_task1_mix', 'clf_task1_uniq', 'script.py')
    for stale_name in stale_names:
        stale_path = os.path.join(submit_dir, stale_name)
        if os.path.isfile(stale_path) or os.path.islink(stale_path):
            os.remove(stale_path)

    # `with` guarantees each pickle is flushed and closed before returning.
    for file_name, obj in artifacts.items():
        with open(os.path.join(submit_dir, file_name), 'wb') as out:
            pickle.dump(obj, out)

    shutil.copy(script_source, os.path.join(submit_dir, 'script.py'))


def write_submit_cl_map(freq_mix, clf_task1_mix):
    """Write the "classifier + map" submission into ``booster_submit``.

    Pickles ``freq_mix`` and ``clf_task1_mix`` under those file names and
    copies ``script_cl_map.py`` from the working directory to
    ``booster_submit/script.py``. Files of any previous submission are
    removed first.
    """
    _write_submit(
        {'freq_mix': freq_mix, 'clf_task1_mix': clf_task1_mix},
        'script_cl_map.py',
    )


def write_submit_cl_cl(freq_mix, freq_uniq, clf_task1_mix, clf_task1_uniq):
    """Write the "classifier + classifier" submission into ``booster_submit``.

    Pickles both vectorizers and both classifiers under the file names
    ``freq_mix``, ``freq_uniq``, ``clf_task1_mix`` and ``clf_task1_uniq`` and
    copies ``script_cl_cl.py`` from the working directory to
    ``booster_submit/script.py``. Files of any previous submission are
    removed first.
    """
    _write_submit(
        {
            'freq_mix': freq_mix,
            'freq_uniq': freq_uniq,
            'clf_task1_mix': clf_task1_mix,
            'clf_task1_uniq': clf_task1_uniq,
        },
        'script_cl_cl.py',
    )


In [None]:
# Load the cleaned, shuffled training frame (category_id, item_name).
train = read_and_clean_data()
# Word statistics rows whose in-category frequency equals their corpus frequency.
vocab_uniq_category_words = get_vocab_uniq_words(train)
# train_uniq: rows whose name contains at least one of those words; train_mix: the rest.
train_mix, train_uniq = split_data_on_uniq_and_mix(train, vocab_uniq_category_words)

In [None]:
# Export the word -> category_id mapping of the category-unique vocabulary.
(
    vocab_uniq_category_words
    .set_index('item_name')[['category_id']]
    .to_csv('uniq_words.csv')
)

In [None]:
# True  -> ship only the "mix" vectorizer/classifier (write_submit_cl_map).
# False -> additionally train and ship a vectorizer/classifier for the
#          "uniq" rows (write_submit_cl_cl).
submit_as_cl_map = True

# NOTE(review): `stop` is not used in the cells visible here - confirm
# whether a later cell needs it before removing.
stop = stopwords.words('russian')

# Binary bag-of-words model for the rows without category-unique words.
freq_mix = CountVectorizer(binary=True)
X_train_mix = freq_mix.fit_transform(train_mix.item_name)
clf_mix = train_model(X_train_mix, train_mix.category_id)

if not submit_as_cl_map:
    # Second model, trained only on the rows that contain a unique word.
    freq_uniq = CountVectorizer(binary=True)
    X_train_uniq = freq_uniq.fit_transform(train_uniq.item_name)
    clf_uniq = train_model(X_train_uniq, train_uniq.category_id)
    write_submit_cl_cl(freq_mix, freq_uniq, clf_mix, clf_uniq)
else:
    write_submit_cl_map(freq_mix, clf_mix)
    