In [1]:
import numpy as np
from sklearn.model_selection import cross_val_score
from nltk.corpus import stopwords 
import re
import pandas as pd
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import nltk
import string

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [2]:
def clean_name(name):
    """Normalise a raw item name before it is vectorised.

    Every non-word character, every decimal digit and every underscore is
    replaced one-for-one by a space (so runs of separators stay as runs of
    spaces); the result is lower-cased and stripped of surrounding
    whitespace. Word characters other than decimal digits and the underscore
    are kept.

    Parameters
    ----------
    name : str
        Raw item name. Any non-string value (e.g. a missing value) is
        treated as an empty name instead of raising ``TypeError``.

    Returns
    -------
    str
        The cleaned name; ``''`` when nothing but separators/digits was left
        or when ``name`` was not a string.
    """
    if not isinstance(name, str):
        return ''
    # Single pass: `\W` is everything outside `\w`, while `\d` and `_` are the
    # only word characters that were also blanked out by the former second
    # substitution (its punctuation entries could never match after the
    # first pass had already removed them).
    name = re.sub(r'[\W\d_]', ' ', name)
    return name.lower().strip()
    
def read_and_clean_data(path='data_fusion_train.parquet', random_state=0):
    """Load the training items, normalise their names and shuffle the rows.

    Steps, in order:
      1. read the parquet file at ``path``;
      2. drop rows whose ``category_id`` equals -1 and rows that repeat an
         earlier raw ``item_name``;
      3. pass every ``item_name`` through ``clean_name``;
      4. turn empty strings (in any column) into NaN and drop the rows whose
         ``item_name`` is missing;
      5. shuffle the remaining rows.

    Parameters
    ----------
    path : str, optional
        Location of the training parquet file.
    random_state : int or None, optional
        Seed for the final shuffle. A fixed default keeps the row order (and
        therefore the unshuffled cross-validation folds built on top of it)
        identical between notebook runs; pass ``None`` for a fresh random
        order on every call.

    Returns
    -------
    pandas.DataFrame
        The cleaned, shuffled frame with its original index labels.
    """
    raw = pd.read_parquet(path)

    # NOTE(review): -1 presumably marks unlabelled rows -- confirm against the
    # dataset description.
    labelled = raw[raw.category_id != -1].drop_duplicates('item_name')

    # NOTE(review): duplicates are removed on the *raw* name only, so two
    # different raw names that clean to the same string both survive.
    cleaned = (
        labelled
        .assign(item_name=labelled['item_name'].apply(clean_name))
        .replace('', np.nan)
        .dropna(subset=['item_name'])
    )

    return cleaned.sample(frac=1, random_state=random_state)

def _fit_bow_svc(train, target, stop_words=None):
    """Fit a binary bag-of-words vectoriser plus LinearSVC on one target column.

    Prints the 3-fold weighted-F1 cross-validation scores, then refits the
    classifier on all rows.

    Parameters
    ----------
    train : pandas.DataFrame
        Must provide an ``item_name`` column and the ``target`` column.
    target : str
        Name of the label column.
    stop_words : list of str or None, optional
        Tokens ignored by the vectoriser. ``None`` selects NLTK's Russian
        stop-word list.

    Returns
    -------
    (LinearSVC, CountVectorizer)
        The fitted classifier and the fitted vectoriser.
    """
    if stop_words is None:
        stop_words = stopwords.words('russian')

    freq = CountVectorizer(stop_words=stop_words, binary=True)
    X_train = freq.fit_transform(train.item_name)
    y_train = train[target]

    clf = LinearSVC()
    # NOTE: the vectoriser is fitted on all rows before cross-validation, so
    # each fold's vocabulary also contains tokens from its held-out part.
    print(cross_val_score(clf, X_train, y_train, cv=3, scoring='f1_weighted'))
    clf.fit(X_train, y_train)
    return clf, freq


def train_model_bi(train, stop_words=None):
    """Train the model that predicts the boolean ``class`` column.

    See ``_fit_bow_svc`` for the parameters and the returned
    ``(classifier, vectoriser)`` pair.
    """
    return _fit_bow_svc(train, 'class', stop_words)


def train_model(train, stop_words=None):
    """Train the model that predicts the ``category_id`` column.

    See ``_fit_bow_svc`` for the parameters and the returned
    ``(classifier, vectoriser)`` pair.
    """
    return _fit_bow_svc(train, 'category_id', stop_words)

In [3]:
# Categories that form the positive class of the binary model.
# Left disabled in the original list: 0, 1, 3, 88, 86, 111
bad_cats = [71, 204, 79, 114]

train = read_and_clean_data()
# Boolean target: True when the row's category is one of `bad_cats`.
train['class'] = train.category_id.isin(bad_cats)

try:
    stop = stopwords.words('russian')
except LookupError:
    # The stop-word corpus is not installed on a fresh environment; fetch it
    # once so the notebook survives Restart & Run All.
    nltk.download('stopwords')
    stop = stopwords.words('russian')

clf_bi, freq_bi = train_model_bi(train)

[0.92830172 0.92826276 0.92556586]


In [4]:
# Split the rows on the boolean `class` flag and train one category model
# per partition (the "false" partition first, so the printed score lines keep
# their order).
in_bad_cats = train['class']
train_true = train[in_bad_cats]
train_false = train[~in_bad_cats]

clf_false, freq_false = train_model(train_false)
clf_true, freq_true = train_model(train_true)

[0.82920573 0.83155629 0.82854325]
[0.88360154 0.88385569 0.88157547]


In [5]:
pickle.dump(freq_bi, open('booster_submit/freq_bi', 'wb'))
pickle.dump(freq_false, open('booster_submit/freq_false', 'wb'))
pickle.dump(freq_true, open('booster_submit/freq_true', 'wb'))

pickle.dump(clf_bi, open('booster_submit/clf_task1_bi', 'wb'))
pickle.dump(clf_false, open('booster_submit/clf_task1_false', 'wb'))
pickle.dump(clf_true, open('booster_submit/clf_task1_true', 'wb'))

pickle.dump(freq_bi, open('freq_bi', 'wb'))
pickle.dump(freq_false, open('freq_false', 'wb'))
pickle.dump(freq_true, open('freq_true', 'wb'))

pickle.dump(clf_bi, open('clf_task1_bi', 'wb'))
pickle.dump(clf_false, open('clf_task1_false', 'wb'))
pickle.dump(clf_true, open('clf_task1_true', 'wb'))


!cd booster_submit;zip submit.zip *  


updating: clf_task1_bi (deflated 41%)
updating: clf_task1_false (deflated 74%)
updating: freq_bi (deflated 51%)
updating: freq_false (deflated 51%)
updating: script.py (deflated 60%)
  adding: clf_task1_true (deflated 41%)
  adding: freq_true (deflated 54%)
