In [1]:
import numpy as np
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords 
import re
import pandas as pd
from sklearn.metrics import f1_score
from sklearn.svm import LinearSVC
from sklearn.decomposition import PCA

from sklearn.feature_extraction.text import CountVectorizer
import pickle
import nltk
import string

In [2]:
def clean_name(name):
    """Normalise a raw item name into a string of stemmed tokens.

    Non-word characters are replaced by spaces first, then the listed
    punctuation, underscores and digits. The result is lower-cased, stripped
    and passed through `tokenize`.

    Returns:
        The tokens concatenated with a single space *before* each one, so a
        non-empty result starts with a space and a name that yields no
        tokens returns ''.
    """
    letters_only = re.sub(r'[^\w]', ' ', name)
    letters_only = re.sub(r'[,:._\-\[\]\d]', ' ', letters_only)

    stemmed_tokens = tokenize(letters_only.lower().strip())
    return ''.join(' ' + token for token in stemmed_tokens)

def tokenize_text(text):
    """Return the lower-cased word tokens of `text` that have 3+ characters.

    The text is split into sentences with `nltk.sent_tokenize` and every
    sentence into words with `nltk.word_tokenize`; shorter tokens are dropped.
    """
    return [
        word.lower()
        for sentence in nltk.sent_tokenize(text)
        for word in nltk.word_tokenize(sentence)
        if len(word) >= 3
    ]

def tokenize(text):
    """Lazily yield the Snowball-stemmed word tokens of `text`.

    The text is lower-cased and split with `nltk.word_tokenize`; tokens that
    occur inside `string.punctuation` are skipped.

    NOTE(review): the stemmer is built for English while the notebook's
    stop-word list is Russian -- confirm this is intended.
    """
    stemmer = nltk.stem.SnowballStemmer('english')

    for word in nltk.word_tokenize(text.lower()):
        # `in` on a str is a substring test against the punctuation string.
        if word not in string.punctuation:
            yield stemmer.stem(word)

def find_outlets(clf, X_train, data=None):
    """Count the rows the classifier mislabels, per true category.

    Args:
        clf: fitted estimator exposing `predict`.
        X_train: feature matrix whose rows line up with the rows of `data`.
        data: DataFrame with a `category_id` column. When omitted, the
            notebook-level `train` frame is used (the original behaviour),
            so existing two-argument calls keep working.

    Returns:
        A tuple `(errors_per_category, labelled)`:
        `errors_per_category` is a Series indexed by `category_id` holding
        the number of misclassified rows, sorted ascending; `labelled` is a
        copy of `data` with an added `pred` column.
    """
    if data is None:
        # Backward-compatible fallback to the notebook-level training frame.
        data = train

    labelled = data.copy()
    labelled['pred'] = clf.predict(X_train)

    misclassified = labelled[labelled.category_id != labelled.pred]
    # size() always yields a Series, even when nothing is misclassified;
    # apply(len) may yield an empty DataFrame there, on which a bare
    # sort_values() (no `by`) fails.
    errors_per_category = misclassified.groupby('category_id').size().sort_values()
    return errors_per_category, labelled

def write_submit(freq, clf):
    pickle.dump(freq, open('submit_names/tfidf', 'wb'))
    pickle.dump(clf, open('submit_names/clf_task1', 'wb'))
    !cd submit_names;zip submit_names.zip *  
    
def read_and_clean_data():
    """Load the labelled rows of `data_fusion_train.parquet`, cleaned and shuffled.

    Keeps only `item_name` and `category_id`, drops rows whose `category_id`
    is -1 and duplicate raw names, runs `clean_name` over the names, and
    discards rows whose cleaned name is empty.

    Returns:
        The resulting DataFrame in random row order (`sample(frac=1)`).
    """
    labelled = (
        pd.read_parquet('data_fusion_train.parquet')[['item_name', 'category_id']]
        .loc[lambda frame: frame.category_id != -1]
        .drop_duplicates('item_name')
        .assign(item_name=lambda frame: frame['item_name'].apply(clean_name))
        # Empty cleaned names become NaN so dropna can remove them.
        .replace('', np.nan)
        .dropna(subset=['item_name'])
    )
    return labelled.sample(frac=1)

def get_random_batch_unlabeled_data(part = 0.1):
    """Return a random fraction of the unlabelled rows, with cleaned names.

    Reads `data_fusion_train.parquet`, keeps rows whose `category_id` is -1,
    drops duplicate raw names, runs `clean_name` over `item_name` and removes
    rows whose cleaned name is empty.

    Args:
        part: fraction of the cleaned unlabelled rows to return.

    Returns:
        The first `int(part * n)` rows of the shuffled frame.
    """
    unlbl = pd.read_parquet('data_fusion_train.parquet')
    unlbl = unlbl[unlbl.category_id == -1].drop_duplicates('item_name')
    unlbl['item_name'] = unlbl['item_name'].apply(clean_name)

    # Empty cleaned names become NaN so dropna can remove them.
    unlbl = unlbl.replace('', np.nan).dropna(subset=['item_name'])

    # sample() returns a new frame; the result must be kept, otherwise the
    # slice below always takes the same leading rows instead of a random batch.
    unlbl = unlbl.sample(frac=1)

    return unlbl.iloc[:int(part * len(unlbl)), :]
    
def get_unlabeled_batch_from_file(max_size = 20000):
    """Read `pl_01part.csv` and return at most its first `max_size` rows.

    The first CSV column is used as the index.
    """
    pseudo_label_pool = pd.read_csv('pl_01part.csv', index_col=0)
    return pseudo_label_pool.iloc[:max_size]
    
def train_model(train, vectorizer=None):
    """Fit a vectorizer and a `LinearSVC` on the given training frame.

    Args:
        train: DataFrame with `item_name` (text) and `category_id` (label).
        vectorizer: object exposing `fit_transform`. When omitted, the
            notebook-level `freq` is used (the original behaviour), so
            existing one-argument calls keep working.

    Returns:
        `(clf, X_train)`: the fitted classifier and the feature matrix
        produced by `fit_transform`. The vectorizer is fitted as a side
        effect.
    """
    if vectorizer is None:
        # Backward-compatible fallback to the notebook-level vectorizer.
        vectorizer = freq

    X_train = vectorizer.fit_transform(train.item_name)
    clf = LinearSVC().fit(X_train, train.category_id)
    return clf, X_train

In [3]:
# Labelled data: 70% used for fitting, 30% held out, stratified by category.
train = read_and_clean_data()
train, test = train_test_split(
    train,
    test_size=0.3,
    stratify=train['category_id'],
)

# Settings and objects reused by the later cells.
drop_outlets = True
stop = stopwords.words('russian')
freq = CountVectorizer(stop_words=stop, binary=True)

# Baseline: fit on labelled data only, score on the held-out split.
clf, X_train = train_model(train)
pred = clf.predict(freq.transform(test['item_name']))

score = f1_score(test['category_id'], pred, average='weighted')
print(
    'score on test without pseudo-labeling: {}, train shape {}'
    .format(score, train.shape)
)

score on test without pseudo-labeling: 0.8122468641088886, train shape (33741, 2)


In [4]:
print('Preparing the new train data with pleudo-labeling')

# Label the unlabelled batch with the current classifier's predictions.
unlbl = get_unlabeled_batch_from_file(80000)
pred = clf.predict(freq.transform(unlbl.item_name))
unlbl['category_id'] = pred

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the pseudo-labelled rows under the labelled ones.
# NOTE: this cell rebinds `train`, so re-running it stacks the batch again.
train = pd.concat([train, unlbl]).reset_index(drop = True)
print('new tarin data with pleudo-labeling, train shape {}'.format(train.shape))

Preparing the new train data with pleudo-labeling
new tarin data with pleudo-labeling, train shape (113741, 2)


In [5]:
# Fresh vectorizer so the vocabulary is learned from the enlarged train set.
freq = CountVectorizer(stop_words=stop, binary=True)
clf, X_train = train_model(train)

# Score on the same held-out split as the baseline.
pred = clf.predict(freq.transform(test['item_name']))
score = f1_score(test['category_id'], pred, average='weighted')
print(
    'score on test with pseudo-labeling: {}, train shape {}'
    .format(score, train.shape)
)

score on test with pseudo-labeling: 0.8085560239176613, train shape (113741, 2)


In [None]:
# Optional refinement: drop the training rows the current model mislabels,
# then refit on the remaining rows.
if drop_outlets:
    outlets, train_no_outlets = find_outlets(clf, X_train)
    train_no_outlets = train_no_outlets[train_no_outlets.category_id == train_no_outlets.pred]
    # sample() returns a new frame; keep the result so the shuffle takes effect.
    train_no_outlets = train_no_outlets.sample(frac=1)
    clf, X_train = train_model(train_no_outlets)

# train_model refits `freq`, so the vectorizer saved here matches `clf`.
write_submit(freq, clf)