In [35]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

In [4]:
from pymystem3 import Mystem
import re


def process(s):
    mystem = Mystem()
    pattern = r'[^0-9a-zа-яё\s]'
    s = s.lower()
    s = re.sub(pattern, '', s)
    s = re.sub(r'[\s\t]+', ' ', s)
    lemmas = mystem.lemmatize(s)
    return [i for i in lemmas[:-1] if i is not ' ']

In [5]:
from multiprocessing import Pool, Lock, Value
import time


mutex = Lock()
n_processed = Value('i', 0)

with open('project/docs_titles.tsv') as f:
    next(f)
    def multiprocessing_func(line):
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            title = process(data[1])
        with mutex:
            global n_processed
            n_processed.value += 1
            if n_processed.value % 100 == 0:
                print(f"\r{n_processed.value} objects are processed...", end='', flush=True)
        return (doc_id, title)
    
    with Pool() as pool:
        starttime = time.time()
        docs = pool.map(multiprocessing_func, [line for line in f])
        print('\nThat took {} seconds.'.format(round((time.time() - starttime), 2)))

28000 objects are processed...
That took 9214.13 seconds.


In [6]:
doc_to_title = {}
starttime = time.time()
for doc in docs:
    doc_to_title[doc[0]] = doc[1]
print('That took {} seconds.'.format(round((time.time() - starttime), 2)))

That took 0.01 seconds.


In [19]:
with open('refinedDocTitles.tsv', 'w') as f:
    for i in doc_to_title:
        s = '{}\t{}\n'.format(i, doc_to_title[i])
        f.write(s)

<h1 style="text-align:center">Preprocessing</h1

In [56]:
import pandas as pd
train_data = pd.read_csv('/home/oleg/sphere/dm/hw/project/train_groups.csv')
traingroups_titledata = {}
for i in range(len(train_data)):
    new_doc = train_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    target = new_doc['target']
    title = doc_to_title[doc_id]
    if doc_group not in traingroups_titledata:
        traingroups_titledata[doc_group] = []
    traingroups_titledata[doc_group].append((doc_id, title, target))

In [57]:
import numpy as np
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]
    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        all_dist = []
        words = set(title)
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j, target_j = docs[j]
            words_j = set(title_j)
            all_dist.append(len(words.intersection(words_j)))
        X_train.append(sorted(all_dist, reverse=True)[0:15]    )
X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
X_train = StandardScaler().fit_transform(X_train)
print (X_train.shape, y_train.shape, groups_train.shape)

(11690, 15) (11690,) (11690,)


In [58]:
from sklearn.base import BaseEstimator, ClassifierMixin

def batch_generator(X, y, shuffle=True, batch_size=1):
    X_batch, y_batch = X, y
    if shuffle is True:
        from sklearn.utils import shuffle
        X_batch, y_batch = shuffle(X, y)
    for i in range(0, X.shape[0], batch_size):
        yield X_batch[i:i + batch_size], y_batch[i:i + batch_size]

def sigmoid(x):
    sigm_value_x = 1 / (1 + np.exp(-x))
    return sigm_value_x


class MySGDClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, batch_generator, C=1, alpha=0.01,
                 max_epoch=10, model_type='lin_reg', threshold=0.5):
        self.C = C
        self.alpha = alpha
        self.max_epoch = max_epoch
        self.batch_generator = batch_generator
        self.errors_log = {'iter' : [], 'loss' : []}  
        self.model_type = model_type
        self.threshold = threshold
        
    def calc_loss(self, X_batch, y_batch):
        N = X_batch.shape[0]
        new_weights = self.weights.copy()
        new_weights[0, 0] = 0
        reg = 1 / self.C * new_weights.T.dot(new_weights)
        if self.model_type == "lin_reg":
            a = X_batch.dot(self.weights)
            loss = 1 / N * (y_batch - a).T.dot(y_batch - a) + reg
        elif self.model_type == "log_reg":
            a = sigmoid(X_batch.dot(self.weights))
            loss = -1 / N * (y_batch.T.dot(np.log(a)) + 
                             (1 - y_batch).T.dot(np.log(1 - a))) + reg
        return loss
    
    def calc_loss_grad(self, X_batch, y_batch):
        N = X_batch.shape[0]
        new_weights = self.weights.copy()
        new_weights[0, 0] = 0
        reg = 2 / self.C * new_weights
        if self.model_type == "lin_reg":
            a = X_batch.dot(self.weights)
            loss_grad = 2 / N * (X_batch.T).dot(a - y_batch) + reg
        elif self.model_type == "log_reg":
            a = sigmoid(X_batch.dot(self.weights))
            loss_grad = 1 / N * X_batch.T.dot(a - y_batch) + reg
        return loss_grad
    
    def update_weights(self, new_grad):
        self.weights = self.weights - self.alpha * new_grad
        
    def fit(self, X, y):
        X = X.reshape(X.shape[0], -1)
        y = y.reshape(y.shape[0], -1)
        X_ones = np.ones((X.shape[0], 1))
        X = np.hstack((X_ones, X))
        self.weights = np.random.rand(X.shape[1]).reshape(X.shape[1], -1)
        for n in range(0, self.max_epoch):
            new_epoch_generator = self.batch_generator(X, y)
            for batch_num, new_batch in enumerate(new_epoch_generator):
                X_batch = new_batch[0]
                y_batch = new_batch[1]                
                batch_grad = self.calc_loss_grad(X_batch, y_batch)
                self.update_weights(batch_grad)
                batch_loss = self.calc_loss(X_batch, y_batch)
                self.errors_log['iter'].append(batch_num)
                self.errors_log['loss'].append(batch_loss)
        return self
        
    def predict(self, X):
        X_ones = np.ones((X.shape[0], 1))
        X = np.hstack((X_ones, X))
        if self.model_type=="lin_reg":
            y_hat = X.dot(self.weights) > self.threshold
        elif self.model_type=="log_reg":
            y_hat = sigmoid(X.dot(self.weights)) > 0.5
        return y_hat

    def score(self, X, y):
        pred = self.predict(X)
        return np.mean(pred == y)

In [59]:
from sklearn.model_selection import GridSearchCV
grid_param = {
    'threshold': np.arange(0.1, 1, 0.1),
    'C': np.arange(10, 20),
    'alpha': [0.01],
    'max_epoch': np.arange(1, 6),
    'model_type': ['lin_reg'],
}
myclf = MySGDClassifier(batch_generator)
gd_sr = GridSearchCV(estimator=myclf,
                     param_grid=grid_param,
                     scoring='f1',
                     cv=5,
                     n_jobs=-1)


starttime = time.time()
gd_sr.fit(X_train, y_train)
print('That took {} seconds.'.format(round((time.time() - starttime), 2)))

best_parameters = gd_sr.best_params_
best_score = gd_sr.best_score_
print(best_parameters, best_score)
best_parameters['batch_generator'] = batch_generator

That took 1305.01 seconds.
{'C': 14, 'alpha': 0.01, 'max_epoch': 5, 'model_type': 'lin_reg', 'threshold': 0.30000000000000004} 0.6772595884306559


In [60]:
test_data = pd.read_csv('/home/oleg/sphere/dm/hw/project/test_groups.csv')
testgroups_titledata = {}
for i in range(len(test_data)):
    new_doc = test_data.iloc[i]
    doc_group = new_doc['group_id']
    doc_id = new_doc['doc_id']
    title = doc_to_title[doc_id]
    if doc_group not in testgroups_titledata:
        testgroups_titledata[doc_group] = []
    testgroups_titledata[doc_group].append((doc_id, title))

X_test = []
groups_test = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]
    for k, (doc_id, title) in enumerate(docs):

        groups_test.append(new_group)
        all_dist = []
        words = set(title)
        for j in range(0, len(docs)):
            if k == j:
                continue
            doc_id_j, title_j = docs[j]
            words_j = set(title_j)
            all_dist.append(len(words.intersection(words_j)))
        X_test.append(sorted(all_dist, reverse=True)[0:15])

X_test = np.array(X_test)
groups_test = np.array(groups_test)
X_test = StandardScaler().fit_transform(X_test)
print(X_test.shape, groups_test.shape)

(16627, 15) (16627,)


In [62]:
myclf = MySGDClassifier(**best_parameters)
myclf.fit(X_train, y_train)
y_pred = myclf.predict(X_test).astype(int).flatten()
submission = pd.DataFrame({'pair_id': test_data['pair_id'], 'target': y_pred})
submission.to_csv("submission4.csv", index=False)

In [61]:
best_parameters

{'C': 14,
 'alpha': 0.01,
 'max_epoch': 5,
 'model_type': 'lin_reg',
 'threshold': 0.30000000000000004,
 'batch_generator': <function __main__.batch_generator(X, y, shuffle=True, batch_size=1)>}