In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

In [2]:
%load_ext pycodestyle_magic

In [3]:
#%%pycodestyle
def batch_generator(X, y, shuffle=True, batch_size=1):
    """
    Гератор новых батчей для обучения
    X          - матрица объекты-признаки
    y_batch    - вектор ответов
    shuffle    - нужно ли случайно перемешивать выборку
    batch_size - размер батча ( 1 это SGD, > 1 mini-batch GD)
    Генерирует подвыборку для итерации спуска (X_batch, y_batch)
    """
    size = X.shape[0]
    if shuffle:
        pos = np.random.permutation(np.arange(size))
    else:
        pos = np.arange(size)

    for i in range(0, size - batch_size + 1, batch_size):
        X_batch = X[pos[i: i + batch_size]]
        y_batch = y[pos[i: i + batch_size]]
        yield (X_batch, y_batch)

# Теперь можно сделать генератор по данным ()
#  my_batch_generator = batch_generator(X, y, shuffle=True, batch_size=1):

In [4]:
#%%pycodestyle
from sklearn.base import BaseEstimator, ClassifierMixin


def sigmoid(x):
    """
    Вычисляем значение сигмоида.
    X - выход линейной модели
    """
    sigm_value_x = 1 / (1 + np.exp(-x))
    return sigm_value_x


class MySGDClassifier(BaseEstimator, ClassifierMixin):

    def __init__(self, batch_generator, batch_size=1,
                 C=1, alpha=0.01,
                 max_epoch=10, model_type='lin_reg'):
        """
        batch_generator -- функция генератор, которой будем создавать батчи
        C - коэф. регуляризации
        alpha - скорость спуска
        max_epoch - максимальное количество эпох
        model_type - тим модели, lin_reg или log_reg
        """

        self.C = C
        self.alpha = alpha
        self.max_epoch = max_epoch
        self.batch_generator = batch_generator
        self.errors_log = {'iter': [], 'loss': []}
        self.model_type = model_type
        self.batch_size = batch_size

    def calc_loss(self, X_batch, y_batch):
        """
        Считаем функцию потерь по батчу
        X_batch - матрица объекты-признаки по батчу
        y_batch - вектор ответов по батчу
        Не забудте тип модели (линейная или логистическая регрессия)!
        """
        if self.model_type == "lin_reg":
            dot = np.dot(X_batch, self.weights)
            loss = np.sum((dot - y_batch) ** 2) / X_batch.shape[0]
        elif self.model_type == "log_reg":
            dot = sigmoid(np.dot(X_batch, self.weights))
            temp1 = y_batch * np.log(dot) + (1 - y_batch) * np.log(1 - dot)
            loss = -np.sum(temp1) / X_batch.shape[0]
        loss += (np.sum(self.weights ** 2) / self.C)
        return loss

    def calc_loss_grad(self, X_batch, y_batch):
        """
        Считаем градиент функции потерь по батчу (то что Вы вывели в задании 1)
        X_batch - матрица объекты-признаки по батчу
        y_batch - вектор ответов по батчу
        Не забудте тип модели (линейная или логистическая регрессия)!
        """
        if self.model_type == "lin_reg":
            temp1 = np.dot(X_batch, self.weights)
            loss_grad = 2 * (np.dot(temp1 - y_batch,
                                    X_batch)) / X_batch.shape[0]
        elif self.model_type == "log_reg":
            temp1 = sigmoid(np.dot(X_batch, self.weights))
            loss_grad = -np.dot(y_batch - temp1, X_batch) / X_batch.shape[0]
        loss_grad += (2 * self.weights / self.C)
        return loss_grad

    def update_weights(self, new_grad):
        """
        Обновляем вектор весов
        new_grad - градиент по батчу
        """
        self.weights -= self.alpha * new_grad

    def fit(self, X, y):
        '''
        Обучение модели
        X - матрица объекты-признаки
        y - вектор ответов
        '''
        X_upd = np.insert(X, 0, 1, axis=1)
        self.weights = np.random.randn(X_upd.shape[1])
        
        for n in range(0, self.max_epoch):
            new_epoch_generator = self.batch_generator(
                X_upd, y, batch_size=self.batch_size)
            
            for batch_num, new_batch in enumerate(new_epoch_generator):
                X_batch = new_batch[0]
                y_batch = new_batch[1]
                batch_grad = self.calc_loss_grad(X_batch, y_batch)
                self.update_weights(batch_grad)

                batch_loss = self.calc_loss(X_batch, y_batch)
                self.errors_log['iter'].append(batch_num)
                self.errors_log['loss'].append(batch_loss)
        return self

    def predict(self, X):
        '''
        Предсказание класса
        X - матрица объекты-признаки
        Не забудте тип модели (линейная или логистическая регрессия)!
        '''
        X_upd = np.insert(X, 0, 1, axis=1)
        if self.model_type == "lin_reg":
            y_hat = np.dot(X_upd, self.weights)
        elif self.model_type == "log_reg":
            y_hat = sigmoid(np.dot(X_upd, self.weights))
        # Желательно здесь использовать матричные операции
        # между X и весами, например, numpy.dot
        return y_hat

# Боевое применение (3  балла)

In [154]:
import nltk
from nltk.corpus import brown
import pymorphy2

#nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('russian')
stemmer = nltk.stem.SnowballStemmer('russian')
morph = pymorphy2.MorphAnalyzer()
stop_words.append('')

In [155]:
def title_preprocessing(title, stop_words, preprocessing_type='stemming'):
    title = stop_words_remove(title.strip().split(), stop_words, preprocessing_type)
    return ' '.join(title)

def stop_words_remove(title_words, stop_words, preprocessing_type='stemming'):
    #new_title = str()
    new_title = []
    for i in title_words:
        i = i.strip(' !?@#$^&*"\'()_«»<>-+={}[]/\.,:;') #%
        if i in stop_words:
            pass
        else:
            if preprocessing_type == 'stemming':
                i = title_stemming(i)
            else:
                i = title_lemming(i)
            if len(i) == 1:   # 11879 - problem
                pass
            else:
                new_title.append(i)
    return new_title

def title_stemming(title_words):
    return stemmer.stem(title_words)

def title_lemming(title):
    return morph.parse(title)[0].normal_form

text = "страница..."
text = text.strip('.')
text = stemmer.stem(text)
text

In [156]:
# preprosecced_article = pd.read_csv('preprosecced_article.csv')

doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line == 0:
            continue
        data = line.strip().split('\t', 1)
        doc_id = int(data[0])
        if len(data) == 1:
            title = ''
        else:
            #title = data[1]
            title = title_preprocessing(data[1], stop_words, preprocessing_type='lemming')
#             title = preprosecced_article.loc[preprosecced_article['doc_id'] == doc_id, 'text'].values[0]
        doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


In [5]:
def extract_info_from_csv(file, is_target):
    train_data = pd.read_csv(file)
    traingroups_titledata = {}
    for i in range(len(train_data)):
        new_doc = train_data.iloc[i]
        doc_group = new_doc['group_id']
        doc_id = new_doc['doc_id']
        if is_target:
            target = new_doc['target']
        title = doc_to_title[doc_id]
        if doc_group not in traingroups_titledata:
            traingroups_titledata[doc_group] = []
        if is_target:
            traingroups_titledata[doc_group].append((doc_id, title, target))
        else:
            traingroups_titledata[doc_group].append((doc_id, title))
    return train_data, traingroups_titledata


#traingroups_titledata = extract_info_from_csv('train_groups.csv', False)

def create_x_y_groups_train(traingroups_titledata, is_target):
    if is_target:
        y_train = []
    X_train = []
    groups_train = []
    for new_group in traingroups_titledata:
        if is_target:
            docs = traingroups_titledata[new_group]
        else:
            docs = list(map(lambda x: x + (0,), traingroups_titledata[new_group]))
        for k, (doc_id, title, target_id) in enumerate(docs):
            if is_target:
                y_train.append(target_id)
            groups_train.append(new_group)
            all_dist = []
            words = set(title.strip().split())

            for j in range(0, len(docs)):
                if k == j:
                    continue
                
                doc_id_j, title_j, target_j = docs[j]
                words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            X_train.append(sorted(all_dist, reverse=True)[0:15])
    
    X_train = np.array(X_train)
    groups_train = np.array(groups_train)
    if is_target:
        y_train = np.array(y_train)
        print (X_train.shape, y_train.shape, groups_train.shape)
        return X_train, y_train, groups_train
    else:
        print (X_train.shape, groups_train.shape)
        return X_train, [], groups_train

#X_train, y_train, groups_train = create_x_y_groups_train(traingroups_titledata, False)

In [159]:
ngram = 3

def get_group_articles(data):
    rez = []
    for i in data:
        rez.append(i)

#     print(len(rez))

    if len(rez) == 0:
        rez.append('_'*ngram)
    if len(rez) == 1:
        if (len(rez[0]) < ngram):
#             print(rez[0], sep='  ')
            rez[0] += '_'*(ngram-len(rez[0]))
    return rez


def create_x_y_groups_train(traingroups_titledata, is_target):
    if is_target:
        y_train = []
    X_train = []
    groups_train = []
    for new_group in traingroups_titledata:
        if is_target:
            docs = traingroups_titledata[new_group]
        else:
            docs = list(map(lambda x: x + (0,), traingroups_titledata[new_group]))
        for k, (doc_id, title, target_id) in enumerate(docs):
            if is_target:
                y_train.append(target_id)
            groups_train.append(new_group)
            all_dist = []
            
            vectorizer = CountVectorizer(analyzer='char', ngram_range=(ngram, ngram), stop_words=None)
            try:
                X1 = vectorizer.fit_transform((title.strip().split()))
                words = set(vectorizer.get_feature_names())
            except:
                words = set(title.strip().split())
            
#             words = set(title.strip().split())

            for j in range(0, len(docs)):
                if k == j:
                    continue
                
                doc_id_j, title_j, target_j = docs[j]
                
                vectorizer = CountVectorizer(analyzer='char', ngram_range=(3,3), stop_words=None)
                try:
                    X2 = vectorizer.fit_transform((title.strip().split()))
                    words_j = set(vectorizer.get_feature_names())
                except:
                    words_j = set(title.strip().split())
                
#                 words_j = set(title_j.strip().split())
                all_dist.append(len(words.intersection(words_j)))
            X_train.append(sorted(all_dist, reverse=True)[0:15])
    
    X_train = np.array(X_train)
    groups_train = np.array(groups_train)
    if is_target:
        y_train = np.array(y_train)
        print (X_train.shape, y_train.shape, groups_train.shape)
        return X_train, y_train, groups_train
    else:
        print (X_train.shape, groups_train.shape)
        return X_train, [], groups_train

#X_train, y_train, groups_train = create_x_y_groups_train(traingroups_titledata, False)

In [160]:
train_data, traingroups_titledata = extract_info_from_csv('train_groups.csv', True)

In [None]:
X_train, y_train, groups_train = create_x_y_groups_train(traingroups_titledata, True)

In [None]:
X_train

In [None]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import time

traingroups_titledata

In [None]:
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

In [None]:
#%%pycodestyle
def validation(X_train, y_train, groups_train, N, nn):
    validation_size = int(len(set(groups_train)) / N)

    val_arr = np.arange(len(set(groups_train)))+1

    validation_group_numbers = val_arr[nn * validation_size:
                                       (nn + 1) * validation_size]

    X_test_validation = []
    y_test_validation = []
    X_train_validation = []
    y_train_validation = []

    for i_num, i in enumerate(groups_train):
        if i in validation_group_numbers:
            X_test_validation.append(X_train[i_num])
            y_test_validation.append(y_train[i_num])
        else:
            X_train_validation.append(X_train[i_num])
            y_train_validation.append(y_train[i_num])

    X_test_validation = np.array(X_test_validation)
    y_test_validation = np.array(y_test_validation)
    X_train_validation = np.array(X_train_validation)
    y_train_validation = np.array(y_train_validation)

    return X_test_validation, y_test_validation, \
        X_train_validation, y_train_validation

In [89]:
import itertools


a_all = [0.3]
c_all = [20]
e_all = [100]
# threshold = np.linspace(0.25, 0.35, 5)
threshold = [0.35]
N = 10

best = 0.
params = (0., 0., 0., "", 0.)

In [91]:
params_all = itertools.product(a_all, c_all, e_all, threshold)

for (a, c, e, th) in params_all:
    sum_score = 0
    for i in range(N):

        X_test_validation, y_test_validation,\
            X_train_validation, y_train_validation = validation(X_train_norm,
                                                                y_train,
                                                                groups_train,
                                                                N, i)

        model = MySGDClassifier(batch_generator=batch_generator, alpha=a,
                                C=c, max_epoch=e, model_type='log_reg',
                                batch_size=X_train_validation.shape[0])
        model.fit(X_train_validation, y_train_validation)

        score = f1_score(y_test_validation,
                         (model.predict(X_test_validation) > th).astype(int))
        sum_score += score
    sum_score /= N
    if sum_score > best:
        best = sum_score
        params = (a, c, e, 'log_reg', th)
print(params, best)

(0.3, 20, 100, 'log_reg', 0.35) 0.6526988659376126


In [92]:
test_data, traingroups_titledata = extract_info_from_csv('test_groups.csv', False)
X_test, y_test, groups_test = create_x_y_groups_train(traingroups_titledata, False)

(16627, 15) (16627,)


In [93]:
scaler = StandardScaler()
scaler.fit(np.concatenate((X_train, X_test)))
X_test = scaler.transform(X_test)
X_train = scaler.transform(X_train)



С лучшими параметрами на валидации сделайте предсказание на тестовом множестве, отправьте его на проверку на платформу kaggle. Убедитесь, что Вы смогли побить public score первого бейзлайна. Если да, то Вы молодец!

In [94]:
params

(0.3, 20, 100, 'log_reg', 0.35)

In [95]:
best_model = MySGDClassifier(batch_generator=batch_generator, alpha=params[0],
                             C=params[1], max_epoch=params[2],
                             model_type=params[3],
                             batch_size=X_train.shape[0])
best_model.fit(X_train, y_train)

y_predict = (best_model.predict(X_test) > params[4]).astype(int)
rez = pd.DataFrame({'pair_id': test_data['pair_id'], 'target': y_predict})
rez.to_csv("linlogreg.csv", index=False)

In [96]:
y_predict.mean()

0.36891802489926023