In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = (12,5)

In [2]:
from sklearn.metrics import f1_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
import time

In [3]:
%load_ext pycodestyle_magic

In [4]:
#%%pycodestyle
def batch_generator(X, y, shuffle=True, batch_size=1):
    """
    Yield consecutive mini-batches for one pass (epoch) over the data.

    X          - object-feature matrix, indexed along its first axis
    y          - target vector aligned with the rows of X
    shuffle    - if True, rows are visited in a random order
    batch_size - rows per batch (1 is plain SGD, > 1 is mini-batch GD)

    Yields (X_batch, y_batch) tuples. Only full batches are produced: a
    trailing remainder of fewer than batch_size rows is skipped.
    """
    n_rows = X.shape[0]
    order = np.arange(n_rows)
    if shuffle:
        order = np.random.permutation(order)

    # The last admissible start leaves room for one complete batch.
    last_start = n_rows - batch_size
    for start in range(0, last_start + 1, batch_size):
        rows = order[start: start + batch_size]
        yield (X[rows], y[rows])

# Теперь можно сделать генератор по данным, например:
#   my_batch_generator = batch_generator(X, y, shuffle=True, batch_size=1)

In [5]:
#%%pycodestyle
from sklearn.base import BaseEstimator, ClassifierMixin


def sigmoid(x):
    """
    Logistic sigmoid 1 / (1 + exp(-x)), evaluated without overflow.

    x - output of the linear model (scalar or array-like)

    Returns a float64 value with the same shape as x (a NumPy scalar for
    scalar input). The naive formula computes exp(-x), which overflows for
    large negative x and emits a RuntimeWarning; here the exponent is always
    non-positive, so saturated inputs quietly map to 0.0 or 1.0.
    """
    z = np.asarray(x, dtype=float)
    # exp(-|z|) lies in (0, 1], so it can underflow to 0 but never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
    sigm_value_x = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # [()] unwraps a 0-d result to a scalar and leaves real arrays untouched.
    return sigm_value_x[()]


class MySGDClassifier(BaseEstimator, ClassifierMixin):
    """
    Linear or logistic regression with an L2 penalty, trained by mini-batch
    gradient descent.

    fit and predict prepend a column of ones to X, so weights[0] acts as the
    intercept. predict returns raw model outputs (linear values for
    'lin_reg', sigmoid probabilities for 'log_reg'); callers apply their own
    threshold to obtain class labels.
    """

    # model_type values understood by calc_loss / calc_loss_grad / predict
    _MODEL_TYPES = ('lin_reg', 'log_reg')

    def __init__(self, batch_generator, batch_size=1,
                 C=1, alpha=0.01,
                 max_epoch=10, model_type='lin_reg'):
        """
        batch_generator - generator function used to produce the batches
        batch_size - number of rows per batch passed to batch_generator
        C - regularization coefficient (the penalty is divided by C)
        alpha - learning rate (gradient step size)
        max_epoch - number of passes over the training data
        model_type - model kind, 'lin_reg' or 'log_reg'
        """

        self.C = C
        self.alpha = alpha
        self.max_epoch = max_epoch
        self.batch_generator = batch_generator
        self.errors_log = {'iter': [], 'loss': []}
        self.model_type = model_type
        self.batch_size = batch_size

    def _check_model_type(self):
        """Raise ValueError when model_type is not a supported value."""
        if self.model_type not in self._MODEL_TYPES:
            raise ValueError(
                "model_type must be one of {}, got {!r}".format(
                    self._MODEL_TYPES, self.model_type))

    def calc_loss(self, X_batch, y_batch):
        """
        Compute the regularized loss on one batch.

        X_batch - object-feature matrix of the batch
        y_batch - target vector of the batch

        'lin_reg' uses the mean squared error, 'log_reg' the mean log-loss;
        sum(weights ** 2) / C is added in both cases.
        Raises ValueError for an unsupported model_type.
        """
        self._check_model_type()
        n_objects = X_batch.shape[0]
        linear_out = np.dot(X_batch, self.weights)
        if self.model_type == "lin_reg":
            loss = np.sum((linear_out - y_batch) ** 2) / n_objects
        else:
            # Keep probabilities strictly inside (0, 1): a saturated sigmoid
            # would otherwise put log(0) = -inf (or nan) into the loss log.
            eps = np.finfo(float).eps
            proba = np.clip(sigmoid(linear_out), eps, 1 - eps)
            log_lik = (y_batch * np.log(proba)
                       + (1 - y_batch) * np.log(1 - proba))
            loss = -np.sum(log_lik) / n_objects
        # NOTE(review): the penalty also covers weights[0], the intercept --
        # confirm this is intended.
        loss += (np.sum(self.weights ** 2) / self.C)
        return loss

    def calc_loss_grad(self, X_batch, y_batch):
        """
        Compute the gradient of the regularized loss on one batch.

        X_batch - object-feature matrix of the batch
        y_batch - target vector of the batch

        Returns a vector with one entry per weight.
        Raises ValueError for an unsupported model_type.
        """
        self._check_model_type()
        n_objects = X_batch.shape[0]
        linear_out = np.dot(X_batch, self.weights)
        if self.model_type == "lin_reg":
            loss_grad = 2 * np.dot(linear_out - y_batch, X_batch) / n_objects
        else:
            proba = sigmoid(linear_out)
            loss_grad = -np.dot(y_batch - proba, X_batch) / n_objects
        # Derivative of the sum(weights ** 2) / C penalty.
        loss_grad += (2 * self.weights / self.C)
        return loss_grad

    def update_weights(self, new_grad):
        """
        Take one gradient step, updating the weights in place.

        new_grad - gradient computed on the current batch
        """
        self.weights -= self.alpha * new_grad

    def fit(self, X, y):
        '''
        Train the model.

        X - object-feature matrix
        y - target vector

        Weights are drawn from a standard normal distribution and updated
        once per batch for max_epoch passes. After every update the batch
        loss and the batch index within its epoch are appended to errors_log.
        Returns self. Raises ValueError for an unsupported model_type.
        '''
        self._check_model_type()
        X_upd = np.insert(X, 0, 1, axis=1)
        y = np.asarray(y)
        self.weights = np.random.randn(X_upd.shape[1])
        # Start a fresh log so that refitting the same instance does not
        # append to the history of a previous run.
        self.errors_log = {'iter': [], 'loss': []}

        for _ in range(0, self.max_epoch):
            new_epoch_generator = self.batch_generator(
                X_upd, y, batch_size=self.batch_size)

            for batch_num, (X_batch, y_batch) in enumerate(
                    new_epoch_generator):
                batch_grad = self.calc_loss_grad(X_batch, y_batch)
                self.update_weights(batch_grad)

                # The loss is measured after the step, on the same batch.
                batch_loss = self.calc_loss(X_batch, y_batch)
                self.errors_log['iter'].append(batch_num)
                self.errors_log['loss'].append(batch_loss)
        return self

    def predict(self, X):
        '''
        Return the raw model output for every row of X.

        X - object-feature matrix

        'lin_reg' gives the linear response, 'log_reg' its sigmoid; no
        thresholding into class labels is done here.
        Raises ValueError for an unsupported model_type.
        '''
        self._check_model_type()
        X_upd = np.insert(X, 0, 1, axis=1)
        y_hat = np.dot(X_upd, self.weights)
        if self.model_type == "log_reg":
            y_hat = sigmoid(y_hat)
        return y_hat

# Боевое применение (3  балла)

In [6]:
import nltk
from nltk.corpus import brown
import pymorphy2

# The stop-word lists ship as a separate NLTK corpus. Fetch it on demand so
# that the notebook also runs on an environment where it is not installed.
try:
    stop_words = nltk.corpus.stopwords.words('russian')
except LookupError:
    nltk.download('stopwords')
    stop_words = nltk.corpus.stopwords.words('russian')
stemmer = nltk.stem.SnowballStemmer('russian')
morph = pymorphy2.MorphAnalyzer()
# stop_words_remove strips punctuation before the stop-word check, so a
# punctuation-only token becomes ''; listing '' here drops such tokens.
stop_words.append('')

In [7]:
def title_preprocessing(title, stop_words, preprocessing_type='stemming'):
    """
    Normalize a raw title into a space-separated string of kept tokens.

    title              - raw title string
    stop_words         - collection of tokens to discard
    preprocessing_type - 'stemming' for stemming, any other value for
                         lemmatization (decided inside stop_words_remove)
    """
    raw_tokens = title.strip().split()
    kept_tokens = stop_words_remove(raw_tokens, stop_words, preprocessing_type)
    return ' '.join(kept_tokens)

def stop_words_remove(title_words, stop_words, preprocessing_type='stemming'):
    """
    Clean, filter and normalize the tokens of one title.

    title_words        - iterable of raw tokens
    stop_words         - collection of tokens to discard (matched after
                         punctuation stripping, before normalization)
    preprocessing_type - 'stemming' applies title_stemming, any other value
                         applies title_lemming

    Returns the list of normalized tokens in their original order. Tokens
    whose normalized form is exactly one character long are dropped.
    """
    # The backslash is escaped explicitly: the former '\.' denoted the same
    # two characters but is an invalid escape sequence. '%' is deliberately
    # absent from the set.
    strip_chars = ' !?@#$^&*"\'()_«»<>-+={}[]/\\.,:;'
    # Constant-time membership instead of scanning a list for every token.
    stop_set = frozenset(stop_words)

    new_title = []
    for word in title_words:
        word = word.strip(strip_chars)
        if word in stop_set:
            continue
        if preprocessing_type == 'stemming':
            word = title_stemming(word)
        else:
            word = title_lemming(word)
        if len(word) != 1:
            new_title.append(word)
    return new_title

def title_stemming(title_words):
    """Return the stem of one token, as produced by the module-level stemmer."""
    stem = stemmer.stem(title_words)
    return stem

def title_lemming(title):
    """Return the normal form of one token, taken from the first parse
    returned by the module-level morphological analyzer."""
    parses = morph.parse(title)
    first_parse = parses[0]
    return first_parse.normal_form

In [8]:
# Build the doc_id -> preprocessed (lemmatized) title mapping from the
# tab-separated titles file.
doc_to_title = {}
with open('docs_titles.tsv', encoding='utf-8') as f:
    for num_line, line in enumerate(f):
        if num_line > 0:  # the first line of the file is skipped
            data = line.strip().split('\t', 1)
            doc_id = int(data[0])
            # A row without a second column gets an empty title.
            title = (title_preprocessing(data[1], stop_words,
                                         preprocessing_type='lemming')
                     if len(data) > 1 else '')
            doc_to_title[doc_id] = title
print(len(doc_to_title))

28026


doc_to_title

In [9]:
import pandas as pd
train_data = pd.read_csv('train_groups.csv')

# group_id -> list of (doc_id, preprocessed title, target), in file order.
# The columns are iterated directly: indexing with .iloc[i] in a Python loop
# builds a whole Series per row and is much slower.
traingroups_titledata = {}
for doc_group, doc_id, target in zip(train_data['group_id'],
                                     train_data['doc_id'],
                                     train_data['target']):
    title = doc_to_title[doc_id]
    traingroups_titledata.setdefault(doc_group, []).append(
        (doc_id, title, target))

In [10]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from scipy.sparse.csr import csr_matrix #need this if you want to save tfidf_matrix

from sklearn.decomposition import TruncatedSVD

from scipy.spatial.distance import euclidean

def get_group_articles(data):
    """Return the element at index 1 of every record in data, in order
    (the title in the group records built by this notebook)."""
    return [record[1] for record in data]


# Two features per document: its coordinates along the first two right
# singular vectors of the group's term-document TF-IDF matrix. The
# vectorizer and the SVD are fitted separately for every group.
y_train = []
X_train = []
groups_train = []
for new_group in traingroups_titledata:
    docs = traingroups_titledata[new_group]

    # min_df=1 keeps every term, exactly like the former min_df=0, but stays
    # inside the documented range for integer counts (newer scikit-learn
    # releases validate constructor parameters and may reject an integer 0).
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                         min_df=1, stop_words=None, sublinear_tf=True)
    all_group_articles = get_group_articles(docs)
    tfidf_matrix = tf.fit_transform(all_group_articles).toarray()

    # Transposed input is terms x documents, so column k of vt belongs to
    # document k of the group.
    # NOTE(review): vt[1, k] requires at least two documents per group.
    u, s, vt = np.linalg.svd(tfidf_matrix.T)

    for k, (doc_id, title, target_id) in enumerate(docs):
        y_train.append(target_id)
        groups_train.append(new_group)
        X_train.append((vt[0, k], vt[1, k]))

X_train = np.array(X_train)
y_train = np.array(y_train)
groups_train = np.array(groups_train)
print(X_train.shape, y_train.shape, groups_train.shape)

(11690, 2) (11690,) (11690,)


In [11]:
X_train

array([[-2.87205025e-01, -1.62220702e-01],
       [-1.18225148e-01,  2.93127514e-01],
       [-1.23440673e-01,  2.15946873e-02],
       ...,
       [ 8.20404111e-19,  1.30868241e-19],
       [-4.47894396e-02,  1.24900090e-16],
       [-1.08591198e-04,  1.17961196e-16]])

In [13]:
pd.DataFrame(X_train)

Unnamed: 0,0,1
0,-2.872050e-01,-1.622207e-01
1,-1.182251e-01,2.931275e-01
2,-1.234407e-01,2.159469e-02
3,5.551115e-17,4.440892e-16
4,-2.220658e-01,-1.423748e-01
5,-6.788986e-02,4.821970e-03
6,-9.643598e-02,-3.132782e-03
7,-1.282126e-01,5.244821e-02
8,-8.771822e-02,-9.474293e-03
9,-1.085910e-02,2.033767e-02


In [14]:
pd.DataFrame(X_train).to_csv('feature_tf-idf_svd.csv', index=False)

In [110]:
# Standardize the features with statistics of the training rows only.
# X_train_norm is the matrix the cross-validated grid search is run on.
scaler = StandardScaler()
X_train_norm = scaler.fit_transform(X_train)

In [111]:
#%%pycodestyle
def validation(X_train, y_train, groups_train, N, nn):
    """
    Split the data into one validation fold and the remaining training part,
    keeping every group entirely on one side of the split.

    X_train      - object-feature matrix
    y_train      - target vector
    groups_train - group id of every row
    N            - number of folds
    nn           - index of the fold to hold out (0 .. N - 1)

    Each fold holds int(n_groups / N) consecutive ids of the sorted distinct
    group ids; when n_groups is not divisible by N the last few groups are
    never held out. Group ids no longer have to be exactly 1..n_groups.

    Returns (X_test_validation, y_test_validation,
             X_train_validation, y_train_validation) as NumPy arrays.
    """
    X_train = np.asarray(X_train)
    y_train = np.asarray(y_train)
    groups_train = np.asarray(groups_train)

    # Use the ids that actually occur instead of assuming 1..n_groups.
    group_ids = np.unique(groups_train)
    validation_size = int(len(group_ids) / N)
    validation_group_numbers = group_ids[nn * validation_size:
                                         (nn + 1) * validation_size]

    # A single vectorized membership test replaces the per-row Python loop.
    in_validation = np.isin(groups_train, validation_group_numbers)

    return X_train[in_validation], y_train[in_validation], \
        X_train[~in_validation], y_train[~in_validation]

In [112]:
import itertools


# Hyper-parameter grid for the cross-validated search.
# Earlier, coarser grid of learning rates:
#a_all = [0.45, 0.5, 0.55]
a_all = np.linspace(0.45, 0.55, 5)  # learning rates, passed as alpha
c_all = [50]  # regularization coefficients, passed as C
e_all = [50, 100]  # epoch counts, passed as max_epoch
threshold = np.linspace(0.3, 0.4, 10)  # cut-offs applied to predict() output
N = 5  # number of validation folds

# Best mean F1 found so far and the combination that produced it, stored as
# (alpha, C, max_epoch, model_type, threshold).
best = 0.
params = (0., 0., 0., "", 0.)

In [113]:
# The folds depend only on the fold index, so they are built once instead of
# once per hyper-parameter combination.
folds = [validation(X_train_norm, y_train, groups_train, N, i)
         for i in range(N)]

for (a, c, e) in itertools.product(a_all, c_all, e_all):
    # The threshold only changes how the predicted probabilities are cut,
    # not how the model is trained: fit once per fold and score that model
    # against every threshold instead of retraining for each of them.
    th_scores = np.zeros(len(threshold))
    for (X_test_validation, y_test_validation,
         X_train_validation, y_train_validation) in folds:

        model = MySGDClassifier(batch_generator=batch_generator, alpha=a,
                                C=c, max_epoch=e, model_type='log_reg',
                                batch_size=X_train_validation.shape[0])
        model.fit(X_train_validation, y_train_validation)

        proba = model.predict(X_test_validation)
        for th_num, th in enumerate(threshold):
            th_scores[th_num] += f1_score(y_test_validation,
                                          (proba > th).astype(int))
    th_scores /= N

    # Same visiting order and strict '>' as before: the first best wins.
    for th, sum_score in zip(threshold, th_scores):
        if sum_score > best:
            best = sum_score
            params = (a, c, e, 'log_reg', th)
print(params, best)



(0.47500000000000003, 50, 100, 'log_reg', 0.32222222222222224) 0.6935435416270745


In [15]:
test_data = pd.read_csv('test_groups.csv')

# group_id -> list of (doc_id, preprocessed title), in file order.
# Kept in its own dict: reusing traingroups_titledata here replaced the
# 3-tuples of the training data with 2-tuples, so re-running the training
# feature cell afterwards failed while unpacking them.
testgroups_titledata = {}
for doc_group, doc_id in zip(test_data['group_id'], test_data['doc_id']):
    title = doc_to_title[doc_id]
    testgroups_titledata.setdefault(doc_group, []).append((doc_id, title))

# Same per-group TF-IDF + SVD features as for the training set.
# NOTE(review): rows are emitted group by group; they line up with the rows
# of test_data only if the file is ordered by group_id -- confirm.
X_test = []
groups_test = []
for new_group in testgroups_titledata:
    docs = testgroups_titledata[new_group]

    # min_df=1 keeps every term, exactly like the former min_df=0, but stays
    # inside the documented range for integer counts (newer scikit-learn
    # releases validate constructor parameters and may reject an integer 0).
    tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1),
                         min_df=1, stop_words=None, sublinear_tf=True)
    all_group_articles = get_group_articles(docs)
    tfidf_matrix = tf.fit_transform(all_group_articles).toarray()

    # Transposed input is terms x documents, so column k of vt belongs to
    # document k of the group.
    u, s, vt = np.linalg.svd(tfidf_matrix.T)

    for k, (doc_id, title) in enumerate(docs):
        groups_test.append(new_group)
        X_test.append((vt[0, k], vt[1, k]))

X_test = np.array(X_test)
groups_test = np.array(groups_test)
print(X_test.shape, groups_test.shape)

(16627, 2) (16627,)


In [16]:
X_test

array([[-0.19991686, -0.15895257],
       [-0.03976771,  0.17658427],
       [-0.12735686, -0.07487224],
       ...,
       [ 0.1183652 ,  0.00054911],
       [ 0.16051067,  0.18262468],
       [ 0.08604586, -0.15932115]])

In [17]:
pd.DataFrame(X_test).to_csv('feature_tf-idf_svd_TEST.csv', index=False)

In [116]:
# One scaler is fitted on the train and test rows stacked together, then both
# matrices are overwritten with their standardized versions.
# NOTE(review): the grid search ran on X_train_norm, which was scaled with
# train-only statistics -- confirm that including the test rows is intended.
scaler = StandardScaler().fit(np.concatenate((X_train, X_test)))
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

С лучшими параметрами на валидации сделайте предсказание на тестовом множестве, отправьте его на проверку на платформу kaggle. Убедитесь, что Вы смогли побить public score первого бейзлайна. Если да, то Вы молодец!

In [117]:
params

(0.47500000000000003, 50, 100, 'log_reg', 0.32222222222222224)

In [118]:
# Refit on the whole training set with the combination selected by the grid
# search; params is (alpha, C, max_epoch, model_type, threshold).
best_model = MySGDClassifier(batch_generator=batch_generator, alpha=params[0],
                             C=params[1], max_epoch=params[2],
                             model_type=params[3],
                             batch_size=X_train.shape[0])
best_model.fit(X_train, y_train)

# NOTE(review): cross-validation scored `predict(...) > th` as class 1, but
# here every label is flipped by `1 - ...`, so the submitted decision rule is
# the opposite of the validated one -- confirm this is intended.
y_predict = 1-(best_model.predict(X_test) > params[4]).astype(int)
# Predictions are matched to pair_id by position, so X_test must be in the
# same row order as test_data.
rez = pd.DataFrame({'pair_id': test_data['pair_id'], 'target': y_predict})
rez.to_csv("linlogreg.csv", index=False)

In [120]:
y_predict.mean()

0.9959102664341132