In [29]:
import pandas as pd
import numpy as np
import scipy.sparse

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import json

from datetime import datetime

import lightgbm as lgb
import gc

from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

import re
from nltk.corpus import stopwords

from razdel import tokenize

import pymorphy2

import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

%matplotlib inline

from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

##### Импортируем датасет с новостями и данными пользователей

In [2]:
news = pd.read_csv('articles.csv')
news.head(3)

Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv('users_articles.csv')
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


#### Получаем векторное представление новостей

##### Добавим словарь со стоп словами

In [12]:
stopword_ru = stopwords.words('russian')

morph = pymorphy2.MorphAnalyzer()

len(stopword_ru)

151

In [13]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

##### Очистка текста

In [6]:
# Функция для очистки текста

def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

# Функция для лемматизации текста

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

##### Запустим очистку текста

In [7]:
%%time
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 1min 43s, sys: 3.88 s, total: 1min 47s
Wall time: 2min 30s


##### Запустим лемматизацию текста

In [15]:
%%time
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 12min 31s, sys: 3.21 s, total: 12min 34s
Wall time: 17min 21s


##### Обучим нашу модель.
##### Сформируем список текстов, разбив пробелами.

In [16]:
# словарь текстов
texts = [t for t in news['title'].values]

# Создадим словарь и вектор
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

KeyboardInterrupt: 

In [None]:
%time
# Тренируем модель на векторе
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)

In [None]:
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

##### Функция, которая возвращает векторное представление новости

In [17]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

##### Составим матрицу вероятности попадания слов в статью

In [18]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

NameError: name 'lda' is not defined

#### Получаем векторное представление пользователей

In [25]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

NameError: name 'topic_matrix' is not defined

##### Реализуем функцию формирования вектора пользователя. Модифицируем ее, чтобы можно было задавать значение по среднему, медиане или максимуму

In [22]:
user_articles_list = users['articles'].iloc[33]

In [23]:
def get_user_embedding(user_articles_list, criterion='mean'):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    if criterion == 'mean':
        user_vector = np.mean(user_vector, 0)
    elif criterion == 'median':
        user_vector = np.median(user_vector, 0)
    elif criterion == 'max':
        user_vector = np.max(user_vector, 0)
    else:
        user_vector = 'error'
    return user_vector, criterion

##### Получим вектора для 

In [24]:
get_user_embedding(user_articles_list)

NameError: name 'doc_dict' is not defined

In [None]:
get_user_embedding(user_articles_list, 'median')

In [None]:
get_user_embedding(user_articles_list, 'max')

##### Свернем получение эмбедингов по разным критериям в одну функцию

In [26]:
def get_embegings(user_vector, criterion='mean'):
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, criterion), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(25)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(25)]]
    
    target = pd.read_csv("users_churn.csv")
    
    X = pd.merge(user_embeddings, target, 'left')
    
    return X

##### Свернем предсказание в функцию

In [28]:
def model_prediction(X):
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(25)]], 
                                                    X['churn'], random_state=0)
    
    logreg = LogisticRegression()
    #обучим 
    logreg.fit(X_train, y_train)

    #наши прогнозы для тестовой выборки
    preds = logreg.predict_proba(X_test)[:, 1]
    
    return y_test, preds

##### Произведем расчеты

In [None]:
X_mean = get_embedings(user_articles_list)
X_median = get_embedings(user_articles_list, 'median')
X_max = get_embedings(user_articles_list, 'max')

In [30]:
y_mean_train, y_mean_test = model_prediction(X_mean)
y_median_train, y_median_test = model_prediction(X_median)
y_max_train, y_max_test = model_prediction(X_max)

NameError: name 'X_mean' is not defined

#### Рассчитаесм метрики Precision, Recall, F_score, Roc_Auc

In [31]:
def calc_metrics(y, preds):
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    return 'Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix])

In [None]:
result_mean = calc_metrics(y_mean_train, y_mean_test)
result_median = calc_metrics(y_median_train, y_median_test)
result_max = calc_metrics(y_max_train, y_max_test)

In [None]:
results = pd.DataFrame({
    'Mean': pd.Series(result_mean),
    'Median': pd.Series(result_median),
    'Max': pd.Series(result_Max)
})
results