In [1]:
import pandas as pd

In [2]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [3]:
users = pd.read_csv("users_articles.csv")

Итак, нам нужно получить векторные представления пользователей на основе прочитанным ими новостей и самих новостей

In [4]:
!pip install razdel pymorphy2



You should consider upgrading via the 'C:\Users\huawei\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [5]:
# предобработка текстов
import re
import numpy as np
from gensim.corpora.dictionary import Dictionary
from razdel import tokenize  # сегментация русскоязычного текста на токены и предложения https://github.com/natasha/razdel
import pymorphy2  # Морфологический анализатор

Не все слова равны, не все слова одинаково работают. К примеру, союзы и предлоги в нашей задачи никак не помогут, поэтому можем их выкидывать. (*Но в задачах оценки стиля, к примеру, такие слова будут очень полезны*)

In [6]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\huawei\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
stopword_ru = stopwords.words('russian')
# print(len(stopword_ru))

In [8]:
# stopword_ru[:10]

In [9]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
    
stopword_ru += additional_stopwords
# len(stopword_ru)

In [10]:
# stopword_ru[200:210]

In [11]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    text = re.sub('n', ' ', text)
    
    return text

cache = {}
morph = pymorphy2.MorphAnalyzer()

def lemmatization(text):    
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист лемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w) > 1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords = [i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [12]:
morph = pymorphy2.MorphAnalyzer()
# morph.parse('сбегали')[0].normal_form

In [13]:
news['title'].iloc[:2].apply(lambda x: clean_text(x))

  text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)


0    заместитель председателя правительства рф серг...
1    матч  финала кубка россии по футболу был приос...
Name: title, dtype: object

In [14]:
%%time
from tqdm import tqdm
tqdm.pandas()

# Запускаем очистку текста. Будет долго...
news['title'] = news['title'].progress_apply(lambda x: clean_text(x))

100%|███████████████████████████████████████████████████████████████████████████| 27000/27000 [00:27<00:00, 966.59it/s]

CPU times: total: 28 s
Wall time: 28 s





In [15]:
# news['title'].iloc[:10]

In [16]:
news['title'].iloc[:2].apply(lambda x: lemmatization(x))

0    [заместитель, председатель, правительство, рф,...
1    [матч, финал, кубок, россия, футбол, приостано...
Name: title, dtype: object

In [17]:
%%time
# Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].progress_apply(lambda x: lemmatization(x))

100%|███████████████████████████████████████████████████████████████████████████| 27000/27000 [03:53<00:00, 115.67it/s]


CPU times: total: 3min 53s
Wall time: 3min 53s


А теперь в 3 строчки обучим нашу модель

In [18]:
# сформируем список наших текстов
texts = list(news['title'].values)

# Создадим корпус из списка с текстами
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

Что такое common_dictionary и как он выглядит

In [19]:
len(common_dictionary)

135723

In [20]:
common_dictionary[4]

'банк'

In [21]:
# common_dictionary.id2token

In [22]:
common_dictionary.doc2bow(['коллега', 'пошел', 'пить', 'чай'], allow_update=True)

[(1036, 1), (6204, 1), (12347, 1), (135723, 1)]

Все просто - это словарь наших слов

Запускаем обучение

In [23]:
N_topic = 40

In [24]:
%%time
from gensim.models import LdaModel

# Обучаем модель на корпусе
lda = LdaModel(common_corpus, num_topics=N_topic, id2word=common_dictionary, passes=15)

CPU times: total: 39min 7s
Wall time: 35min 26s


In [25]:
from gensim.test.utils import datapath

# Сохраняем модель на диск
temp_file = datapath("model.lda")
lda.save(temp_file)

In [26]:
# Загружаем обученную модель с диска
lda = LdaModel.load(temp_file)

In [27]:
# Создаем новый корпус документов, которые раньше не видели
other_texts = list(news['title'].iloc[:3])
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'свой', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'провести', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'работа', 'сказать', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'мочь', 'играть', 'ещё', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'это', 'хороший']


[(3, 0.18441282),
 (4, 0.022598917),
 (15, 0.22566308),
 (21, 0.020144101),
 (23, 0.022547606),
 (27, 0.060572412),
 (33, 0.31674802),
 (35, 0.022681735),
 (36, 0.05869327),
 (39, 0.049233317)]

Обучили модель. Теперь 2 вопроса:

1. как выглядят наши темы
2. как получить для документа вектор значений (вероятности принадлежности каждой теме)

In [28]:
x = lda.show_topics(num_topics=N_topic, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

# Печатаем только слова
for topic, words in topics_words:
    print(f"topic_{topic}: " + " ".join(words))

topic_0: рак превысить выдать кость саммит малое антонов
topic_1: свет студент продолжительность сочи ньюйорк тенденция аналог
topic_2: иран снизить конструкция модернизация освобождение ким обязательство
topic_3: это мочь который всё весь говорить эксперт
topic_4: мозг улица лётчик автобус необычный седьмой грунт
topic_5: боевой сила министерство японский бой оборона противник
topic_6: человек который исследование работа научный год статья
topic_7: тело экипаж объект км скорость метр пилот
topic_8: операция карта парка вылет перенести правительственный вылететь
topic_9: земля вода китай повышение пациент китайский опубликовать
topic_10: дело это новость снижение москва россия сообщить
topic_11: год который первый новый время это советский
topic_12: солнце берег вуз порт мышь грузия reuters
topic_13: суд гражданин закон решение депутат право госдума
topic_14: сша американский погибнуть американец северный чёрный корея
topic_15: взрыв продукция москва столица московский сезон ночью
topi

Очень неплохо - большинство тем вполне можно описать о чем они

Давайте напишем функцию, которая будет нам возвращать векторное представление новости

In [29]:
def get_lda_vector(lda, text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]

    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(N_topic):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [30]:
get_lda_vector(lda, news['title'].iloc[0])

array([0.        , 0.01057281, 0.        , 0.06667945, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.02696209, 0.        , 0.12555218, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.02000924, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.06566251, 0.        ,
       0.        , 0.66161233, 0.        , 0.        , 0.        ])

In [31]:
%%time
topic_matrix = pd.DataFrame([get_lda_vector(lda, text) for text in news['title'].values])
topic_matrix.columns = [f'topic_{i}' for i in range(N_topic)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+[f'topic_{i}' for i in range(N_topic)]]

CPU times: total: 2min 13s
Wall time: 2min 1s


In [32]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[[f'topic_{i}' for i in range(N_topic)]].values))

In [33]:
def get_user_embedding(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    # print(user_vector)
    user_vector = np.mean(user_vector, 0)  # можно не среднее
    return user_vector

In [34]:
def get_user_embedding_median(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    # print(user_vector)
    user_vector = np.median(user_vector, 0)  # можно не среднее
    return user_vector

In [35]:
def get_user_embedding_max(user_articles_list, doc_dict):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    # print(user_vector)
    user_vector = np.max(user_vector, 0)  # можно не среднее
    return user_vector

In [36]:
user_articles_list = users['articles'].iloc[33]

get_user_embedding(user_articles_list, doc_dict)

array([0.        , 0.        , 0.00358976, 0.11085354, 0.0101708 ,
       0.00921569, 0.04306472, 0.00468135, 0.        , 0.00520137,
       0.13080177, 0.04540992, 0.        , 0.05987561, 0.01832657,
       0.        , 0.00189629, 0.10098949, 0.        , 0.        ,
       0.03496249, 0.02080615, 0.00996015, 0.        , 0.        ,
       0.00259724, 0.02314653, 0.13221316, 0.0133737 , 0.00314877,
       0.        , 0.        , 0.00334058, 0.        , 0.11328334,
       0.00407024, 0.00660431, 0.        , 0.01734719, 0.05684691])

Построим user_embeddings для 3 различных видов embedding.

In [38]:
%%time
user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x, doc_dict))])
user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings['uid'] = users['uid'].values
user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings.head(3)

CPU times: total: 859 ms
Wall time: 882 ms


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39
0,u105138,0.011247,0.002513,0.005291,0.053419,0.0,0.002416,0.042379,0.004389,0.0,...,0.0,0.008237,0.011963,0.0,0.075149,0.035619,0.020558,0.036703,0.024163,0.037011
1,u108690,0.001781,0.0,0.0,0.118837,0.0,0.006324,0.025715,0.0,0.0,...,0.0,0.0,0.0,0.0,0.113376,0.030914,0.013963,0.043987,0.011901,0.031017
2,u108339,0.0,0.0,0.0,0.074348,0.023745,0.002001,0.019782,0.00891,0.004725,...,0.0,0.0,0.0,0.0,0.133661,0.0,0.029176,0.025609,0.009585,0.04454


In [39]:
%%time
user_embeddings_median = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_median(x, doc_dict))])
user_embeddings_median.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings_median['uid'] = users['uid'].values
user_embeddings_median = user_embeddings_median[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings_median.head(3)

CPU times: total: 1.42 s
Wall time: 1.42 s


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39
0,u105138,0.0,0.0,0.0,0.041325,0.0,0.0,0.031841,0.0,0.0,...,0.0,0.0,0.0,0.0,0.084627,0.0,0.0,0.0,0.0,0.0
1,u108690,0.0,0.0,0.0,0.117806,0.0,0.0,0.019403,0.0,0.0,...,0.0,0.0,0.0,0.0,0.107707,0.020314,0.005652,0.027576,0.0,0.0
2,u108339,0.0,0.0,0.0,0.075069,0.013614,0.0,0.020514,0.0,0.0,...,0.0,0.0,0.0,0.0,0.157725,0.0,0.023175,0.0,0.007359,0.049775


In [40]:
%%time
user_embeddings_max = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding_max(x, doc_dict))])
user_embeddings_max.columns = [f'topic_{i}' for i in range(N_topic)]
user_embeddings_max['uid'] = users['uid'].values
user_embeddings_max = user_embeddings_max[['uid']+[f'topic_{i}' for i in range(N_topic)]]
user_embeddings_max.head(3)

CPU times: total: 797 ms
Wall time: 800 ms


Unnamed: 0,uid,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_30,topic_31,topic_32,topic_33,topic_34,topic_35,topic_36,topic_37,topic_38,topic_39
0,u105138,0.06748,0.015079,0.031747,0.13425,0.0,0.014498,0.112659,0.026332,0.0,...,0.0,0.049421,0.041594,0.0,0.145774,0.190811,0.101995,0.128946,0.097664,0.181435
1,u108690,0.010689,0.0,0.0,0.183391,0.0,0.025611,0.071247,0.0,0.0,...,0.0,0.0,0.0,0.0,0.204929,0.079595,0.048909,0.155018,0.071403,0.186105
2,u108339,0.0,0.0,0.0,0.079756,0.079375,0.012007,0.044887,0.053459,0.028347,...,0.0,0.0,0.0,0.0,0.238843,0.0,0.068948,0.126861,0.027885,0.089825


In [41]:
target = pd.read_csv("users_churn.csv")
# target.head(3)

Создадим датасеты, а так же разобьём их на трейн и тест так же в 3 различных вариантах.

In [42]:
X = pd.merge(user_embeddings, target, 'left')
X_median = pd.merge(user_embeddings_median, target, 'left')
X_max = pd.merge(user_embeddings_max, target, 'left')


In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

In [44]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(X[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)


X_train_median, X_test_median, y_train, y_test = train_test_split(X_median[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)


X_train_max, X_test_max, y_train, y_test = train_test_split(X_max[[f'topic_{i}' for i in range(N_topic)]], 
                                                    X['churn'], random_state=0)

Обучаем 3 различные модели логистической регрессии и делаем предсказание вероятностей.

In [45]:
logreg = LogisticRegression()
logreg_median = LogisticRegression()
logreg_max = LogisticRegression()
# обучим 
logreg.fit(X_train, y_train)
logreg_median.fit(X_train_median, y_train)
logreg_max.fit(X_train_max, y_train)

In [46]:
# наши прогнозы для тестовой выборки
preds = logreg.predict_proba(X_test)[:, 1]
preds_median = logreg_median.predict_proba(X_test_median)[:, 1]
preds_max = logreg_max.predict_proba(X_test_max)[:, 1]

Рассчитаем Precision, Recall, F_score, Roc_auc_score для всех трёх случаев использования разлиичных методов embedding.

In [47]:
from sklearn.metrics import (f1_score, roc_auc_score, precision_score,
                             classification_report, precision_recall_curve, confusion_matrix)

In [48]:
# В этом списке сформируем необходимый набор данных для итоговой таблицы метрик по различным моделям
metrics_matrix = []

In [49]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')
metrics_matrix.append(['mean', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, preds_max)])

Best Threshold=0.25658121473498224, F-Score=0.772, Precision=0.722, Recall=0.829


In [50]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_median)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')
metrics_matrix.append(['median', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, preds_median)])

Best Threshold=0.26424804602287816, F-Score=0.797, Precision=0.778, Recall=0.816


In [51]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_max)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')
metrics_matrix.append(['max', precision[ix], recall[ix], fscore[ix], roc_auc_score(y_test, preds_max)])

Best Threshold=0.36998637959286085, F-Score=0.841, Precision=0.850, Recall=0.833


In [52]:
metrics_matrix

[['mean',
  0.7224199288256228,
  0.8285714285714286,
  0.7718631178707225,
  0.9826106168963312],
 ['median',
  0.7782101167315175,
  0.8163265306122449,
  0.7968127490039841,
  0.979080179080179],
 ['max', 0.85, 0.8326530612244898, 0.8412371134020619, 0.9826106168963312]]

In [53]:
metrics_table = pd.DataFrame(metrics_matrix, columns=['Embedding method', 'Precision', 'Recall', 'F score', 'Roc Auc'])

In [55]:
metrics_table

Unnamed: 0,Embedding method,Precision,Recall,F score,Roc Auc
0,mean,0.72242,0.828571,0.771863,0.982611
1,median,0.77821,0.816327,0.796813,0.97908
2,max,0.85,0.832653,0.841237,0.982611


Показатели метрик отличались, когда я использовал разные параметры при обучении моделей LDA, которые не вошли в итоговый ноутбук, но обычно лидировали показания с median-методом или max-методом.

Как мне кажется, в таком случае наиболее оправданно использования max-метод.