### Домашнее задание к уроку 2

1. Самостоятельно разобраться с тем, что такое tfidf (документация https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html и еще - https://scikit-learn.org/stable/modules/feature_extraction.html#text-feature-extraction)
2. Модифицировать код функции get_user_embedding таким образом, чтобы считалось не среднее (как в примере np.mean), а медиана. Применить такое преобразование к данным, обучить модель прогнозирования оттока и посчитать метрики качества и сохранить их: roc auc, precision/recall/f_score (для 3 последних - подобрать оптимальный порог с помощью precision_recall_curve, как это делалось на уроке)
3. Повторить п.2, но используя уже не медиану, а max
4. (опциональное, если очень хочется) Воспользовавшись полученными знаниями из п.1, повторить пункт 2, но уже взвешивая новости по tfidf (подсказка: нужно получить веса-коэффициенты для каждого документа. Не все документы одинаково информативны и несут какой-то положительный сигнал). Подсказка 2 - нужен именно idf, как вес.
5. Сформировать на выходе единую таблицу, сравнивающую качество 3 разных метода получения эмбедингов пользователей: mean, median, max, idf_mean по метрикам roc_auc, precision, recall, f_score
6. Сделать самостоятельные выводы и предположения о том, почему тот или ной способ оказался эффективнее остальных

#### Ссылки

1. http://www.machinelearning.ru/wiki/images/d/d5/Voron17survey-artm.pdf
2. https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

#### Библиотеки, которые нужно установить:

1. gensim
2. razdel
3. pymorphy2
4. nltk

#### Дополнительно, библиотеки для анализа текста
1. spacy 
2. natasha
3. nltk

In [72]:
import pandas as pd
import re
import numpy as np
import itertools

import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models import LdaModel
from gensim.test.utils import datapath

import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from razdel import tokenize

import pymorphy2  

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

import matplotlib.pyplot as plt

%matplotlib inline

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/phocaman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [73]:
news = pd.read_csv("articles.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [74]:
users = pd.read_csv("users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [75]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [76]:
with open('stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [77]:
def clean_text(text):
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())

    return text

cache = {}

def lemmatization(text):
    
    if not isinstance(text, str):
        text = str(text)
    
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-':
            w = w[1:]
        if len(w) > 1:
            if w in cache:
                words_lem.append(cache[w])
            else:
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru]
    
    return words_lem_without_stopwords

In [78]:
num_topics = 25

In [79]:
%%time

news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 21.4 s, sys: 101 ms, total: 21.5 s
Wall time: 21.6 s


In [80]:
%%time

news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 2min 58s, sys: 207 ms, total: 2min 58s
Wall time: 2min 58s


In [81]:
# news['title']

In [82]:
texts = [t for t in news['title'].values]

common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [83]:
# common_dictionary[10]

In [84]:
# common_corpus

In [85]:
%%time

lda = LdaModel(common_corpus, num_topics=num_topics, id2word=common_dictionary)

CPU times: user 1min 5s, sys: 488 ms, total: 1min 5s
Wall time: 21.4 s


In [86]:
temp_file = datapath('model.lda')
lda.save(temp_file)

lda = LdaModel.load(temp_file)

In [87]:
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'играть', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(9, 0.4258436), (16, 0.25808102), (19, 0.03796775), (20, 0.25594822)]

In [88]:
x = lda.show_topics(num_topics=num_topics, num_words=7, formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: восток общество операция кость ск стол гражданский
topic_1: исследование женщина обнаружить мужчина погибнуть найти жизнь
topic_2: квартира рак цена спрос городской планета мероприятие
topic_3: россия nn российский млрд москва путин день
topic_4: ракета смерть доллар источник обращение белый налог
topic_5: сша исследование россия государство пациент российский правительство
topic_6: фонд пенсия технология федеральный государственный проект реформа
topic_7: закон депутат документ форум законопроект госдума завод
topic_8: ребёнок экономика гражданин экономический расход активность доклад
topic_9: рубль журнал nn управление всё выяснить первый
topic_10: снижение дональд писать разместить треть климат параметр
topic_11: россиянин место производитель век nn тело мышь
topic_12: банк рф экономический заместитель министр фсб председатель
topic_13: проект рост северный программа рынок запуск новый
topic_14: млн тыс составить доход ставка сумма стоимость
topic_15: всё большой очень мозг

In [89]:
def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(num_topics):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [90]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(num_topics)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id'] + ['topic_{}'.format(i) for i in range(num_topics)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.03622,0.0,0.0,0.0,0.043982,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333311,0.299708,0.0,0.343187,0.0,0.0
2,4897,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.257973,0.0,0.0,0.037964,0.255955,0.0,0.0,0.0,0.0
3,4898,0.0,0.0,0.0,0.076099,0.0,0.0,0.026678,0.0,0.0,...,0.700474,0.0,0.082514,0.0,0.0,0.104075,0.0,0.0,0.0,0.0
4,4899,0.0,0.0,0.0,0.0,0.0,0.0,0.482346,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.125898,0.0,0.0,0.0,0.043097


In [91]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [92]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(num_topics)]].values))

In [93]:
doc_dict[293622]

array([0.        , 0.17187116, 0.05562072, 0.12524131, 0.        ,
       0.        , 0.        , 0.17610952, 0.        , 0.        ,
       0.03674516, 0.        , 0.        , 0.        , 0.08323139,
       0.21252991, 0.10239127, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.02537297])

In [94]:
results = {
    'method': [],
    'roc_auc': [],
    'precision': [],
    'recall': [],
    'f_score': []
}

In [95]:
vectorizer = TfidfVectorizer()

weights = vectorizer.fit_transform(users['articles'])

weights_idf = pd.DataFrame(vectorizer.idf_, index=vectorizer.get_feature_names(), columns=['weights'])

In [96]:
def get_user_embedding(user_articles_list, func=np.mean):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])
    user_vector = func(user_vector, 0)
    return user_vector

In [97]:
target = pd.read_csv('users_churn.csv')
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [98]:
functions = [np.mean, np.median, np.max]

In [99]:
for function in functions:
    func_name = str(function).split()[1]
    user_embeddings = pd.DataFrame([i for i in users['articles'].apply(lambda x: get_user_embedding(x,function), 1)])
    user_embeddings.columns = ['topic_{}'.format(i) for i in range(num_topics)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+['topic_{}'.format(i) for i in range(num_topics)]]
    user_embeddings.head(3)
    
    X = pd.merge(user_embeddings, target, 'left')
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(num_topics)]], 
                                                    X['churn'], random_state=42)
    
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    
    
    preds = logreg.predict_proba(X_test)[:, 1]
    preds[:10]
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    print(func_name)
    print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                            fscore[ix],
                                                                            precision[ix],
                                                                            recall[ix]))
    print("-"*10)
    results['method'].append(func_name)
    results['roc_auc'].append(roc_auc_score(y_test, preds))
    results['precision'].append(precision[ix])
    results['recall'].append(recall[ix])
    results['f_score'].append(fscore[ix])
    
    roc_auc_score(y_test, preds)

mean
Best Threshold=0.271938, F-Score=0.728, Precision=0.794, Recall=0.672
----------
median
Best Threshold=0.263249, F-Score=0.857, Precision=0.819, Recall=0.897
----------
amax
Best Threshold=0.392821, F-Score=0.839, Precision=0.893, Recall=0.791
----------


In [100]:
res = pd.DataFrame(data=results).sort_values('f_score', ascending=False)

In [101]:
res

Unnamed: 0,method,roc_auc,precision,recall,f_score
1,median,0.987339,0.819495,0.897233,0.856604
2,amax,0.984036,0.892857,0.790514,0.838574
0,mean,0.961585,0.794393,0.671937,0.728051
