In [294]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import itertools
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [295]:
news = pd.read_csv("./Lection2/materials.csv")
print(news.shape)
news.head(3)

(27000, 2)


Unnamed: 0,doc_id,title
0,6,Заместитель председателяnправительства РФnСерг...
1,4896,Матч 1/16 финала Кубка России по футболу был п...
2,4897,Форвард «Авангарда» Томаш Заборский прокоммент...


In [296]:
users = pd.read_csv("./Lection2/users_articles.csv")
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [297]:
#from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary

In [298]:
#предобработка текстов
import re
import numpy as np
from nltk.corpus import stopwords
#from nltk.tokenize import word_tokenize

from razdel import tokenize # https://github.com/natasha/razdel
#!pip install razdel

import pymorphy2  # pip install pymorphy2

In [299]:
stopword_ru = stopwords.words('russian')
len(stopword_ru)

morph = pymorphy2.MorphAnalyzer()

In [300]:
with open('./Lection2/stopwords.txt') as f:
    additional_stopwords = [w.strip() for w in f.readlines() if w]
stopword_ru += additional_stopwords
len(stopword_ru)

776

In [301]:
def clean_text(text):
    '''
    очистка текста
    
    на выходе очищеный текст
    
    '''
    if not isinstance(text, str):
        text = str(text)
    
    text = text.lower()
    text = text.strip('\n').strip('\r').strip('\t')
    text = re.sub("-\s\r\n\|-\s\r\n|\r\n", '', str(text))

    text = re.sub("[0-9]|[-—.,:;_%©«»?*!@#№$^•·&()]|[+=]|[[]|[]]|[/]|", '', text)
    text = re.sub(r"\r\n\t|\n|\\s|\r\t|\\n", ' ', text)
    text = re.sub(r'[\xad]|[\s+]', ' ', text.strip())
    
    #tokens = list(tokenize(text))
    #words = [_.text for _ in tokens]
    #words = [w for w in words if w not in stopword_ru]
    
    #return " ".join(words)
    return text

cache = {}

def lemmatization(text):
    '''
    лемматизация
        [0] если зашел тип не `str` делаем его `str`
        [1] токенизация предложения через razdel
        [2] проверка есть ли в начале слова '-'
        [3] проверка токена с одного символа
        [4] проверка есть ли данное слово в кэше
        [5] лемматизация слова
        [6] проверка на стоп-слова

    на выходе лист отлемматизированых токенов
    '''

    # [0]
    if not isinstance(text, str):
        text = str(text)
    
    # [1]
    tokens = list(tokenize(text))
    words = [_.text for _ in tokens]

    words_lem = []
    for w in words:
        if w[0] == '-': # [2]
            w = w[1:]
        if len(w)>1: # [3]
            if w in cache: # [4]
                words_lem.append(cache[w])
            else: # [5]
                temp_cach = cache[w] = morph.parse(w)[0].normal_form
                words_lem.append(temp_cach)
    
    words_lem_without_stopwords=[i for i in words_lem if not i in stopword_ru] # [6]
    
    return words_lem_without_stopwords

In [302]:
%%time
#Запускаем очистку текста. Будет долго...
news['title'] = news['title'].apply(lambda x: clean_text(x), 1)

CPU times: user 19.2 s, sys: 185 ms, total: 19.4 s
Wall time: 19.4 s


In [303]:
%%time
#Запускаем лемматизацию текста. Будет очень долго...
news['title'] = news['title'].apply(lambda x: lemmatization(x), 1)

CPU times: user 2min 30s, sys: 1.38 s, total: 2min 31s
Wall time: 2min 32s


In [304]:
#сформируем список наших текстов, разбив еще и на пробелы
texts = [t for t in news['title'].values]

# Create a corpus from a list of texts
common_dictionary = Dictionary(texts)
common_corpus = [common_dictionary.doc2bow(text) for text in texts]

In [305]:
common_dictionary[10]

'ватутин'

In [306]:
from gensim.models import LdaModel
# Train the model on the corpus.
lda = LdaModel(common_corpus, num_topics=25, id2word=common_dictionary)#, passes=10)

In [307]:
from gensim.test.utils import datapath
# Save model to disk.
temp_file = datapath("model.lda")
lda.save(temp_file)

# Load a potentially pretrained model from disk.
lda = LdaModel.load(temp_file)

In [308]:
# Create a new corpus, made of previously unseen documents.
other_texts = [t for t in news['title'].iloc[:3]]
other_corpus = [common_dictionary.doc2bow(text) for text in other_texts]

unseen_doc = other_corpus[2]
print(other_texts[2])
lda[unseen_doc] 

['форвард', 'авангард', 'томаш', 'заборский', 'прокомментировать', 'игра', 'команда', 'матч', 'чемпионат', 'кхл', 'против', 'атланта', 'nnnn', 'плохой', 'матч', 'нижний', 'новгород', 'против', 'торпедо', 'настраиваться', 'первый', 'минута', 'включиться', 'заборский', 'получиться', 'забросить', 'быстрый', 'гол', 'задать', 'хороший', 'темп', 'поединок', 'играть', 'хороший', 'сторона', 'пять', 'очко', 'выезд', 'девять', 'хороший']


[(1, 0.12776233), (5, 0.35176405), (16, 0.1105626), (17, 0.38757)]

In [309]:
x=lda.show_topics(num_topics=25, num_words=7,formatted=False)
topics_words = [(tp[0], [wd[0] for wd in tp[1]]) for tp in x]

#Below Code Prints Only Words 
for topic,words in topics_words:
    print("topic_{}: ".format(topic)+" ".join(words))

topic_0: млрд бюджет писать министерство расход учреждение инвестиция
topic_1: новый банк газ система часть россия территория
topic_2: британский великобритания лондон противник спасти nn подсчитать
topic_3: исследование рост тыс млн составить вырасти показатель
topic_4: доллар памятник выдавать сближение давность стол оплачивать
topic_5: солнце свидетель виза почва хороший испания пляж
topic_6: район эксперимент сотрудник статья задержать чиновник граница
topic_7: журнал фотография кровь прогнозировать характерный этаж нос
topic_8: проект квартира дональд строительство дыра место рейтинг
topic_9: фонд nn журнал статья писать рак первый
topic_10: nn москва область украина век улица московский
topic_11: мотив гарантия прибытие олимпийский шутка горка собирать
topic_12: россия сша российский ракета военный nn американский
topic_13: земля применение след ii пространство снг анализ
topic_14: россия газета российский nn опубликовать источник ru
topic_15: цена эксперт млн рынок технология ст

In [310]:
#text = news['title'].iloc[0]

def get_lda_vector(text):
    unseen_doc = common_dictionary.doc2bow(text)
    lda_tuple = lda[unseen_doc]
    not_null_topics = dict(zip([i[0] for i in lda_tuple], [i[1] for i in lda_tuple]))

    output_vector = []
    for i in range(25):
        if i not in not_null_topics:
            output_vector.append(0)
        else:
            output_vector.append(not_null_topics[i])
    return np.array(output_vector)

In [311]:
topic_matrix = pd.DataFrame([get_lda_vector(text) for text in news['title'].values])
topic_matrix.columns = ['topic_{}'.format(i) for i in range(25)]
topic_matrix['doc_id'] = news['doc_id'].values
topic_matrix = topic_matrix[['doc_id']+['topic_{}'.format(i) for i in range(25)]]
topic_matrix.head(5)

Unnamed: 0,doc_id,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,...,topic_15,topic_16,topic_17,topic_18,topic_19,topic_20,topic_21,topic_22,topic_23,topic_24
0,6,0.0,0.892146,0.0,0.0,0.0,0.060988,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4896,0.0,0.0,0.0,0.0,0.0,0.214279,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.713997,0.0,0.048562,0.0,0.0
2,4897,0.0,0.101796,0.0,0.0,0.0,0.344929,0.0,0.0,0.0,...,0.0,0.0,0.424339,0.0,0.0,0.0,0.0,0.0,0.0,0.068976
3,4898,0.0,0.091485,0.0,0.0,0.0,0.104541,0.033057,0.0,0.098348,...,0.0,0.0,0.476947,0.0,0.0,0.0,0.090958,0.0,0.095385,0.0
4,4899,0.0,0.790703,0.0,0.0,0.0,0.113873,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.032687,0.0,0.0,0.0,0.0,0.0


In [312]:
users.head(3)

Unnamed: 0,uid,articles
0,u105138,"[293672, 293328, 293001, 293622, 293126, 1852]"
1,u108690,"[3405, 1739, 2972, 1158, 1599, 322665]"
2,u108339,"[1845, 2009, 2356, 1424, 2939, 323389]"


In [313]:
doc_dict = dict(zip(topic_matrix['doc_id'].values, topic_matrix[['topic_{}'.format(i) for i in range(25)]].values))

In [314]:
doc_dict[293622]

array([0.        , 0.04917023, 0.11297118, 0.1299625 , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.31980696, 0.03262204, 0.        , 0.        , 0.08744357,
       0.        , 0.        , 0.        , 0.01782515, 0.        ,
       0.02290731, 0.21643203, 0.        , 0.        , 0.        ])

In [315]:
# Создаем DataFrame с результатами
results = pd.DataFrame()

In [316]:
N_topic = 25

In [317]:
# Список методов
methods = ['mean', 'median', 'max', 'idf']

In [318]:
# Список метрик
metrics = ['roc_auc', 'precision', 'recall', 'f_score']

In [319]:
target = pd.read_csv("./Lection2/users_churn.csv")
target.head(3)

Unnamed: 0,uid,churn
0,u107120,0
1,u102277,0
2,u102444,0


In [320]:
users['articles_str'] = users['articles'].apply(lambda x: x.replace('[','').replace(']', '').replace(',', ''))

users['articles_str'].iloc[0]

'293672 293328 293001 293622 293126 1852'

In [321]:
tfidf = TfidfVectorizer()
tfidf.fit(users['articles_str'])

TfidfVectorizer()

In [322]:
idf = pd.DataFrame({'article_id': tfidf.get_feature_names_out(),
                    'idf': tfidf.idf_})

In [323]:
def get_user_embedding(user_articles_list, method, doc_dict=None, idf=None):
    user_articles_list = eval(user_articles_list)
    user_vector = np.array([doc_dict[doc_id] for doc_id in user_articles_list])

    if method == 'mean':
        user_vector = np.mean(user_vector, 0)
    elif method == 'median':
        user_vector = np.median(user_vector, 0)
    elif method == 'max':
        user_vector = np.max(user_vector, 0)
    elif method == 'idf':
        user_vector = np.zeros((len(user_articles_list), 25))
        for i, doc_id in enumerate(user_articles_list):
            try:
                weight = idf[idf['article_id'] == str(doc_id)]['idf'].values[0]
            except Exception as e:
                weight = 0
            user_vector[i] = doc_dict[doc_id] * weight

        user_vector = np.median(user_vector, axis=0)
    else:
        raise ValueError('Invalid method:', method)

    return user_vector


In [324]:
def evaluate_method(method, doc_dict=None, idf=None):
    if method == 'idf':
        if doc_dict is None or idf is None:
            raise ValueError('doc_dict and idf arguments are required for the idf method')
        user_embeddings = pd.DataFrame([get_user_embedding(x, method, doc_dict=doc_dict, idf=idf) for x in users['articles']])
    else:
        user_embeddings = pd.DataFrame([get_user_embedding(x, method, doc_dict=doc_dict) for x in users['articles']])
    
    user_embeddings.columns = [f'topic_{i}' for i in range(N_topic)]
    user_embeddings['uid'] = users['uid'].values
    user_embeddings = user_embeddings[['uid']+[f'topic_{i}' for i in range(N_topic)]]
    
    X = pd.merge(user_embeddings, target, 'left')
    X_train, X_test, y_train, y_test = train_test_split(X[['topic_{}'.format(i) for i in range(N_topic)]], X['churn'], random_state=0)
    
    logreg = LogisticRegression()
    logreg.fit(X_train, y_train)
    preds = logreg.predict_proba(X_test)[:, 1]
    
    precision, recall, thresholds = precision_recall_curve(y_test, preds)
    fscore = (2 * precision * recall) / (precision + recall)
    ix = np.argmax(fscore)
    f_score = fscore[ix]
    precision = precision[ix]
    recall = recall[ix]
    best_threshold = thresholds[ix]
    roc_auc = roc_auc_score(y_test, preds)
    
    result = pd.DataFrame({
        'method': method,
        'roc_auc': roc_auc,
        'precision': precision,
        'recall': recall,
        'f_score': f_score,
        'threshold': best_threshold
    }, index=[0])
    
    return result


In [325]:
methods = ['mean', 'median', 'max', 'idf']
results = []
for method in methods:
    result = evaluate_method(method, doc_dict=doc_dict, idf=idf)
    results.append(result)

df_results = pd.concat(results, ignore_index=True)
#print(df_results)
styled_table = df_results.style.format({
    'roc_auc': '{:.4f}',
    'precision': '{:.4f}',
    'recall': '{:.4f}',
    'f_score': '{:.4f}',
    'best_threshold': '{:.4f}'
}).set_caption('Сводная таблица результатов').set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'black'),
        ('font-size', '16px')
    ]
}, {
    'selector': 'th',
    'props': [
        ('background-color', '#f2f2f2'),
        ('color', 'black')
    ]
}, {
    'selector': 'tr:nth-child(odd)',
    'props': [
        ('background-color', '#f9f9f9')
    ]
}, {
    'selector': 'tr:hover',
    'props': [
        ('background-color', '#f5f5f5')
    ]
}, {
    'selector': 'th:hover',
    'props': [
        ('background-color', 'white')
    ]
}])

display(styled_table)


Unnamed: 0,method,roc_auc,precision,recall,f_score,threshold
0,mean,0.9459,0.6579,0.7143,0.6849,0.25252
1,median,0.9684,0.7668,0.7918,0.7791,0.275834
2,max,0.9743,0.7289,0.8449,0.7826,0.321221
3,idf,0.987,0.8231,0.8735,0.8475,0.36054


## Выводы

Метод IDF учитывает важность каждой статьи в истории чтения пользователя, что, по-видимому, является хорошим индикатором. Медианный и максимальный также работают хорошо, но их показатели уступают IDF.