# Домашняя работа

In [1]:
import logging
import os
import re
import warnings
from collections import Counter
from pprint import pprint

import gensim
import gensim.corpora as corpora
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyLDAvis
import pyLDAvis.gensim
import spacy
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

  regargs, varargs, varkwargs, defaults, formatvalue=lambda value: ""
  from collections import Sequence, defaultdict


Загружаем датасет

In [2]:
# Configure the root logger: timestamped records, ERROR level.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)
# Ignore DeprecationWarning messages.
warnings.filterwarnings("ignore",category=DeprecationWarning)

# NLTK English stop words, extended with a few extra tokens.
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
# Load the newsgroups dataset from a remote JSON file (requires network access).
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
df.head()

Unnamed: 0,content,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [3]:
# Raw post bodies as a plain list of strings.
data = df.content.values.tolist()
# The regex patterns are raw strings: '\S' and '\s' are not valid Python string
# escapes, so ordinary literals trigger DeprecationWarning (SyntaxWarning on 3.12+).
# 1) drop every token that contains '@' (e-mail addresses) plus one trailing whitespace
data = [re.sub(r'\S*@\S*\s?', '', sent) for sent in data]
# 2) collapse every run of whitespace (including newlines) into a single space
data = [re.sub(r'\s+', ' ', sent) for sent in data]
# 3) remove single quotes
data = [sent.replace("'", "") for sent in data]

def sent_to_words(sentences):
    """Lazily tokenize documents, yielding one list of tokens per input item.

    Every item is converted with ``str()`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True`` (per the gensim
    documentation this flag strips accent marks from the tokens).
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Materialize the tokenized corpus: one list of tokens per document.
data_words = list(sent_to_words(data))
# Phrase models: `bigram` is built from the token lists, `trigram` from the
# bigram-transformed token lists.
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)  
# Phraser objects built from the models above; make_bigrams / make_trigrams index into these.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def remove_stopwords(texts):
    """Re-tokenize every document and drop the tokens listed in ``stop_words``.

    Args:
        texts: iterable of documents; each one is converted with ``str()`` and
            tokenized again with ``simple_preprocess``.

    Returns:
        A list with one list of remaining tokens per document.
    """
    # Build the lookup set once per call: `stop_words` is a module-level list,
    # so testing membership against it directly is a linear scan per token.
    stop_set = set(stop_words)
    return [
        [word for word in simple_preprocess(str(doc)) if word not in stop_set]
        for doc in texts
    ]

def make_bigrams(texts):
    """Apply the module-level ``bigram_mod`` to every document in ``texts``.

    Returns a list holding ``bigram_mod[doc]`` for each input document, in order.
    """
    merged = []
    for tokens in texts:
        merged.append(bigram_mod[tokens])
    return merged

def make_trigrams(texts):
    """Apply ``bigram_mod`` and then ``trigram_mod`` to every document in ``texts``.

    Returns a list holding ``trigram_mod[bigram_mod[doc]]`` for each document, in order.
    """
    merged = []
    for tokens in texts:
        with_bigrams = bigram_mod[tokens]
        merged.append(trigram_mod[with_bigrams])
    return merged

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatize token lists, keeping only tokens whose POS tag is allowed.

    Each document's tokens are joined with single spaces and passed to the
    module-level ``nlp`` callable; the result keeps the ``lemma_`` of every
    token whose ``pos_`` is in ``allowed_postags``.

    Returns a list with one list of lemmas per input document.
    Tag reference: https://spacy.io/api/annotation
    """
    def kept_lemmas(tokens):
        parsed = nlp(" ".join(tokens))
        return [tok.lemma_ for tok in parsed if tok.pos_ in allowed_postags]

    return [kept_lemmas(tokens) for tokens in texts]


# Preprocessing pipeline: drop stop words -> merge bigrams -> lemmatize.
data_words_nostops = remove_stopwords(data_words)
data_words_bigrams = make_bigrams(data_words_nostops)
# Load the English model by its package name. The 'en' shortcut link exists
# only in spaCy 2.x and was removed in spaCy 3; the package name works in both.
# NOTE(review): assumes 'en' pointed at en_core_web_sm (the default download) - confirm.
# The parser and NER components are disabled.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

# Dictionary built from the lemmatized documents, and one doc2bow result per document.
id2word = corpora.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

Обучаем LDA-модель с помощью обёртки gensim над MALLET и считаем её coherence score.

In [4]:
# Location of the MALLET launcher. Set the MALLET_PATH environment variable to
# point at your own installation; the fallback is the path this notebook was
# originally run with, which only exists on the author's machine.
mallet_path = os.environ.get(
    'MALLET_PATH',
    '/home/marynepo/anaconda3/envs/py37/lib/python3.7/site-packages/mallet-2.0.8/bin/mallet',
)
# Baseline model with 20 topics.
ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=20, id2word=id2word)

pprint(ldamallet.show_topics(formatted=False))
# c_v coherence of the baseline model, computed on the lemmatized documents.
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\nCoherence Score: ', coherence_ldamallet)

[(19,
  [('window', 0.03419808653563432),
   ('run', 0.019112025874742724),
   ('set', 0.019066790310542148),
   ('application', 0.018116843462330084),
   ('problem', 0.018026372333928935),
   ('server', 0.0156062696491982),
   ('call', 0.013683758170673783),
   ('work', 0.013525433695971773),
   ('find', 0.011037477664940176),
   ('display', 0.009793499649424378)]),
 (11,
  [('line', 0.05276666206706223),
   ('buy', 0.03151648958189596),
   ('price', 0.026355733406927005),
   ('good', 0.02599696426107355),
   ('sell', 0.02467227818407617),
   ('mail', 0.02022906030081413),
   ('sale', 0.019263143369670208),
   ('interested', 0.016889747481716573),
   ('call', 0.013274458396577895),
   ('cost', 0.012832896370912101)]),
 (7,
  [('game', 0.03315688562854285),
   ('year', 0.028926308769589862),
   ('team', 0.028363621207069024),
   ('play', 0.025237579193064356),
   ('player', 0.019610703567855953),
   ('win', 0.0175891963987996),
   ('good', 0.016818106035345115),
   ('season', 0.0120665

Функция, находящая оптимальное количество топиков. Можно задать интервал, на котором выбирается лучшее число топиков ([a, b] с шагом k). Я взяла небольшой интервал, потому что перебор работает очень долго, но в принципе можно изменить шаг, расширить интервал и подобрать число топиков точнее.

In [15]:
def best_top_num(mallet_path, a, b, k):
    """Pick the topic count in ``range(a, b + 1, k)`` with the highest c_v coherence.

    One LdaMallet model is trained per candidate topic count, using the
    module-level ``corpus``, ``id2word`` and ``data_lemmatized``.

    Args:
        mallet_path: path passed to ``gensim.models.wrappers.LdaMallet``.
        a: first candidate number of topics.
        b: last candidate number of topics (inclusive).
        k: step between candidates.

    Returns:
        ``(b_num, b_ch, b_model)``: the best topic count, its coherence score
        and the corresponding trained model. On ties the earliest candidate wins.

    Raises:
        ValueError: if the candidate range is empty, or if no candidate
            produced a comparable (non-NaN) coherence score.
    """
    candidates = range(a, b + 1, k)
    if len(candidates) == 0:
        raise ValueError(
            'empty topic range: a=%r, b=%r, k=%r yields no candidates' % (a, b, k)
        )

    # Start from -inf (not 0) so that a model is selected even when every
    # coherence score is zero or negative; previously b_model stayed unbound
    # in that case and the return statement raised UnboundLocalError.
    b_num = None
    b_ch = float('-inf')
    b_model = None
    for i in tqdm(candidates):
        ldamallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=corpus, num_topics=i, id2word=id2word)
        coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
        coherence_ldamallet = coherence_model_ldamallet.get_coherence()
        # Strict '>' keeps the earliest candidate on ties and skips NaN scores.
        if coherence_ldamallet > b_ch:
            b_num = i
            b_ch = coherence_ldamallet
            b_model = ldamallet

    if b_model is None:
        raise ValueError('no candidate produced a comparable (non-NaN) coherence score')
    return b_num, b_ch, b_model

In [16]:
# Try 5, 10, ..., 30 topics (upper bound inclusive, step 5) and keep the best-scoring model.
b_n, b_ch, b_model = best_top_num(mallet_path, 5, 30, 5)
print(b_n, b_ch)



  0%|          | 0/6 [00:00<?, ?it/s][A[A

 17%|█▋        | 1/6 [01:11<05:57, 71.48s/it][A[A

 33%|███▎      | 2/6 [02:28<04:52, 73.03s/it][A[A

 50%|█████     | 3/6 [03:48<03:45, 75.30s/it][A[A

 67%|██████▋   | 4/6 [05:18<02:39, 79.73s/it][A[A

 83%|████████▎ | 5/6 [07:11<01:29, 89.64s/it][A[A

100%|██████████| 6/6 [09:12<00:00, 92.07s/it][A[A

20 0.5437548459723776





Создадим список со словами и их весами для каждой темы, а потом определим главную тему у каждого текста

In [17]:
def get_top_words(mdl):
    """Extract the (word, weight) pairs of every topic of ``mdl``.

    Parses the formatted strings returned by ``mdl.show_topics(-1)``, which
    look like ``'0.034*"window" + 0.019*"run"'``.

    Args:
        mdl: a model exposing ``show_topics``.

    Returns:
        A list with one entry per topic; each entry is a list of
        ``(word, weight)`` tuples where ``weight`` is the decimal string
        exactly as it appears in the formatted topic.
    """
    # A single raw-string pattern captures each weight together with its word,
    # so the two can never get out of step. The previous weight pattern had an
    # unescaped '.', so a weight such as '1.000' was read as '000'.
    term_re = re.compile(r'(\d+\.\d+)\*"(.+?)"')
    tp_wrds = []
    for topic in mdl.show_topics(-1):
        pairs = term_re.findall(topic[1])
        tp_wrds.append([(word, weight) for weight, word in pairs])
    return tp_wrds

def main_topic(texts, tp_wrds):
    """Assign every document the index of its highest-scoring topic.

    A topic's score for a document is the sum over the topic's words of
    ``weight * (number of occurrences of the word in the document)``.

    Args:
        texts: iterable of documents, each a list of tokens.
        tp_wrds: one list of ``(word, weight)`` pairs per topic; ``weight`` is
            anything ``float()`` accepts.

    Returns:
        A list with one topic index per document. When several topics share the
        top score (including the all-zero case) the lowest index is returned.

    Raises:
        ValueError: if there are documents to score but ``tp_wrds`` is empty.
    """
    # Convert the weights once instead of once per document.
    weighted_topics = [
        [(word, float(weight)) for word, weight in topic] for topic in tp_wrds
    ]

    m_ts = []
    for doc in texts:
        if not weighted_topics:
            raise ValueError('tp_wrds must contain at least one topic')
        # Count every token once per document; calling doc.count() for each
        # topic word rescanned the whole document every time.
        # Strings keep the original substring-count behaviour.
        counts = None if isinstance(doc, str) else Counter(doc)
        tps = []
        for topic in weighted_topics:
            score = 0
            for word, weight in topic:
                occurrences = doc.count(word) if counts is None else counts[word]
                score += weight * occurrences
            tps.append(score)
        # index(max(...)) returns the first topic reaching the top score.
        m_ts.append(tps.index(max(tps)))

    return m_ts

In [18]:
# Word/weight pairs for each topic of the selected model, then the dominant topic index per document.
top_words = get_top_words(b_model)
topics = main_topic(texts, top_words)

Создадим словарь, в котором ключ - номер темы, а значение - список текстов этой темы. Потом найдем топ-5 слов для каждого документа по tf-idf и занесем их в датафрейм. 

In [19]:
def get_texts(texts, topics):
    """Group documents by topic.

    ``texts[i]`` (a sequence of tokens) is joined with single spaces and filed
    under the key ``topics[i]``.

    Returns a dict mapping each topic to the list of its joined documents,
    in their original order.
    """
    grouped = {}
    for position, tokens in enumerate(texts):
        grouped.setdefault(topics[position], []).append(' '.join(tokens))
    return grouped

# Only the first 5000 documents (and their topic labels) are grouped here.
# NOTE(review): presumably a cap to keep the TF-IDF step below manageable - confirm.
top_t = get_texts(texts[:5000], topics[:5000])

In [20]:
# Column-wise accumulator filled by get_fill_data(); each key holds one list.
n_data = {'Word': [], 'TF-IDF': [], 'Text': [], 'Topic': []}

def get_fill_data(top_t):
    """Append the top-5 TF-IDF words of every document to the global ``n_data``.

    For each topic a TF-IDF matrix is fitted on that topic's documents only;
    for every document the five highest-scoring vocabulary words are recorded
    (ties keep vocabulary order).

    Args:
        top_t: dict mapping a topic to the list of its documents (strings).

    Returns:
        None. Rows are appended to the module-level ``n_data`` lists, so
        calling this twice without resetting ``n_data`` duplicates the rows.
    """
    vectorizer = TfidfVectorizer()
    for i in top_t:
        vectors = vectorizer.fit_transform(top_t[i])
        # get_feature_names() was removed in scikit-learn 1.2; prefer its
        # replacement and fall back for older releases.
        if hasattr(vectorizer, 'get_feature_names_out'):
            feature_names = vectorizer.get_feature_names_out().tolist()
        else:
            feature_names = vectorizer.get_feature_names()
        for j in range(vectors.shape[0]):
            # Densify one row at a time instead of the whole matrix at once.
            row = vectors[j].toarray().ravel().tolist()
            top_ws = sorted(zip(feature_names, row), key=lambda u: u[1], reverse=True)[:5]
            for word, score in top_ws:
                n_data['Topic'].append(i)
                n_data['Word'].append(word)
                n_data['TF-IDF'].append(score)
                n_data['Text'].append(top_t[i][j])

In [21]:
# Fills the global n_data in place; re-running this cell appends the same rows again.
get_fill_data(top_t)

In [23]:
# One row per (document, top word): columns Word, TF-IDF, Text, Topic.
n_df = pd.DataFrame(n_data)
n_df.head(10)

Unnamed: 0,Word,TF-IDF,Text,Topic
0,car,0.529976,where thing car nntp_poste host park line wond...,0
1,door,0.283337,where thing car nntp_poste host park line wond...,0
2,bricklin,0.194586,where thing car nntp_poste host park line wond...,0
3,lerxst,0.194586,where thing car nntp_poste host park line wond...,0
4,neighborhood,0.194586,where thing car nntp_poste host park line wond...,0
5,poll,0.407783,poll final call summary final call clock repor...,0
6,clock,0.364615,poll final call summary final call clock repor...,0
7,upgrade,0.236677,poll final call summary final call clock repor...,0
8,final,0.207898,poll final call summary final call clock repor...,0
9,speed,0.193089,poll final call summary final call clock repor...,0


Coherence score показывает, насколько часто слова из одного топика встречаются рядом (попарно) и насколько это неслучайно: для каждой пары слов из топика оценивается, как сильно изменяется вероятность встретить слово 1 при условии, что встретилось слово 2. Потом, насколько я поняла, все эти оценки усредняются.