### Настя Скутина (БКЛ182)

In [13]:
from tqdm import tqdm

In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import TfidfVectorizer

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet, stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
nltk.download('stopwords')

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to C:\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Сначала работаем с материалом, как на семинаре: скачиваем, чистим от лишнего

In [2]:
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

In [3]:
data = df.content.values.tolist()

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

разбиваем на слова

In [4]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

In [5]:
data_words = list(sent_to_words(data))

собираем распространенные биграммы и триграммы

In [6]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

А вот тут начинается интересное. У меня почему-то не идет ни spacy, ни flair (выдает одну и ту же длинющую непонятную ошибку в консоли и ни один из способов в интернете не помогает, я даже питон переустановила, но увы). Поэтому я решила использовать nltk, но с ним все сложно. В ворднете есть лемматизатор, но он работает намного лучше, если определить часть речи для слова (без него он даже формы глагола to be не распознает). Но пос-теггер есть только в самом nltk, причем его теги и теги, принимаемые ворднетом отличаются (+ ворднет знает только 4 части речи). Поэтому ниже функция, которая берет слово, определяет его часть речи и возвращает аналогичный тег для ворднета (или если такого тега нет, то возвращает none). Эту функцию я где-то нашла, пока пыталась разобраться с тегами.

Следующая функция получает предложение, токенизирует его, для каждого слова определяет часть речи, лемматизирует его и добавляет в список кортеж из леммы и первой буквы части речи. Возвращает список кортежей.

In [7]:
# Lemmatize with POS Tag
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, 'none')


lemmatizer = WordNetLemmatizer()


def nlp(sentence):
    lemm_tag = []
    for w in nltk.word_tokenize(sentence):
        pos = get_wordnet_pos(w)
        if pos == 'none':
            lemm_tag.append((lemmatizer.lemmatize(w),
                             nltk.pos_tag([w])[0][1][0].upper()))
        else:
            lemm_tag.append((lemmatizer.lemmatize(w, pos),
                             nltk.pos_tag([w])[0][1][0].upper()))
    return lemm_tag

Дальше опять функции с семинара, которые убирают стоп-слова, находят биграммы, триграммы, лемматизируют тексты с помощью предыдущих функций и возвращают список лемм заданных частей речи

In [15]:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]


def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in tqdm(texts):
        doc = nlp(" ".join(sent))
        texts_out.append([token[0] for token in doc
                          if token[1] in allowed_postags])
    return texts_out

применяем эти функции:

In [10]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

In [11]:
# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

In [16]:
# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams,
                                allowed_postags=['N', 'J', 'V', 'R'])

100%|████████████████████████████████████████████████████████████████████████████| 11314/11314 [57:54<00:00,  3.26it/s]


Создаем корпус на основе лемматизировванных текстов:

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

Функция, которая высчитывает coherence_values для моделей с разным количеством топиков и возращает словарь, в котором ключ - это coherence_value, а значение - модель (использую gensim, потому что )

In [18]:
def compute_coherence_values(dictionary, corpus, texts,
                             limit, start=2, step=3):
    coherence_values = []
    model_list = []
    for num_topics in tqdm(range(start, limit, step)):
        model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=num_topics,
                                                random_state=100,
                                                update_every=1,
                                                chunksize=100,
                                                passes=10,
                                                alpha='auto',
                                                per_word_topics=True)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())
        coh_model = {}
        for i, value in enumerate(coherence_values):
            coh_model[value] = model_list[i]
    return coh_model

Применяем функцию

In [19]:
coh_model = compute_coherence_values(dictionary=id2word, 
                                     corpus=corpus, 
                                     texts=data_lemmatized, 
                                     start=7, 
                                     limit=30, 
                                     step=6)

100%|███████████████████████████████████████████████████████████████████████████████████| 4/4 [18:43<00:00, 280.82s/it]


Находим лучшую модель, то есть ту, у которой coherence_value больше, чем у других:

In [22]:
max_coh = max([coh for coh in coh_model])

In [24]:
best_model = coh_model[max_coh]

Создаем словарь, в котором ключи - индекс топика, а значение - список из кортежей вида (слово, его вес) для каждого слова, определяющего топик

In [25]:
topic_dic = {}
for index, topic in best_model.show_topics(formatted=False,
                                           num_words=10):
    topic_dic[index] = [(w[0], w[1]) for w in topic]

Дальше функция, которая определяет главный топик текста. Она получает на вход текст и словарь топиков, проходится по каждому слову и если оно совпадает со словом, определяющим топик, то в словарь с индексом этого топика к значению добавляется вес этого слова. Дальше проходим по словарю и смотрим значения. Топик, у которого наибольшее значение возвращается.

In [26]:
def get_text_topic(text, topic_dic):
    count = {0:0, 1:0, 2:0, 3:0, 4:0, 5:0, 6:0}
    for word in text:
        for key in topic_dic:
            for item in topic_dic[key]:
                if item[0] == word:
                    count[key] += item[1]
    maxim = 0
    topic = 0
    for key, value in count.items():
        if max([maxim, value]) == value:
            maxim = value
            topic = key
    return topic

Обрабатываем этой функцией все тексты и помещаем их словарь (как бы группируем по топикам)

In [27]:
texts_by_topics = {0:[], 1:[], 2:[], 3:[], 4:[], 5:[], 6:[]}
for text in texts:
    topic = get_text_topic(text, topic_dic)
    texts_by_topics[topic].append(text)

Посмотрим на соотношение текстов (надеюсь, такой явный перевес - это нормально)

In [28]:
for key in texts_by_topics:
    print(key, len(texts_by_topics[key]))

0 109
1 676
2 645
3 200
4 9141
5 483
6 60


Функция, которая получает номер топика, создает таблицу с Tfidf для каждого слова в топике, затем добаляет столбики с номером топика и 5 словами с максимальным Tfidf для каждого текста и возвращает получившуюся таблицу.  У нее есть maxlen, потому что в какой-то момент у меня оказалось, что я использую больше памяти, чем возможно, поэтому пришлось сократить количество текстов в таблице.

In [29]:
def df_for_topic(texts_by_topics, topic, maxlen=200):
    mem = min([maxlen, len(texts_by_topics[topic])])
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([' '.join(text)
                                        for text in texts_by_topics[topic][:mem]])
    feature_names = vectorizer.get_feature_names()
    dense = vectors.todense()
    denselist = dense.tolist()
    df = pd.DataFrame(denselist, columns=feature_names)
    
    top_words = []
    for i in range(min([maxlen, len(texts_by_topics[topic])])):
        s = pd.Series(df.loc[i])
        top_words.append(', '.join(s.sort_values(ascending=False).index[:5]))
    df['topic_index'] = topic
    df['highest_TfIdf_words'] = top_words
    return df

Ну и применяем эту функцию к каждому топику (надеюсь, так можно и не нужно обязательно создавать огромную таблицу для всех топиков сразу)

In [31]:
df_topic0 = df_for_topic(texts_by_topics, 0)
df_topic0

Unnamed: 0,aa,aao,aavso,ababs,abandon,abandonded,abbreviations_,abdi,abdo,abdomen,...,zone,zucker,zulu,zuma,zumabot,zumrut,zur,zwischen,topic_index,highest_TfIdf_words
0,0.000000,0.0,0.0,0.000000,0.018585,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"probe, titan, mission, launch, cassini"
1,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.017612,...,0.000000,0.0,0.0,0.018786,0.000000,0.0,0.0,0.0,0,"armenian, russian, latvia, ottoman, army"
2,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"none, kill, security_council, slaughter, brad_..."
3,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.024699,0.0,0.0,0.0,0,"greek, turkish, greece, turkish_minority, kill"
4,0.000000,0.0,0.0,0.036304,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"israel, civilian, attack, guerilla, word"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"israel, pisga, palestinian, interim, opt"
105,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"armenian, muslim_population, slaughter, press,..."
106,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"hiding, research, anti, israel, center"
107,0.024257,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0,"space, face, shuttle, launch, ozone"


In [32]:
df_topic1 = df_for_topic(texts_by_topics, 1)
df_topic1

Unnamed: 0,aaa,aacc,aaron,aaron_bryce,aaron_ray,aarp,ab,abandon,abberation,abc,...,zhdanov,zhiguli,zil,zion,zionist,zonker,zoo,zx,topic_index,highest_TfIdf_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"handiwork, intricate, environment, tapestry, h..."
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"encryption, clipper, clause, class, weak"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"sabbath, gentile, tc, ceremonial, applicable"
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"religion, war, chris_blask, bobby, horror"
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"mail, tool, christianity, sherlette, willing"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"legislate, moral, law, pro, morality"
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"overhead, fee, nasa, dennis, cost"
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"armenian, baku, parent, sumgait, karabagh"
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,"butcher, side, muslim, want, victim"


In [33]:
df_topic2 = df_for_topic(texts_by_topics, 2)
df_topic2

Unnamed: 0,aa,aaai,aab,aachen,aanp,aarnet,aaron,aaronson,ab,abad,...,zwn,zx,zxrm,zy__i,zyz,zzh_,zzt,ªl,topic_index,highest_TfIdf_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"board, autodoubler, diskdoubler, stac, sigma_d..."
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.08727,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"tiff, image, complexity, inability, application"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"ncd, terminal, tcp_ip, boot, marie"
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"font, alavi, window, do, small"
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"armenian, serdar_argic, army, russian, command"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"uudecode, cut, begin, delete, mark"
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"texas, flex, ghost, ati_ultra, desktop"
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"network, desqview, slowdown, run, quarterdeck"
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,"battery, remember, construct_home, electrolyte..."


In [34]:
df_topic3 = df_for_topic(texts_by_topics, 3)
df_topic3

Unnamed: 0,a_,aa,ab,abc,aberdeen,abide,ability,able,abnormal,abound,...,zeppo,zero,zeus_bei,zfl,zimmermann,zip,zone,zurich_ch,topic_index,highest_TfIdf_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.054627,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"phone, register, key, swap, isnt"
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"key, bit, dane, large, anywhere"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.093632,3,"private, bb, germano_caronni, key, reveal"
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"yo, jumper, western_digital, michael_gerhards,..."
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"motherboard, ncd, davis, brad, case"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"cpsr, nsa, nist, public, development"
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"grappler_ls, deskjet, print, pnt, swa"
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,3,"serial_number, lie, channel, key, spoof_chip"
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.004608,0.010169,0.0,...,0.0,0.008508,0.0,0.0,0.0,0.009434,0.034032,0.000000,3,"jumper, drive, slave, master, st"


In [36]:
df_topic4 = df_for_topic(texts_by_topics, 4)
df_topic4

Unnamed: 0,aarghhhh,aario,aaron_bryce,aaron_lung,ab,abad,abates,abbott,abilene,ability,...,zimrings,zinc,zion,zod,zone,zoom,zwart,zyklon,topic_index,highest_TfIdf_words
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"car, door, anyone, front_bumper, funky"
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"clock, si, poll, guy_kuo, upgrade"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"powerbook, display, anybody, willis, machine"
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"harris, weitek, green, division, joe"
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"error, warn, bug, jonathan, tom"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"woof, piano, shapiro, david, back"
196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"printer, print, utah, okidata, driver"
197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.15313,0.0,4,"class, catholic_church, poland, religious, rel..."
198,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,4,"pixel, disappear, powerbook, screen, msu"


In [37]:
df_topic5 = df_for_topic(texts_by_topics, 5)
df_topic5

Unnamed: 0,aa,aaa,aargh,aaron,ab,ab_hr,abate,abates,abberation,abbot,...,zhitnik,zhivov,zlin,zmolek,zombie,zombo,zone,zupcic,topic_index,highest_TfIdf_words
0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"option, module, flight, capability, power"
1,0.0,0.0,0.0,0.0,0.018475,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"car, insurance, rate, year, turbo"
2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"bullpen, austin, yesterday, manzanillo, fetter"
3,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"octopus, plymouth, harold_zazula, detroit, ice"
4,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"roussel, flyer, shutout, picture, low_gaa"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"uk, lancaster, genesis, hz, megadrive"
196,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"boy, sale, silent_assassin, public_access, cas..."
197,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"pierre, quebec, remain, boston, breton"
198,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,"north_star, star, franchise, sad, fine"


In [38]:
df_topic6 = df_for_topic(texts_by_topics, 6)
df_topic6

Unnamed: 0,a_,a_h,a_lv,a_r,a_sl,a_t,a_u,aa,aa_vj_,aaef,...,zzng,zzngvz,zznki,zznkj,zznkjrlb,zzpgvz,zzy_,zzzoh,topic_index,highest_TfIdf_words
0,0.000645,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"ax, max, tm, di_di, pl"
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"shuttle, roll, maneuver, attitude, mission"
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"religion, motto, tool, schneider, keith"
3,0.062923,0.015878,0.015878,0.0,0.0,0.015878,0.01442,0.053546,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.015878,0.0,6,"mp, mj, mv, mu, bt"
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009031,0.009031,0.0,0.0,0.0,0.0,0.0,0.0,6,"tl, pl, chz, pu, hz"
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"ax, max, bxn_bxn, giz_giz, au"
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.006616,0.0,0.006616,0.006616,0.0,0.0,6,"hz, lk, ck, zd, chz"
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"atf, andrew, diederich, leader, heard"
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"possesion, offense, congress, capital, illegal"
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,"watson, vertex, plane, dave, circumcenter"
