In [1]:
import spacy
spacy.load('en')
from spacy.lang.en import English
parser = English()
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [2]:
import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma
    
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in en_stop]
    tokens = [get_lemma2(token) for token in tokens]
    return tokens

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rafayet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rafayet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import random
import pandas as pd

text_data = []
file  = pd.read_csv('all_w_sent.csv',encoding = "ISO-8859-1")
file_names = file['Filename'].tolist()
file_names_set = set(file_names)
speaker = file['Speaker'].tolist()
text_ = file['Text'].tolist()

In [4]:
for t in text_:
	tokens = prepare_text_for_lda(t)
	text_data.append(tokens)

In [5]:
from gensim import corpora
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]
print (corpus)
import pickle
pickle.dump(corpus, open('HDP_LDA_corp.pkl', 'wb'))
dictionary.save('HDP_LDA_dict.gensim')



[[(0, 1)], [(1, 1)], [(2, 1), (3, 1), (4, 1), (5, 1)], [], [(6, 1)], [(7, 1), (8, 1)], [], [], [], [], [], [(7, 1), (9, 1), (10, 1)], [], [(11, 1), (12, 1), (13, 1)], [], [], [], [(7, 1), (14, 1), (15, 1), (16, 1)], [], [(5, 1), (17, 1), (18, 1), (19, 1), (20, 1)], [], [(21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 1), (28, 1), (29, 1), (30, 1)], [(26, 1)], [(28, 1)], [(28, 1), (31, 1), (32, 1)], [(33, 1), (34, 1), (35, 1)], [(35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1)], [(44, 1), (45, 1), (46, 1)], [(47, 1)], [(48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1)], [], [(55, 1)], [(56, 1), (57, 1)], [(5, 1), (58, 1)], [], [(59, 1), (60, 1), (61, 1), (62, 1), (63, 1)], [(64, 1), (65, 1)], [], [(56, 1), (66, 1)], [(66, 1)], [], [(67, 1)], [(68, 1)], [(55, 1)], [], [(69, 2), (70, 1)], [(71, 1), (72, 1)], [(48, 1), (54, 1), (73, 1)], [], [], [(74, 1)], [(16, 1), (22, 1), (70, 1), (75, 1), (76, 1)], [], [(77, 1), (78, 1), (79, 1), (8

In [6]:
import gensim
hdp = gensim.models.HdpModel(corpus, id2word=dictionary)
hdp.save('HDPmodel.gensim')

In [15]:
topic_info = hdp.print_topics(num_topics=5, num_words=20)
hdp.print_topic(0, topn=20)
for topic in topic_info:
    print(topic)


(0, '0.015*think + 0.013*would + 0.013*thing + 0.012*right + 0.010*cancer + 0.008*going + 0.007*really + 0.007*treatment + 0.007*little + 0.007*something + 0.006*could + 0.006*blood + 0.005*better + 0.005*people + 0.005*month + 0.005*chemo + 0.005*actually + 0.005*still + 0.004*probably + 0.004*chemotherapy')
(1, '0.007*right + 0.005*think + 0.004*would + 0.004*thing + 0.003*cancer + 0.003*going + 0.003*little + 0.002*blood + 0.002*treatment + 0.002*really + 0.002*something + 0.002*could + 0.002*month + 0.002*effect + 0.002*always + 0.002*still + 0.002*chemo + 0.002*better + 0.002*probably + 0.002*actually')
(2, '0.006*right + 0.004*think + 0.003*thing + 0.003*would + 0.002*little + 0.002*cancer + 0.002*really + 0.002*could + 0.002*still + 0.002*going + 0.002*blood + 0.002*month + 0.002*getting + 0.001*something + 0.001*better + 0.001*treatment + 0.001*actually + 0.001*anything + 0.001*week + 0.001*probably')
(3, '0.006*right + 0.003*think + 0.003*would + 0.003*thing + 0.002*going + 0.

In [30]:
hdp.show_topic(0)
hdp.show_topic(1)
hdp.show_topic(2)

[('right', 0.006347227302475449),
 ('think', 0.004337795752249051),
 ('thing', 0.003488753461537189),
 ('would', 0.0032707666506881367),
 ('little', 0.0022621806340109656),
 ('cancer', 0.0021829555712580776),
 ('really', 0.002003038189078698),
 ('could', 0.001987088539588128),
 ('still', 0.0019629676070806305),
 ('going', 0.001824542221899828),
 ('blood', 0.0017387901100683059),
 ('month', 0.0016043275574652933),
 ('getting', 0.0015084439644560816),
 ('something', 0.0014411197928683877),
 ('better', 0.0014403354942520267),
 ('treatment', 0.001286392572137799),
 ('actually', 0.001164177626425272),
 ('anything', 0.001127808566243902),
 ('week', 0.001063539772059664),
 ('probably', 0.0010404121057507897)]