In [1]:
# Create a function to pull out nouns from a string of text
from nltk import word_tokenize, pos_tag
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

def nouns(text):
    '''Given a string of text, tokenize the text and pull out only the nouns.'''
    is_noun = lambda pos: pos[:2] == 'NN'
    tokenized = word_tokenize(text)
    all_nouns = [word for (word, pos) in pos_tag(tokenized) if is_noun(pos)] 
    return ' '.join(all_nouns)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tobiaskarentiuskromanndahl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/tobiaskarentiuskromanndahl/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Apply the nouns function to the transcripts to filter only on nouns
'''data_clean = pd.read_pickle('pickle/data_nouns.pkl')
data_nouns = pd.DataFrame(data_clean.content.apply(nouns))
data_nouns.to_pickle('pickle/data_nouns.pkl')'''

In [2]:
import pandas as pd
import pickle

data_nouns = pd.read_pickle('pickle/data_nouns.pkl')
data_df = pd.read_pickle('pickle/data_df.pkl')

data_nouns = data_nouns.join(data_df.date)

data_nouns["date"] = pd.to_datetime(data_nouns["date"]).dt.strftime("%W")
#data_nouns = data_nouns.groupby('date', as_index=False, sort=False).agg({'content': ' '.join})
data_nouns.head()

Unnamed: 0,content,date
0,cancer i cancer treatment today life i disbeli...,0
1,something link page commission work people gin...,0
2,cluster pneumonia city member family world hea...,1
3,airport mystery illness china credit update st...,1
4,finding outbreak pneumonia china people family...,1


In [13]:
# Import the necessary modules for LDA with gensim
from gensim import matutils, models
import scipy.sparse

In [21]:
ldas = dict()

for week in data_nouns.date.unique():

    # CountVectorizer will create a matrix of how many times each word has been mentioned.
    # The stop_words parameter will remove uninformative words such as 'him','her','the'.
    cv = CountVectorizer(stop_words='english')
    data_cv = cv.fit_transform(data_nouns[data_nouns.date == week].content)

    # Creating a new dataframe where columns is the newly found feature names.
    data_dtm = pd.DataFrame(data_cv.toarray(), columns=cv.get_feature_names())
    data_dtm.index = data_nouns[data_nouns.date == week].index
    
    # One of the required inputs is a term-document matrix
    tdm = data_dtm.transpose()
    
    # We're going to put the term-document matrix into a new gensim format, from df --> sparse matrix --> gensim corpus
    sparse_counts = scipy.sparse.csr_matrix(tdm)
    corpus = matutils.Sparse2Corpus(sparse_counts)

    id2word = dict((v, k) for k, v in cv.vocabulary_.items())

    # LDA for num_topics
    ldas[week] = models.LdaModel(corpus=corpus, id2word=id2word, num_topics=5, passes=10)
    print('\nWeek ', week)


Week  00

Week  01

Week  02

Week  03

Week  04

Week  05

Week  06

Week  07

Week  08

Week  09

Week  10

Week  11

Week  12

Week  13

Week  14

Week  15

Week  16

Week  17

Week  18


In [46]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

for week, lda in ldas.items():
    print(f'\nWeek: {week}\n')
    print(lda.print_topics())


Week: 00

[(0, '0.139*"ginger" + 0.035*"health" + 0.028*"research" + 0.028*"nausea" + 0.024*"effect" + 0.016*"body" + 0.016*"people" + 0.016*"inflammation" + 0.016*"medicine" + 0.016*"pain"'), (1, '0.005*"ginger" + 0.005*"health" + 0.005*"research" + 0.005*"nausea" + 0.005*"effect" + 0.004*"cancer" + 0.004*"supplement" + 0.004*"medicine" + 0.004*"body" + 0.004*"inflammation"'), (2, '0.006*"ginger" + 0.005*"health" + 0.005*"cancer" + 0.005*"nausea" + 0.005*"research" + 0.005*"effect" + 0.005*"time" + 0.005*"treatment" + 0.005*"supplement" + 0.005*"extract"'), (3, '0.005*"cancer" + 0.005*"time" + 0.005*"story" + 0.005*"ginger" + 0.004*"round" + 0.004*"treatment" + 0.004*"diagnosis" + 0.004*"day" + 0.004*"doctor" + 0.004*"life"'), (4, '0.059*"cancer" + 0.028*"time" + 0.020*"day" + 0.020*"life" + 0.016*"treatment" + 0.016*"recovery" + 0.016*"doctor" + 0.016*"diagnosis" + 0.016*"round" + 0.016*"story"')]

Week: 01

[(0, '0.003*"anxiety" + 0.003*"body" + 0.002*"people" + 0.002*"person" + 0.

In [None]:
# Let's take a look at which topics each transcript contains
corpus_transformed = lda[corpus]
list(zip([a for [(a,b)] in corpus_transformed], data_dtm.index))