In [49]:
import ast

import gensim
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import spacy
from gensim import corpora, models
from gensim.utils import effective_n_jobs
from nltk.util import ngrams

# Load the English spaCy pipeline used by the tokenizing helpers below.
# The "en" name is tried first; if spaCy raises OSError for it, fall back to
# the explicitly named "en_core_web_sm" package.
# NOTE(review): "en" is presumably the legacy shortcut name -- confirm which
# branch runs with the installed spaCy version.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [52]:
print(gensim.__version__)

3.8.3


In [2]:
# Load the article table; the first CSV column becomes the DataFrame index.
env = pd.read_csv('../Data/Environmental Discourse/env.csv', index_col=0)

In [3]:
# Parse the raw date column into pandas datetimes so the `.dt` accessor works.
env['date'] = pd.to_datetime(env['date'])

In [4]:
# Calendar year of each article, used below to group and filter by year.
env['year'] = env['date'].dt.year

In [5]:
# Draw 100 rows from every year; the fixed random_state makes the draw repeatable.
small = env.groupby('year').sample(100, random_state=3291995)

In [6]:
small.head()

Unnamed: 0,source,url,title,date,author,text,year
65099,Grist,https://grist.org/article/im-lovin-it/,I’m lovin’ it!,2005-02-23,David Roberts,"Okay, sorry I put that song in your head. Thi...",2005
73994,Grist,https://grist.org/article/the-atlantic-ocean-i...,The Atlantic Ocean is going to kill you,2005-12-01,David Roberts,"Speaking of Oil Drum, they remind me to point ...",2005
60665,Grist,https://grist.org/article/girl-you-trippin/,"Girl, You Trippin’!",2005-03-29,Grist staff,"Cameron, Arnold lead brigade of celeb eco-spok...",2005
39378,EMagazine,https://emagazine.com/bombing-bali-hai/,Bombing Bali Ha’i,2005-03-16,From the Editors of E Magazine,"We Love Tropical Islands, But That Doesn’t Pre...",2005
56527,Grist,https://grist.org/article/going-it-alone/,Vashon Island goes energy independent,2005-08-25,Andy Brett,"From the Seattle PI: Vashon Island, located ju...",2005


In [7]:
small.shape

(1700, 7)

In [8]:
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Split text into spaCy tokens, dropping punctuation and blank tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its element; any
        list that remains is joined into a single space-separated string after
        passing each element through ``str``.
    model : spaCy pipeline, optional
        Pipeline used for tokenization; defaults to the module-level ``nlp``.
    MAX_LEN : int, optional
        Value assigned to ``model.max_length`` before processing. The
        assignment stays on the pipeline object after the call returns.

    Returns
    -------
    list of str
        ``token.text`` for every token that is not punctuation and is not
        empty once surrounding whitespace is stripped.
    """
    text = word_list
    if type(text) == list and len(text) == 1:
        text = text[0]
    if type(text) == list:
        text = ' '.join(map(str, text))

    # Only tokenization is needed here, so the heavier pipeline components are
    # switched off and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    doc = model(text, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [tok.text for tok in doc if not tok.is_punct and tok.text.strip()]

In [9]:
def normalizeTokens(word_list, extra_stop=(), model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case text and return its content tokens (optionally lemmatized).

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A one-element list is unwrapped to its element; any
        list that remains is joined into a single space-separated string after
        passing each element through ``str``.
    extra_stop : iterable of str, optional
        Additional words to treat as stop words. They are flagged on
        ``model.vocab`` and the flag persists after the call. The text is
        lower-cased before processing, so supply these in lower case.
    model : spaCy pipeline, optional
        Pipeline used for processing; defaults to the module-level ``nlp``.
    lemma : bool, optional
        If True (default) return ``token.lemma_``; otherwise return the
        stripped token text.
    MAX_LEN : int, optional
        Value assigned to ``model.max_length`` before processing. The
        assignment stays on the pipeline object after the call returns.

    Returns
    -------
    list of str
        Tokens that are not newlines, stop words, punctuation, number-like,
        or blank.
    """
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]
    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # Only normalization is needed, so RAM-intensive components are disabled
    # and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "tagger", "ner"])

    # Flag the extra stop words on the vocabulary of the pipeline that produced
    # `doc`. Using the global `nlp.vocab` here would silently ignore
    # `extra_stop` whenever a different `model` is passed in.
    for stopword in extra_stop:
        model.vocab[stopword].is_stop = True

    normalized = []
    for w in doc:
        # Skip newlines, stop words, punctuation, numbers and blank tokens.
        if (w.text == '\n' or w.is_stop or w.is_punct or w.like_num
                or len(w.text.strip()) == 0):
            continue
        normalized.append(str(w.lemma_) if lemma else str(w.text.strip()))

    return normalized

In [10]:
import dask.dataframe as dd
from dask.multiprocessing import get

In [41]:
# Figure out how to parallelize the different topic models
# Thinking 4, 6, 8, and 10 topics in each of 2007, 2013, and 2019
# For now, I need to write the code that replaces each known bigram (two adjacent tokens) with a single underscore-joined token, i.e. "word_a word_b" -> "word_a_word_b".

In [18]:
# Hand-picked four-word phrases, one tuple of words per phrase. They are
# underscore-joined in a later cell and looked up by ngram_tagger.
quadgrams = [('intergovernmental', 'panel', 'climate', 'change'),
             ('natural', 'resources', 'defense', 'council'),
             ('coal', 'fired', 'power', 'plants'),
             ('national', 'oceanic', 'atmospheric', 'administration')]

In [19]:
# Read the trigram table. The first CSV column is parsed from its Python
# literal form with ast.literal_eval; only rows whose tag equals 1 are kept.
tr = pd.read_csv('../Data/Environmental Discourse/trigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
tr.columns = ['trigram', 'freq', 'tag']
trigrams = tr.loc[tr['tag'] == 1, 'trigram'].tolist()

In [20]:
# Read the bigram table. The first CSV column is parsed from its Python
# literal form with ast.literal_eval; only rows whose tag equals 1 are kept.
b = pd.read_csv('../Data/Environmental Discourse/bigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
b.columns = ['bigram', 'freq', 'tag']
bigrams = b.loc[b['tag'] == 1, 'bigram'].tolist()

In [21]:
# Collapse each n-gram (a sequence of words) into one underscore-joined string.
# Entries that are already strings are left untouched so that re-running this
# cell is harmless; joining a string again would insert an underscore between
# every character.
quadgrams = [t if isinstance(t, str) else '_'.join(t) for t in quadgrams]
trigrams = [t if isinstance(t, str) else '_'.join(t) for t in trigrams]
bigrams = [t if isinstance(t, str) else '_'.join(t) for t in bigrams]

In [22]:
def _merge_ngrams(tokens, size, known):
    """Greedy left-to-right pass fusing windows of `size` tokens found in `known`."""
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        candidate = '_'.join(tokens[i:i + size])
        if candidate in known:
            # Emit the fused token and skip past the window it consumed.
            merged.append(candidate)
            i += size
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def ngram_tagger(tokens):
    """Replace known multi-word phrases in `tokens` with single joined tokens.

    Three passes run in order -- quadgrams, then trigrams, then bigrams -- each
    over the output of the previous pass, so longer phrases take priority.
    Phrases are looked up in the module-level ``quadgrams``, ``trigrams`` and
    ``bigrams`` collections of underscore-joined strings.

    Parameters
    ----------
    tokens : list of str
        Token sequence to tag.

    Returns
    -------
    list of str
        A new list in which every matched window is replaced by its
        underscore-joined form.
    """
    tagged = tokens
    for size, known in ((4, quadgrams), (3, trigrams), (2, bigrams)):
        # A set makes each window lookup O(1) instead of a scan of the list.
        tagged = _merge_ngrams(tagged, size, set(known))
    return tagged

In [23]:
# Re-draw the per-year sample (same seed as before, so the same rows) and
# split it into one dask partition per available core.
small = env.groupby('year').sample(100, random_state=3291995)
dsmall = dd.from_pandas(small, npartitions=effective_n_jobs(-1))

# Tokenize -> normalize -> fuse known n-grams for every article. Each result
# is a list of tokens, so the output metadata is declared as an object column
# named after the column being assigned.
dsmall['normalized_tokens'] = dsmall.text.map(
    lambda x: ngram_tagger(normalizeTokens(word_tokenize(x))),
    meta=('normalized_tokens', object))



In [24]:
# Evaluate the lazy dask graph; the tokenization defined above runs here.
small_tok = dsmall.compute()

In [25]:
# Build the token <-> id mapping from every tokenized article.
dictionary = corpora.Dictionary(small_tok['normalized_tokens'].tolist())

In [26]:
# Bag-of-words vector for each article, in the same row order as small_tok.
bow_corpus = list(map(dictionary.doc2bow, small_tok['normalized_tokens']))

In [27]:
# Fit TF-IDF weights on the full bag-of-words corpus (all sampled years).
tfidf = models.TfidfModel(bow_corpus)

In [159]:
type(bow_corpus)

list

In [162]:
type(tfidf[bow_corpus])

gensim.interfaces.TransformedCorpus

In [167]:
for i in range(2, 11, 2):
    print(i)

2
4
6
8
10


In [31]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=2, random_state=None):
    '''
    Fits one LDA model per topic count and scores each with u_mass coherence.

    Parameters
    ----------
    dictionary : gensim dictionary passed to LdaModel (as id2word) and to
        CoherenceModel.
    corpus : corpus the models are trained on and the coherence is computed on.
    texts : unused; kept so existing callers that pass it keep working.
    limit : exclusive upper bound on the number of topics.
    start : first number of topics to try.
    step : increment between successive topic counts.
    random_state : forwarded to LdaModel; pass a fixed value to make the
        fitted topics (and therefore the scores) repeatable. The default
        None leaves LdaModel's own default in place.

    Returns
    -------
    (model_list, coherence_values) : two parallel lists, one entry per value
    in range(start, limit, step) (pick models with the highest coherence).
    '''
    model_list = []
    coherence_values = []
    for num_topics in range(start, limit, step):
        model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=num_topics,
                                         random_state=random_state)
        model_list.append(model)

        coherence_model = models.coherencemodel.CoherenceModel(model=model,
                                                               corpus=corpus,
                                                               dictionary=dictionary,
                                                               coherence='u_mass')
        coherence_values.append(coherence_model.get_coherence())

    return model_list, coherence_values

In [168]:
# Boolean mask selecting the 2007 articles.
mask_07 = small_tok.year == 2007
# bow_corpus is a plain Python list, so the mask cannot index it directly
# (i.e. tfidf[bow_corpus[mask_07]] is not usable as written).

In [30]:
# Topic counts to sweep. The same range drives both the model fits and the
# x-axis, so the plotted points always line up with the coherence scores.
topic_range = range(4, 10, 2)

# The tokenized column lives on small_tok (the computed frame), not on small.
model_list, coherence_values = compute_coherence_values(dictionary=dictionary,
                                                        corpus=bow_corpus,
                                                        texts=small_tok.normalized_tokens,
                                                        start=topic_range.start,
                                                        limit=topic_range.stop,
                                                        step=topic_range.step)

plt.plot(list(topic_range), coherence_values, label="coherence_values")
plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
# Label the line via `label=`; passing the bare string to legend() would be
# iterated character by character.
plt.legend(loc='best')

Process ForkPoolWorker-11:
Process ForkPoolWorker-13:
Process ForkPoolWorker-12:
Process ForkPoolWorker-10:
Process ForkPoolWorker-15:
Process ForkPoolWorker-14:
Process ForkPoolWorker-16:
Traceback (most recent call last):
Traceback (most recent call last):


In [34]:
# Fit a 4-topic LDA model on the raw bag-of-words corpus with the multicore
# implementation, using effective_n_jobs(-1) worker processes.
# NOTE(review): no random_state is passed, so the topics presumably differ
# between runs -- confirm whether that is intended.
model = models.ldamulticore.LdaMulticore(corpus=bow_corpus,
                                         id2word=dictionary,
                                         num_topics=4,
                                         workers=effective_n_jobs(-1))

In [172]:
# Scratch check: a plain list cannot be indexed with a sequence of booleans.
# The second line raises TypeError (see the output below), which is why the
# per-year corpora are built with a comprehension instead of a mask.
mylist = [1,2,3]
mylist[True, False, True]

TypeError: list indices must be integers or slices, not tuple

In [175]:
mask_07

1        False
49       False
89       False
90       False
133      False
         ...  
95706    False
95715    False
95763    False
95776    False
95787    False
Name: year, Length: 1700, dtype: bool

In [29]:
# One boolean mask per target year, aligned row-for-row with small_tok.
mask_07 = small_tok['year'] == 2007
mask_13 = small_tok['year'] == 2013
mask_19 = small_tok['year'] == 2019

# bow_corpus was built from small_tok in row order, so pairing each document
# with its mask value positionally selects that year's documents.
bow_corpus_07 = [doc for doc, keep in zip(bow_corpus, mask_07) if keep]
bow_corpus_13 = [doc for doc, keep in zip(bow_corpus, mask_13) if keep]
bow_corpus_19 = [doc for doc, keep in zip(bow_corpus, mask_19) if keep]

In [37]:
# Run the same 4/6/8/10-topic sweep on the TF-IDF-weighted corpus of each
# year, in the order 2007, 2013, 2019.
_year_corpora = {'07': bow_corpus_07, '13': bow_corpus_13, '19': bow_corpus_19}
_year_fits = {
    yr: compute_coherence_values(dictionary=dictionary,
                                 corpus=tfidf[year_corpus],
                                 texts=small_tok.normalized_tokens,
                                 start=4, limit=11, step=2)
    for yr, year_corpus in _year_corpora.items()
}

models_07, coherence_07 = _year_fits['07']
models_13, coherence_13 = _year_fits['13']
models_19, coherence_19 = _year_fits['19']

In [41]:
# Flatten the per-year model lists and build one file stem per model, in the
# same order: year (07, 13, 19) x topic count (04, 06, 08, 10).
all_models = models_07 + models_13 + models_19
names = ['tm_{}_{}'.format(yr, tp) for yr in ['07', '13', '19'] for tp in ['04', '06', '08', '10']]

# zip() silently drops the surplus when its inputs differ in length, which
# would leave models unsaved or mislabelled; fail loudly instead.
if len(all_models) != len(names):
    raise ValueError('expected {} models, got {}'.format(len(names), len(all_models)))

for model, filename in zip(all_models, names):
    # One column per topic, holding that topic's top words in rank order.
    topicsDict = {}
    for topicNum in range(model.num_topics):
        topicWords = [w for w, p in model.show_topic(topicNum)]
        topicsDict['Topic_{}'.format(topicNum)] = topicWords

    wordRanksDF = pd.DataFrame(topicsDict)
    # to_csv writes plain text, so the file carries a .csv extension; a .pkl
    # suffix would mislabel it and break pd.read_pickle on the result.
    wordRanksDF.to_csv('../Data/Environmental Discourse/Single-Year-TMs/Top-Words/' + filename + '.csv')
    model.save('../Data/Environmental Discourse/Single-Year-TMs/Models/' + filename)

In [46]:
# Coherence scores in the same order as `names` (2007, 2013, 2019 sweeps).
coherence = coherence_07 + coherence_13 + coherence_19
coh = pd.DataFrame({'model': names, 'coherence': coherence})
# Directory name spelled 'Single-Year-TMs' to match every other save in this
# notebook; the upper-case 'TMS' variant is a different directory on
# case-sensitive filesystems.
coh.to_pickle('../Data/Environmental Discourse/Single-Year-TMs/coherence_scores.pkl')

In [47]:
# Persist the artefacts shared by all per-year models: the token dictionary,
# the bag-of-words corpus (Matrix Market format) and the fitted TF-IDF model.
dictionary.save('../Data/Environmental Discourse/Single-Year-TMs/dictionary')
corpora.MmCorpus.serialize(fname='../Data/Environmental Discourse/Single-Year-TMs/bow_corpus.mm',
                           corpus=bow_corpus)
tfidf.save('../Data/Environmental Discourse/Single-Year-TMs/tfidf')

In [189]:
all_models = models_07 + models_13 + models_19
# File names for topic counts 02..10 (step 2) in each of the three years.
# NOTE(review): this yields 15 names, while the sweeps in this notebook start
# at 4 topics and so produce 12 models -- this cell looks like a leftover from
# an earlier run; confirm before relying on `names` after this point.
names = ['tm_{}_{:02d}.pkl'.format(yr, n_topics)
         for yr in ('07', '13', '19')
         for n_topics in range(2, 11, 2)]

In [177]:
# Fit a single 4-topic LDA model on the TF-IDF-weighted 2007 documents.
model = models.ldamodel.LdaModel(corpus=tfidf[bow_corpus_07],
                                 id2word=dictionary,
                                 num_topics=4)

In [178]:
# u_mass coherence for the model fitted above.
# NOTE(review): the coherence corpus here is the full tfidf[bow_corpus], while
# the model was trained on the 2007 subset only -- confirm this is intended
# (compute_coherence_values scores each model on its own training corpus).
coherence_model = models.coherencemodel.CoherenceModel(model=model, 
                                                       corpus=tfidf[bow_corpus],
                                                       dictionary=dictionary,
                                                       coherence='u_mass')

In [179]:
coherence_model.get_coherence()

-11.51415597567715