In [36]:
import pandas as pd
import ast

In [3]:
# Name of the dataset sub-folder; other cells build file paths as '../Data/' + path + '/<file>'.
path = 'Environmental Discourse'

In [32]:
# NOTE(security): unpickling can execute arbitrary code -- only load pickle files you trust.
env = pd.read_pickle('../Data/'+path+'/env_0.pkl')
# Work on a small random subset. The seed is fixed so the same 10 rows are drawn on
# every run, consistent with the other sampling cells in this notebook (random_state=1).
env = env.sample(10, random_state=1)

In [10]:
# Preview the first rows of the sampled frame.
env.head()

Unnamed: 0,source,url,title,date,author,text,year
20339,Resilience,https://www.resilience.org/stories/2008-07-08/...,Asia – July 8,2008-07-08,Staff,. Japan sees a chance to promote its energ...,2008
87817,Grist,https://grist.org/article/the-blame-the-enviro...,A possible smear campaign fingers greens...,2005-09-17,Emily Gertz,The Gonzales Justice Department may be seeking...,2005
63974,Grist,https://grist.org/climate/heres-how-coronaviru...,Here’s how coronavirus affected carbon e...,2020-05-19,Zoya Teirstein,"The pandemic is far from over, but some states...",2020
94514,Grist,https://grist.org/article/2011-01-21-meet-the-...,Meet the Climate Fockers,2011-01-22,Auden Schendler,Here’a repost from my Climateprogress.org blog...,2011
57887,Grist,https://grist.org/living/3-disturbing-facts-pr...,3 disturbing facts prove sexual harassme...,2015-04-16,Eve Andrews,"Hello! On this spring-shiny day, are you think...",2015


# Sentence tokenization

In [21]:
import dask.dataframe as dd
from gensim.utils import effective_n_jobs
import spacy
# Load the spaCy pipeline used as the default `model` by the helpers below:
# try the name "en" first and, if spaCy raises OSError for it, load
# "en_core_web_sm" instead.
try:
    nlp = spacy.load("en")
except OSError:
    # "en" could not be loaded; fall back to the explicitly named package.
    nlp = spacy.load("en_core_web_sm")

In [28]:
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Return the non-punctuation, non-blank token texts of the input.

    ``word_list`` may be a string or a list. A one-element list is replaced by
    its only element; any list remaining after that has its elements converted
    with ``str`` and joined by single spaces.

    Side effect: ``model.max_length`` is set to ``MAX_LEN`` on every call.
    """
    text = word_list
    if type(text) is list and len(text) == 1:
        text = text[0]
    if type(text) is list:
        text = ' '.join(map(str, text))

    # Only tokens are needed here, so the RAM-intensive components are switched
    # off for this call and the model's maximum text length is set to MAX_LEN.
    model.max_length = MAX_LEN
    doc = model(text, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [
        tok.text
        for tok in doc
        if not tok.is_punct and len(tok.text.strip()) > 0
    ]

In [29]:
def normalizeTokens(word_list, extra_stop=[], model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case the input and return its tokens without stop words,
    punctuation, number-like tokens and blank tokens.

    Args:
        word_list: A string or a list. A one-element list is replaced by its
            only element; any list remaining after that has its elements
            converted with ``str`` and joined by single spaces.
        extra_stop: Additional words to flag as stop words. The flag is set on
            ``model.vocab``, so it stays in effect for later calls that use
            the same pipeline. This list is only read, never modified.
        model: spaCy pipeline used to process the text.
        lemma: If true, return each kept token's ``lemma_``; otherwise return
            its whitespace-stripped text.
        MAX_LEN: Value assigned to ``model.max_length`` before processing
            (side effect on ``model``).

    Returns:
        A list of strings, one per kept token, in document order.
    """
    text = word_list
    if isinstance(text, list) and len(text) == 1:
        text = text[0]
    if isinstance(text, list):
        text = ' '.join(str(elem) for elem in text)

    # Flag the extra stop words on the vocabulary of the pipeline that is
    # actually used. (This used to touch the global `nlp` even when a different
    # `model` was passed in, so the custom stop words were silently ignored.)
    for stopword in extra_stop:
        model.vocab[stopword].is_stop = True

    # The filters below read only lexical flags (is_stop / is_punct / like_num),
    # so the tagger and lemmatizer are needed only when lemmas are requested.
    disabled = ["parser", "ner"]
    if not lemma:
        disabled += ["tagger", "lemmatizer"]

    model.max_length = MAX_LEN
    doc = model(text.lower(), disable=disabled)

    # A token consisting only of whitespace (including '\n') strips to '' and is dropped.
    kept = [
        w for w in doc
        if not w.is_stop and not w.is_punct and not w.like_num and len(w.text.strip()) > 0
    ]

    # Decide once between lemmas and surface forms instead of per token.
    if lemma:
        return [str(w.lemma_) for w in kept]
    return [str(w.text.strip()) for w in kept]

In [30]:
def sent_tokenize(word_list, model=nlp):
    """Run ``model`` on ``word_list`` and return the whitespace-stripped text
    of each sentence it yields, in order.
    """
    stripped_sentences = []
    for span in model(word_list).sents:
        stripped_sentences.append(span.text.strip())
    return stripped_sentences

In [38]:
def _merge_ngrams(tokens, size, vocab):
    """Scan left to right and replace each run of ``size`` tokens whose
    '_'-joined form is in ``vocab`` by that joined string."""
    merged = []
    n = len(tokens)
    i = 0
    while i < n:
        candidate = '_'.join(tokens[i:i + size])
        if candidate in vocab:
            # Known phrase: emit it as one token and skip past its parts.
            merged.append(candidate)
            i += size
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def ngram_tagger(tokens, quad_vocab=None, tri_vocab=None, bi_vocab=None):
    """Merge known multi-word phrases in ``tokens`` into single '_'-joined tokens.

    Three greedy left-to-right passes are applied in order: four-word phrases
    first, then three-word phrases over the result, then two-word phrases.

    Args:
        tokens: Sequence of token strings; it is not modified.
        quad_vocab: Container of '_'-joined four-word phrases. Defaults to the
            module-level ``quadgrams``.
        tri_vocab: Container of '_'-joined three-word phrases. Defaults to the
            module-level ``trigrams``.
        bi_vocab: Container of '_'-joined two-word phrases. Defaults to the
            module-level ``bigrams``.
            Any container supporting ``in`` works; sets give constant-time lookups.

    Returns:
        A new list of tokens with the recognised phrases merged.
    """
    # The module-level vocabularies are looked up only when no explicit one is given,
    # so existing one-argument calls behave exactly as before.
    if quad_vocab is None:
        quad_vocab = quadgrams
    if tri_vocab is None:
        tri_vocab = trigrams
    if bi_vocab is None:
        bi_vocab = bigrams

    result = list(tokens)
    for size, vocab in ((4, quad_vocab), (3, tri_vocab), (2, bi_vocab)):
        result = _merge_ngrams(result, size, vocab)
    return result

In [37]:
# Build the phrase vocabularies read by `ngram_tagger`. Each ends up as a list of
# '_'-joined strings.

# Four-word phrases are listed by hand.
quadgrams = [
    '_'.join(words)
    for words in (
        ('intergovernmental', 'panel', 'climate', 'change'),
        ('natural', 'resources', 'defense', 'council'),
        ('coal', 'fired', 'power', 'plants'),
        ('national', 'oceanic', 'atmospheric', 'administration'),
    )
]

# Three-word phrases: the first CSV column is parsed with ast.literal_eval, the
# columns are renamed, and only rows whose `tag` equals 1 are kept.
tr = pd.read_csv('../Data/' + path + '/trigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
tr.columns = ['trigram', 'freq', 'tag']
trigrams = ['_'.join(words) for words in tr.loc[tr.tag == 1, 'trigram']]

# Two-word phrases: same treatment as the trigram file.
b = pd.read_csv('../Data/' + path + '/bigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
b.columns = ['bigram', 'freq', 'tag']
bigrams = ['_'.join(words) for words in b.loc[b.tag == 1, 'bigram']]

In [33]:
%%time
# Two-step variant: split the frame into effective_n_jobs(-1) partitions, word-tokenize
# every sentence of each article, then normalize each tokenized sentence (no lemmas).
d_env = dd.from_pandas(env, npartitions=effective_n_jobs(-1))
d_env['tokenized_sents'] = d_env['text'].map(
    lambda article: list(map(word_tokenize, sent_tokenize(article)))
)
d_env['normalized_sents'] = d_env['tokenized_sents'].map(
    lambda sentences: [normalizeTokens(tokens, lemma=False) for tokens in sentences]
)
env = d_env.compute()

CPU times: user 1min 29s, sys: 1min 32s, total: 3min 2s
Wall time: 1min 8s


In [34]:
%%time
# One-step variant: tokenize and normalize each sentence inside a single map.
# Note that the result is stored in 'tokenized_sents' and holds normalized tokens.
d_env = dd.from_pandas(env, npartitions=effective_n_jobs(-1))
d_env['tokenized_sents'] = d_env['text'].map(
    lambda article: [
        normalizeTokens(word_tokenize(sentence), lemma=False)
        for sentence in sent_tokenize(article)
    ]
)
env = d_env.compute()

CPU times: user 45.6 s, sys: 47.4 s, total: 1min 33s
Wall time: 35.2 s


In [66]:
# Start again from the pickled frame and draw a fixed (seeded) sample of 100 rows.
env = pd.read_pickle('../Data/'+path+'/env_0.pkl').sample(100, random_state=1)

In [67]:
%%time
# Full sentence pipeline through dask with a single partition:
# tokenize -> normalize (no lemmas) -> merge n-grams, then drop empty sentences.
d_env = dd.from_pandas(env, npartitions=1)
d_env['sents'] = d_env['text'].map(
    lambda article: [
        ngram_tagger(normalizeTokens(word_tokenize(sentence), lemma=False))
        for sentence in sent_tokenize(article)
    ]
)
d_env['sents'] = d_env['sents'].map(
    lambda tagged: [sent for sent in tagged if len(sent) > 0]
)
env = d_env.compute()

CPU times: user 58.9 s, sys: 0 ns, total: 58.9 s
Wall time: 58.9 s


In [68]:
# Start again from the pickled frame and draw a fixed (seeded) sample of 100 rows.
env = pd.read_pickle('../Data/'+path+'/env_0.pkl').sample(100, random_state=1)

In [69]:
%%time
# Same sentence pipeline through dask, this time with effective_n_jobs(-1) partitions:
# tokenize -> normalize (no lemmas) -> merge n-grams, then drop empty sentences.
d_env = dd.from_pandas(env, npartitions=effective_n_jobs(-1))
d_env['sents'] = d_env['text'].map(
    lambda article: [
        ngram_tagger(normalizeTokens(word_tokenize(sentence), lemma=False))
        for sentence in sent_tokenize(article)
    ]
)
d_env['sents'] = d_env['sents'].map(
    lambda tagged: [sent for sent in tagged if len(sent) > 0]
)
env = d_env.compute()

CPU times: user 2min 39s, sys: 1min 59s, total: 4min 39s
Wall time: 2min 1s


In [68]:
# Start again from the pickled frame and draw a fixed (seeded) sample of 100 rows.
env = pd.read_pickle('../Data/'+path+'/env_0.pkl').sample(100, random_state=1)

In [71]:
%%time
# Same sentence pipeline with plain pandas `apply` (no dask):
# tokenize -> normalize (no lemmas) -> merge n-grams, then drop empty sentences.
env['sents'] = env['text'].apply(
    lambda article: [
        ngram_tagger(normalizeTokens(word_tokenize(sentence), lemma=False))
        for sentence in sent_tokenize(article)
    ]
)
env['sents'] = env['sents'].apply(
    lambda tagged: [sent for sent in tagged if len(sent) > 0]
)

CPU times: user 45 s, sys: 0 ns, total: 45 s
Wall time: 45 s


In [46]:
# Total number of sentences across all articles. Summing the per-article lengths
# avoids `.sum()` on a column of lists, which concatenates them one by one and
# re-copies the growing list each time (quadratic in the number of sentences).
int(env.sents.map(len).sum())

474

In [47]:
# Number of sentences kept for each article.
env['sents'].map(len)

13914      9
17262    140
20294     58
21056     19
42392     79
43608     36
49334     17
49402     75
59600     30
78212     11
Name: sents, dtype: int64

In [19]:
# Dimensions of the frame as (rows, columns).
env.shape

(1000, 7)

In [None]:
# Build a dask frame with effective_n_jobs(-1) partitions.
# NOTE(review): `env_tok` is not defined in any visible cell of this notebook -- confirm
# where it is supposed to come from before running this cell.
d_env = dd.from_pandas(env_tok, npartitions=effective_n_jobs(-1))

In [None]:
# Train a Word2Vec model with sg=1 on every sentence in the frame.
from itertools import chain

# Only `gensim.utils.effective_n_jobs` is imported elsewhere in this notebook, so the
# name `gensim` must be bound here before `gensim.models.word2vec` can be used.
import gensim.models.word2vec

# 'normalized_sents' exists only if the frame still carries the output of the two-step
# normalization cell; after the later reloads the sentences live in 'sents'.
sentence_column = 'normalized_sents' if 'normalized_sents' in env.columns else 'sents'

# Flatten the per-article sentence lists in linear time; `.sum()` on a column of lists
# re-copies the growing list at every step.
envW2V_skipgram = gensim.models.word2vec.Word2Vec(
    list(chain.from_iterable(env[sentence_column])), sg=1
)