In [1]:
import pandas as pd

### Read in environmental discourse data, drop duplicates

In [59]:
# Load the scraped articles; the first CSV column holds the saved index.
env = pd.read_csv(
    '../Data/Environmental Discourse/env.csv',
    index_col=0,
)

In [60]:
# Restrict the frame to the six columns used in the rest of the notebook.
env = env.loc[:, ['source', 'url', 'title', 'date', 'author', 'text']]

In [48]:
# Keep the first row seen for each URL and discard later repeats.
env = env.drop_duplicates(subset='url')

In [33]:
# Drop articles whose text is missing.
env = env.dropna(subset=['text'])

In [61]:
# (rows, columns) of the frame at this point.
# NOTE(review): the execution counts suggest the de-duplication and drop-missing
# cells above were last run before the CSV was re-read, so the recorded shape
# may not reflect them — re-run the notebook top to bottom to confirm.
env.shape

(93763, 6)

### Clean up the text a bit

In [62]:
def clean(text):
    """Remove known boilerplate phrases from an article's text.

    Parameters
    ----------
    text : str
        Raw article text. Any non-string value (for example the float NaN
        pandas uses for a missing text, or None) is returned unchanged
        instead of raising AttributeError.

    Returns
    -------
    str
        ``text`` with every occurrence of each boilerplate phrase removed.
    """
    # Missing values have no .replace(); pass them through untouched.
    if not isinstance(text, str):
        return text

    boilerplate = (
        'Click on the headline (link) for the full text',
        'Many more articles are available through the Energy Bulletin homepage',
    )
    # Replacements that were commented out in the original cell and remain disabled:
    #   'Ed. note: ' -> '', ' Grist thanks its sponsors. Become one.' -> '',
    #   '\xa0' -> ' ', '\n' -> ''
    for phrase in boilerplate:
        text = text.replace(phrase, '')

    return text

# Run the boilerplate stripper over every article's text.
env['text'] = env['text'].map(clean)

In [66]:
# Write the cleaned frame back over the input CSV (saved 2/28/22 10:21 pm).
env.to_csv(
    '../Data/Environmental Discourse/env.csv',
)

### Prepare
Next, I work with a small random sample of articles so that I can iterate quickly and settle on processing decisions, and then (hopefully) run the full pipeline on the RCC.

In [17]:
! python -m spacy download en

⚠ As of spaCy v3.0, shortcuts like 'en' are deprecated. Please use the
full pipeline package name 'en_core_web_sm' instead.
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
     |████████████████████████████████| 13.9 MB 11.0 MB/s eta 0:00:01
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.2.0
✔ Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [45]:
import pandas as pd
import nltk
from nltk.util import ngrams

import spacy
# Load the English pipeline by its full package name first: spaCy v3 deprecated
# shortcut names such as "en". The shortcut is kept only as a fallback for
# older installs where just the "en" link exists.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    nlp = spacy.load("en")

In [40]:
# Reload the cleaned articles written by the earlier save cell.
env = pd.read_csv(
    '../Data/Environmental Discourse/env.csv',
    index_col=0,
)

In [41]:
# Draw a reproducible 1000-article sample for fast iteration.
env = env.sample(n=1000, random_state=827)

In [67]:
# NOTE(review): the recorded output below still shows the full frame
# (93763 rows) even though the preceding cell samples 1000 rows; the execution
# counts suggest the cells were run out of order — re-run top to bottom.
env.shape

(93763, 6)

In [68]:
def word_tokenize(word_list, model=None, MAX_LEN=1500000):
    """Split text into spaCy tokens, dropping punctuation and blank tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its element; any
        other list is joined into a single space-separated string. ``None``
        and NaN (a missing text) produce an empty list instead of an error.
    model : spaCy pipeline, optional
        Pipeline used for tokenization. When omitted, the notebook-level
        ``nlp`` is looked up at call time, so reloading ``nlp`` takes effect
        without re-running this definition.
    MAX_LEN : int
        Value assigned to ``model.max_length`` before processing. The
        assignment stays on the model after the call returns.

    Returns
    -------
    list of str
        Token texts in document order.
    """
    if model is None:
        model = nlp

    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # A missing text arrives from pandas as float NaN (NaN != NaN).
    if word_list is None or (isinstance(word_list, float) and word_list != word_list):
        return []

    # Only tokenization is needed, so the heavier pipeline components are
    # disabled and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    doc = model(word_list, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [
        token.text
        for token in doc
        if not token.is_punct and len(token.text.strip()) > 0
    ]

In [69]:
def normalizeTokens(word_list, extra_stop=[], model=None, lemma=True, MAX_LEN=1500000):
    """Lower-case text and return its content tokens (or their lemmas).

    Stop words, punctuation, number-like tokens and blank tokens are dropped.

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A one-element list is unwrapped to its element; any
        other list is joined into a single space-separated string. ``None``
        and NaN (a missing text) produce an empty list instead of an error.
    extra_stop : sequence of str
        Additional words to flag as stop words. They are flagged on the vocab
        of the model actually used, and the flag persists after the call.
        The text is lower-cased before matching, so pass lower-case words.
        The default list is never mutated.
    model : spaCy pipeline, optional
        Pipeline used for processing. When omitted, the notebook-level ``nlp``
        is looked up at call time.
    lemma : bool
        If True return each kept token's ``lemma_``; otherwise its stripped text.
    MAX_LEN : int
        Value assigned to ``model.max_length`` before processing. The
        assignment stays on the model after the call returns.

    Returns
    -------
    list of str
    """
    if model is None:
        model = nlp

    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join(str(elem) for elem in word_list)

    # A missing text arrives from pandas as float NaN (NaN != NaN).
    if word_list is None or (isinstance(word_list, float) and word_list != word_list):
        return []

    # Flag extra stop words on the vocab of the model being used, not on the
    # global pipeline, so a custom `model` honours them.
    for stopword in extra_stop or ():
        model.vocab[stopword].is_stop = True

    # The RAM-intensive components are disabled and the maximum text size raised.
    # NOTE(review): with the tagger disabled, a rule-based lemmatizer may lack
    # the POS tags it relies on — verify lemma quality when lemma=True.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "tagger", "ner"])

    # One shared filter for both output modes; a non-blank text also rules
    # out a bare newline token.
    kept = [
        w for w in doc
        if not (w.is_stop or w.is_punct or w.like_num) and len(w.text.strip()) > 0
    ]

    if lemma:
        return [str(w.lemma_) for w in kept]
    return [str(w.text.strip()) for w in kept]

In [None]:
# Tokenize every article, then normalize the resulting tokens without lemmatizing.
env['tokenized_text'] = env['text'].apply(word_tokenize)
env['normalized_tokens'] = env['tokenized_text'].apply(
    lambda tokens: normalizeTokens(tokens, lemma=False)
)

In [None]:
# Build the bigrams of each article, then count them across the corpus.
env['bigrams'] = env['normalized_tokens'].apply(lambda tokens: list(ngrams(tokens, 2)))
# Flatten in a single pass: summing the column concatenates the per-article
# lists pairwise, which is quadratic in the total number of bigrams.
bigrams = pd.Series(
    [bigram for doc_bigrams in env['bigrams'] for bigram in doc_bigrams],
    dtype=object,
).value_counts().head(100)
# The 'bigram' column holds the counts; the bigram tuples form the index.
bigram_df = pd.DataFrame({'bigram': bigrams})
bigram_df.to_csv('../Data/Environmental Discourse/bigrams.csv')

In [None]:
# Build the trigrams of each article, then count them across the corpus.
env['trigrams'] = env['normalized_tokens'].apply(lambda tokens: list(ngrams(tokens, 3)))
# Flatten in a single pass: summing the column concatenates the per-article
# lists pairwise, which is quadratic in the total number of trigrams.
trigrams = pd.Series(
    [trigram for doc_trigrams in env['trigrams'] for trigram in doc_trigrams],
    dtype=object,
).value_counts().head(100)
# The 'trigram' column holds the counts; the trigram tuples form the index.
trigram_df = pd.DataFrame({'trigram': trigrams})
trigram_df.to_csv('../Data/Environmental Discourse/trigrams.csv')

In [None]:
# DataFrame has no `to_pkl` method; `to_pickle` is the pandas serializer.
env.to_pickle('../Data/Environmental Discourse/env_toknorm.pkl') # 2/28/22 9:59 pm

In [None]:
# NOTE(review): this reads 'env_tok.pkl', whereas the save cell in this notebook
# targets 'env_toknorm.pkl' — confirm which file is intended.
env_tok = pd.read_pickle(
    '../Data/Environmental Discourse/env_tok.pkl',
)

In [None]:
# (rows, columns) of the frame loaded from the pickle.
env_tok.shape