In [9]:
import pandas as pd

import plotly.express as px

In [2]:
# Load the tab-separated Web of Science export.
# NOTE(review): the saved output of this cell looks like the tail of a pandas
# warning raised while reading (presumably a DtypeWarning about mixed-type
# columns -- confirm). low_memory=False makes pandas infer each column's dtype
# from the whole file instead of chunk by chunk.
wos = pd.read_csv('../../Data/WoS/wos-env.tab', sep='\t', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Keep only the records that have an abstract.
wos = wos.loc[wos['abstract'].notna()]

In [4]:
# Alphabetical list of the distinct journal names. The label shown next to each
# name is its position in the value_counts() frequency ranking.
# NOTE: the listing contains the same journal under different capitalizations
# (e.g. "ENVIRONMENTAL RESEARCH LETTERS" and "Environmental Research Letters").
pd.Series(wos['source'].value_counts().index).sort_values()

14                                             ANTIPODE
10                                                 AREA
23                                 CULTURAL GEOGRAPHIES
16                                            DISASTERS
4                                  ECOLOGICAL ECONOMICS
7                                   ECOLOGY AND SOCIETY
21                                 ENERGY & ENVIRONMENT
2                                         ENERGY POLICY
13                     ENERGY RESEARCH & SOCIAL SCIENCE
12                             ENVIRONMENT AND BEHAVIOR
18                         ENVIRONMENT AND URBANIZATION
3                     ENVIRONMENTAL HEALTH PERSPECTIVES
5                        ENVIRONMENTAL RESEARCH LETTERS
0                    ENVIRONMENTAL SCIENCE & TECHNOLOGY
20                                 ENVIRONMENTAL VALUES
29                       Environmental Research Letters
6                                              GEOFORUM
27                        GLOBAL ENVIRONMENTAL P

#### Journals with no articles found in the database
 - Case Studies in the Environment
 - Children, Youth and Environments
 - Conservation and Society
 - Environment and Planning
 - Environmental Sociology
 - Global Environmental Change
 - Hastings West-Northwest Journal of Environmental Law and Policy
 - Indoor and Built Environment
 - International Journal of Ecology & Development
 - The Journal of Environment & Development
 - Journal of Environmental Assessment Policy and Management
 - Journal of Environmental Studies and Sciences
 - Journal of Environmental Psychology
 - Journal of Political Ecology
 - Nature and Culture
 
Now take a quick look back through the database to see whether any of these journals were missed.

In [11]:
# Restrict to publication years 2005 through 2019 (both ends included).
wos = wos[wos['pubyear'].between(2005, 2019)]

In [15]:
# Keep only the records whose document type is exactly 'Article'.
wos = wos.loc[wos['doctype'].eq('Article')]

In [16]:
# Line chart of the number of non-null abstracts per publication year.
px.line(wos.groupby('pubyear')['abstract'].count())

In [17]:
# Persist the filtered frame so later steps can reload it without redoing the filtering.
wos.to_pickle(path='../../Data/WoS/wos-env-processed.pkl')

### Processing

In [20]:
import pandas as pd
import nltk
from nltk.util import ngrams
import spacy
import ast
import dask.dataframe as dd
from gensim.utils import effective_n_jobs
# Load the spaCy English pipeline that the tokenizing helpers use as their
# default `model`. The "en" name is tried first; if spaCy raises OSError for it,
# fall back to "en_core_web_sm" (presumably "en" only resolves on older spaCy
# installs that still have the shortcut link -- confirm). An OSError from the
# fallback itself is not caught and propagates.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")





In [22]:
def sent_tokenize(word_list, model=nlp):
    """Split a text into sentences.

    Args:
        word_list: The text to split; it is passed to ``model`` unchanged.
        model: spaCy pipeline used to parse the text and find sentence
            boundaries.

    Returns:
        list[str]: The text of each sentence, with surrounding whitespace
        stripped.
    """
    parsed = model(word_list)
    return [span.text.strip() for span in parsed.sents]

def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Tokenize a text and drop punctuation and whitespace-only tokens.

    Args:
        word_list: A string, or a list. A one-element list is unwrapped to its
            element; any (remaining) list is joined into one string with
            single spaces after converting each element with ``str``.
        model: spaCy pipeline used for tokenization.
        MAX_LEN: Value assigned to ``model.max_length`` before the text is
            processed (this changes the model object itself).

    Returns:
        list[str]: The text of every token that is neither punctuation nor
        empty after stripping whitespace.
    """
    text = word_list

    # A list holding a single item stands for that item.
    if type(text) is list and len(text) == 1:
        text = text[0]

    # Any list that is left is flattened into one space-separated string.
    if type(text) is list:
        text = ' '.join(str(elem) for elem in text)

    # Only tokens are needed, so the heavier pipeline components are disabled
    # and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    parsed = model(text, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [
        tok.text
        for tok in parsed
        if not (tok.is_punct or len(tok.text.strip()) == 0)
    ]


def normalizeTokens(word_list, extra_stop=[], model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case a text and return its content tokens, optionally lemmatized.

    Args:
        word_list: A string, or a list. A one-element list is unwrapped to its
            element; any (remaining) list is joined into one string with
            single spaces after converting each element with ``str``.
        extra_stop: Additional words to treat as stop words. Each one is
            flagged on ``model.vocab`` and the flag is never reset here, so it
            stays set for later calls that use the same model. The text is
            lower-cased before parsing, so only lower-case entries can match.
            (The default list is only read, never modified.)
        model: spaCy pipeline used to parse the text.
        lemma: If True, return each kept token's lemma; otherwise return its
            text with surrounding whitespace stripped.
        MAX_LEN: Value assigned to ``model.max_length`` before parsing (this
            changes the model object itself).

    Returns:
        list[str]: One entry per token that is not a stop word, not
        punctuation, not number-like, not a bare newline and not
        whitespace-only.
    """
    if type(word_list) == list and len(word_list) == 1:
        word_list = word_list[0]

    if type(word_list) == list:
        word_list = ' '.join([str(elem) for elem in word_list])

    # The parser and NER are not needed for normalizing, so they are disabled
    # and the maximum accepted text length is raised.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "ner"])

    # Flag the extra stop words on the vocabulary of the model that produced
    # `doc`. The previous code always used the global `nlp`, which is the wrong
    # vocabulary whenever a different `model` is passed in.
    for stopword in extra_stop:
        model.vocab[stopword].is_stop = True

    normalized = []
    for w in doc:
        # Skip newlines, stop words, punctuation, numbers and blank tokens.
        if w.text == '\n' or w.is_stop or w.is_punct or w.like_num or len(w.text.strip()) == 0:
            continue
        # Emit either the lemma or the stripped surface form.
        normalized.append(str(w.lemma_) if lemma else str(w.text.strip()))

    return normalized


def _merge_ngrams(tokens, size, known):
    """Greedy left-to-right pass fusing runs of `size` tokens whose '_'-join is in `known`."""
    merged = []
    n = len(tokens)
    i = 0
    while i < n:
        candidate = '_'.join(tokens[i:i + size])
        if candidate in known:
            # Replace the whole run with the single fused token.
            merged.append(candidate)
            i += size
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def ngram_tagger(tokens):
    """Fuse known multi-word phrases in a token list into single tokens.

    Three passes are made over the tokens: quadgrams first, then trigrams,
    then bigrams. Each pass works on the output of the previous one, so a
    token fused earlier is treated as one token afterwards. Phrases are looked
    up in the module-level ``quadgrams``, ``trigrams`` and ``bigrams``
    collections, which hold underscore-joined strings.

    Args:
        tokens: Sequence of string tokens.

    Returns:
        list[str]: A new list in which every matched run of tokens is replaced
        by its underscore-joined form. The input is not modified.
    """
    if len(tokens) == 0:
        return []

    merged = tokens
    for size, vocabulary in ((4, quadgrams), (3, trigrams), (2, bigrams)):
        # Hash lookup instead of scanning the whole phrase list for every
        # token; collections that are already sets are used as they are.
        if not isinstance(vocabulary, (set, frozenset)):
            vocabulary = set(vocabulary)
        merged = _merge_ngrams(merged, size, vocabulary)
    return merged


# Sub-folder of ../../Data/ from which the tagged n-gram tables are read.
path = 'Environmental Discourse'

# Four-word phrases to fuse, stored in underscore-joined form.
quadgrams = ['_'.join(words) for words in [
    ('intergovernmental', 'panel', 'climate', 'change'),
    ('natural', 'resources', 'defense', 'council'),
    ('coal', 'fired', 'power', 'plants'),
    ('national', 'oceanic', 'atmospheric', 'administration'),
]]

# Trigram table: the first column ('Unnamed: 0') is parsed with
# ast.literal_eval; only rows with tag == 1 are kept, underscore-joined.
tr = pd.read_csv(f'../../Data/{path}/trigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
tr.columns = ['trigram', 'freq', 'tag']
trigrams = ['_'.join(words) for words in tr.loc[tr['tag'] == 1, 'trigram']]

# Bigram table: same layout and same selection rule as the trigram table.
b = pd.read_csv(f'../../Data/{path}/bigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
b.columns = ['bigram', 'freq', 'tag']
bigrams = ['_'.join(words) for words in b.loc[b['tag'] == 1, 'bigram']]

In [23]:
# Continue with the filtered WoS frame under the name `df`. This binds a second
# name to the same DataFrame object; no copy is made.
df = wos

In [None]:
def _tag_sentences(abstract):
    """Return one list of normalized (not lemmatized), n-gram-tagged tokens per sentence."""
    return [
        ngram_tagger(normalizeTokens(word_tokenize(sentence), lemma=False))
        for sentence in sent_tokenize(abstract)
    ]


def _drop_empty(sentences):
    """Discard sentences whose token list is empty."""
    return [tokens for tokens in sentences if len(tokens) > 0]


# Split the frame into effective_n_jobs(-1) dask partitions, add a 'sents'
# column holding the tokenized sentences of each abstract, drop the empty
# sentences, and collect the result back into `df` with compute().
d_df = dd.from_pandas(df, npartitions=effective_n_jobs(-1))
d_df['sents'] = d_df['abstract'].map(_tag_sentences)
d_df['sents'] = d_df['sents'].map(_drop_empty)
df = d_df.compute()