In [7]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [41]:
# Read the tab-separated Web of Science export into a DataFrame.
wos = pd.read_csv(
    '../../Data/WoS/wos.tab',
    delimiter='\t',
)

In [42]:
# Keep only rows that have an abstract and whose document type is 'Article'.
df = wos.loc[~wos['abstract'].isna() & (wos['doctype'] == 'Article')]

In [43]:
# Restrict to publication years 1992 through 2019 (both ends inclusive).
df = df.loc[(df['pubyear'] >= 1992) & (df['pubyear'] <= 2019)]

In [15]:
# Count non-null abstracts per publication year and draw them as a line chart.
annual_count = df.groupby('pubyear')[['abstract']].agg('count')
px.line(annual_count)

In [37]:
import nltk
from nltk.util import ngrams
import spacy
import ast
# Load the spaCy English pipeline used as the default `model` by the helper
# functions below. The short name "en" is tried first; if spaCy raises OSError
# for it, fall back to the full package name "en_core_web_sm".
# NOTE(review): presumably the fallback covers spaCy installs where the "en"
# shortcut is not available -- confirm against the installed spaCy version.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [21]:
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Split text into token strings, dropping punctuation and blank tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its element; any
        other list is converted element-wise with ``str`` and joined by spaces.
    model : spaCy pipeline
        Pipeline used for tokenization. Its ``max_length`` attribute is
        overwritten with ``MAX_LEN`` as a side effect.
    MAX_LEN : int
        Value assigned to ``model.max_length`` before processing.

    Returns
    -------
    list of str
        ``token.text`` for every token that is not punctuation and is not
        empty after stripping whitespace.
    """
    text = word_list
    if type(text) is list and len(text) == 1:
        text = text[0]
    if type(text) is list:
        text = ' '.join(map(str, text))

    # Only tokenization is needed, so the heavier pipeline components are
    # disabled and the maximum text length is raised.
    model.max_length = MAX_LEN
    parsed = model(text, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [tok.text for tok in parsed if not tok.is_punct and tok.text.strip()]

In [26]:
def normalizeTokens(word_list, extra_stop=None, model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case text and return its content tokens, optionally lemmatized.

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A one-element list is unwrapped to its element; any
        other list is converted element-wise with ``str`` and joined by spaces.
    extra_stop : iterable of str, optional
        Additional words to flag as stop words. The flag is set on the
        vocabulary of ``model`` and is not reset afterwards, so it also
        affects later calls that use the same model.
    model : spaCy pipeline
        Pipeline used for processing. Its ``max_length`` attribute is
        overwritten with ``MAX_LEN`` as a side effect.
    lemma : bool
        If true, return ``token.lemma_``; otherwise return the stripped
        ``token.text``.
    MAX_LEN : int
        Value assigned to ``model.max_length`` before processing.

    Returns
    -------
    list of str
        One string per token that is not a newline, stop word, punctuation
        mark, number-like token, or whitespace-only token.
    """
    if isinstance(word_list, list) and len(word_list) == 1:
        word_list = word_list[0]

    if isinstance(word_list, list):
        word_list = ' '.join([str(elem) for elem in word_list])

    # Only normalization is needed, so the parser and NER components are
    # disabled and the maximum text length is raised.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "ner"])

    # Flag the extra stop words on the vocabulary of the model that produced
    # `doc`; using the global pipeline here would silently ignore them whenever
    # a different model is passed in.
    if extra_stop is not None and len(extra_stop) > 0:
        for stopword in extra_stop:
            model.vocab[stopword].is_stop = True

    kept = [
        w for w in doc
        if w.text != '\n'
        and not w.is_stop
        and not w.is_punct
        and not w.like_num
        and len(w.text.strip()) > 0
    ]

    # Decide once between lemmas and surface forms rather than per token.
    if lemma:
        return [str(w.lemma_) for w in kept]
    return [str(w.text.strip()) for w in kept]

In [34]:
def _merge_ngrams(tokens, size, vocab):
    """Greedily merge runs of up to `size` tokens whose '_'-join is in `vocab`."""
    merged = []
    i = 0
    n = len(tokens)
    while i < n:
        # Near the end of the list the window holds fewer than `size` tokens;
        # it is still joined and looked up, and a hit skips `size` positions.
        candidate = '_'.join(tokens[i:i + size])
        if candidate in vocab:
            merged.append(candidate)
            i += size
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def ngram_tagger(tokens, quad_vocab=None, tri_vocab=None, bi_vocab=None):
    """Replace known multi-word phrases in a token list with '_'-joined tokens.

    Three left-to-right passes are made in order: four-word phrases, then
    three-word phrases, then two-word phrases. Each later pass operates on the
    output of the previous one, so a token merged earlier is treated as a
    single token afterwards.

    Parameters
    ----------
    tokens : list of str
        Tokens to scan. The input list is not modified.
    quad_vocab, tri_vocab, bi_vocab : iterable of str, optional
        Phrases written as '_'-joined strings. When omitted, the module-level
        ``quadgrams``, ``trigrams`` and ``bigrams`` are used, which must then
        already be defined when the function is called.

    Returns
    -------
    list of str
        A new list with matched phrases collapsed into single tokens.
    """
    if quad_vocab is None:
        quad_vocab = quadgrams
    if tri_vocab is None:
        tri_vocab = trigrams
    if bi_vocab is None:
        bi_vocab = bigrams

    result = tokens
    for size, vocab in ((4, quad_vocab), (3, tri_vocab), (2, bi_vocab)):
        # A frozenset gives constant-time lookups for every window in the pass.
        result = _merge_ngrams(result, size, frozenset(vocab))
    return result

In [45]:
def sent_tokenize(word_list, model=nlp):
    """Return the sentences of `word_list` as whitespace-stripped strings.

    The text is processed with `model` and one string is produced for each
    span in the resulting document's ``sents``.
    """
    sentences = []
    for span in model(word_list).sents:
        sentences.append(span.text.strip())
    return sentences

In [39]:
path = 'Environmental Discourse'

# Four-word phrases, stored as '_'-joined strings.
quadgrams = [
    '_'.join(words)
    for words in (
        ('intergovernmental', 'panel', 'climate', 'change'),
        ('natural', 'resources', 'defense', 'council'),
        ('coal', 'fired', 'power', 'plants'),
        ('national', 'oceanic', 'atmospheric', 'administration'),
    )
]

# Three-word phrases: read the CSV, parse its first column with
# ast.literal_eval, and keep the rows whose tag equals 1.
# NOTE(review): presumably the first column holds tuple literals -- confirm.
tr = pd.read_csv('../../Data/' + path + '/trigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
tr.columns = ['trigram', 'freq', 'tag']
trigrams = ['_'.join(words) for words in tr.loc[tr.tag == 1, 'trigram']]

# Two-word phrases: same layout and selection as the trigram file.
b = pd.read_csv('../../Data/' + path + '/bigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
b.columns = ['bigram', 'freq', 'tag']
bigrams = ['_'.join(words) for words in b.loc[b.tag == 1, 'bigram']]

In [31]:
import dask.dataframe as dd
from gensim.utils import effective_n_jobs





In [None]:
def _abstract_to_tokens(abstract):
    """Turn one abstract into a flat list of normalized, n-gram-tagged tokens.

    The abstract is split into sentences; each sentence is tokenized,
    normalized and n-gram tagged, and the per-sentence results are concatenated.
    """
    return [
        w
        for s in sent_tokenize(abstract)
        for w in ngram_tagger(normalizeTokens(word_tokenize(s)))
    ]


# Partition the frame (one partition per job reported by effective_n_jobs)
# so the abstracts can be processed by dask.
d_df = dd.from_pandas(df, npartitions=effective_n_jobs(-1))
# Each cell of `tokens` is a Python list, so the declared meta must be an
# object-dtype series; declaring `str` mis-describes the output to dask.
d_df['tokens'] = d_df.abstract.map(_abstract_to_tokens, meta=('tokens', object))
d_df['text_reconstructed'] = d_df.tokens.map(lambda x: ' '.join(x))
# Materialize the result back into a pandas DataFrame.
df = d_df.compute()

In [33]:
# Preview the first five rows, including the new token columns.
df.head(5)

Unnamed: 0,wos_id,issn,doi,title,pubtype,doctype,volume,issue,pubyear,pubmonth,pubday,source,abstract,tokens,text_reconstructed
62174,WOS:A1992JW66400007,0003-1224,,RACE AND JOB DISMISSALS IN A FEDERAL BUREAUCRACY,Journal,Article,57.0,5,1992.0,10.0,1.0,AMERICAN SOCIOLOGICAL REVIEW,We examine the racial differential in job dism...,"[examine, racial, differential, job, dismissal...",examine racial differential job dismissal unex...
62177,WOS:A1992JN26100011,0093-5301,,KNOWLEDGE DEVELOPMENT AND SCIENTIFIC STATUS IN...,Journal,Article,19.0,2,1992.0,9.0,1.0,JOURNAL OF CONSUMER RESEARCH,The communication patterns (1977 through 1988)...,"[communication, pattern, journal, consumer, re...",communication pattern journal consumer researc...
62179,WOS:A1992JT62300003,0012-9682,,ASYMPTOTIC EFFICIENCY IN LARGE EXCHANGE ECONOM...,Journal,Article,60.0,6,1992.0,11.0,1.0,ECONOMETRICA,We provide conditions on an exchange economy w...,"[provide, condition, exchange, economy, asymme...",provide condition exchange economy asymmetric ...
62188,WOS:A1992KB22900007,0002-8282,,"THE TIMING OF INTERGENERATIONAL TRANSFERS, TAX...",Journal,Article,82.0,5,1992.0,12.0,1.0,AMERICAN ECONOMIC REVIEW,We analyze an overlapping-generations framewor...,"[analyze, overlap, generation, framework, acco...",analyze overlap generation framework accommoda...
62189,WOS:A1992JR53400003,0034-6527,,THE POLITICS OF 1992 - FISCAL-POLICY AND EUROP...,Journal,Article,59.0,4,1992.0,10.0,1.0,REVIEW OF ECONOMIC STUDIES,The internal market in Europe will greatly inc...,"[internal, market, europe, greatly, increase, ...",internal market europe greatly increase intern...
