In [1]:
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Load the raw Web of Science export; the file is tab-separated.
wos = pd.read_csv('../../Data/WoS/wos.tab', sep='\t')

In [3]:
# Keep only records that have an abstract and whose doctype is 'Article'.
df = wos[wos.abstract.notna() & (wos.doctype == 'Article')]

In [4]:
# (rows, columns) left after the abstract / article filter.
df.shape

(27891, 13)

In [6]:
# Restrict the sample to publication years 1992 through 2019 (both ends included).
df = df[df.pubyear.between(1992, 2019)]

In [7]:
# Count the non-missing abstracts per publication year and draw them as a line chart.
annual_count = df.groupby('pubyear')[['abstract']].count()
px.line(annual_count)

In [18]:
# Journal sheet that is merged onto the abstracts via the `source` column.
disc = pd.read_excel('../../Figures/Wos Journals.xlsx', sheet_name='data')
# Upper-case the journal names with the vectorised string accessor: unlike a
# per-row `x.upper()` call, it leaves a missing cell as NaN instead of raising.
disc['source'] = disc.source.str.upper()


In [19]:
# Left-join the journal sheet onto the abstracts by `source`: every abstract row
# is kept, and `indicator=True` adds a `_merge` column recording whether the
# journal was found in `disc`.
# NOTE: this rebinds `df` to the merged frame, so the cell is not idempotent --
# re-run from the cell that builds `df` rather than executing this one twice.
df = df.merge(disc, on='source', how='left', indicator=True)

In [22]:
# (rows, columns) after the year filter and the journal merge.
df.shape

(27657, 15)

In [39]:
# One line per discipline: yearly number of non-missing abstracts.
fig = px.line(
    df.groupby(['pubyear', 'discipline']).abstract.count().reset_index(),
    x='pubyear',
    y='abstract',
    color='discipline',
    title='Figure XX: Web of Science Abstracts by Discipline, 1992-2019',
    labels=dict(discipline='Social Science', abstract='Number of Abstracts', pubyear='Year'),
)

# Print-style layout: serif black text, transparent plot background, boxed axes,
# and the legend placed inside the plotting area (upper left).
fig.update_layout(
    font_family='Times New Roman',
    font_color='black',
    plot_bgcolor='rgba(0,0,0,0)',
    xaxis={
        # linear ticks sit at tick0 + k * dtick, i.e. on every even year
        'tickmode': 'linear',
        'tick0': 4,
        'dtick': 2,
        'showline': True,
        'mirror': True,
        'linecolor': 'black',
    },
    yaxis={
        'showline': True,
        'mirror': True,
        'linecolor': 'black',
    },
    legend={
        'xanchor': 'left',
        'x': 0.1,
        'yanchor': 'top',
        'y': 0.95,
    },
)

fig.show()

In [37]:
import nltk
from nltk.util import ngrams
import spacy
import ast
# Load the spaCy English pipeline used as the default `model` of the helpers
# defined below. The "en" name is tried first; if spaCy raises OSError the
# explicit "en_core_web_sm" package is loaded instead.
# NOTE(review): presumably "en" only resolves on older spaCy installs -- confirm.
try:
    nlp = spacy.load("en")
except OSError:
    nlp = spacy.load("en_core_web_sm")

In [21]:
def word_tokenize(word_list, model=nlp, MAX_LEN=1500000):
    """Tokenize text with spaCy, dropping punctuation and blank tokens.

    Parameters
    ----------
    word_list : str or list
        Text to tokenize. A one-element list is unwrapped to its element; any
        (remaining) list is turned into a single string by joining the
        ``str()`` of its elements with spaces.
    model : spaCy pipeline
        Pipeline called on the text; defaults to the notebook-level ``nlp``.
    MAX_LEN : int
        Value written to ``model.max_length`` before the text is processed.
        The assignment is made on the model object itself, so it stays in
        effect after the call.

    Returns
    -------
    list of str
        ``token.text`` for every token that is not punctuation and is not
        empty after stripping whitespace.
    """
    text = word_list
    if type(text) == list and len(text) == 1:
        text = text[0]
    if type(text) == list:
        text = ' '.join(map(str, text))

    # Only the tokenizer output is needed, so the heavier pipeline components
    # are switched off for this call and the length limit is raised.
    model.max_length = MAX_LEN
    parsed = model(text, disable=["parser", "tagger", "ner", "lemmatizer"])

    return [tok.text for tok in parsed
            if not tok.is_punct and len(tok.text.strip()) > 0]

In [26]:
def normalizeTokens(word_list, extra_stop=[], model=nlp, lemma=True, MAX_LEN=1500000):
    """Lower-case text and return its content tokens (optionally lemmatized).

    Parameters
    ----------
    word_list : str or list
        Text to normalize. A one-element list is unwrapped to its element; any
        (remaining) list is turned into a single string by joining the
        ``str()`` of its elements with spaces.
    extra_stop : sequence of str
        Additional words to treat as stop words. They are flagged on
        ``model.vocab``, which is a lasting change to that vocabulary. The
        text is lower-cased before parsing, so entries should presumably be
        lower-case as well. The default list is never mutated here.
    model : spaCy pipeline
        Pipeline called on the text; defaults to the notebook-level ``nlp``.
    lemma : bool
        If True return ``token.lemma_``; otherwise the stripped ``token.text``.
    MAX_LEN : int
        Value written to ``model.max_length`` before the text is processed
        (the assignment persists on the model after the call).

    Returns
    -------
    list of str
        One entry per token that is not a newline, stop word, punctuation
        mark or number-like token and is not empty after stripping.
    """
    if type(word_list) == list and len(word_list) == 1:
        word_list = word_list[0]

    if type(word_list) == list:
        word_list = ' '.join([str(elem) for elem in word_list])

    # Flag the extra stop words on the vocabulary of the pipeline that parses
    # the text. The earlier version always used the global `nlp.vocab`, so the
    # words were ignored whenever a different `model` was passed in.
    for stopword in extra_stop:
        model.vocab[stopword].is_stop = True

    # Only normalizing: skip the RAM-intensive components and raise the limit.
    model.max_length = MAX_LEN
    doc = model(word_list.lower(), disable=["parser", "ner"])

    def _is_content(tok):
        # Same filter for both output modes.
        return (tok.text != '\n'
                and not tok.is_stop
                and not tok.is_punct
                and not tok.like_num
                and len(tok.text.strip()) > 0)

    # `lemma` is checked once, outside the loops, rather than per token.
    if lemma:
        return [str(tok.lemma_) for tok in doc if _is_content(tok)]
    return [str(tok.text.strip()) for tok in doc if _is_content(tok)]

In [34]:
def _merge_ngrams(tokens, size, vocab):
    """Greedy left-to-right pass that joins `size` adjacent tokens with '_' when the joined string is in `vocab`."""
    merged = []
    i, n = 0, len(tokens)
    while i < n:
        candidate = '_'.join(tokens[i:i + size])
        if candidate in vocab:
            merged.append(candidate)
            i += size  # skip every token swallowed by the match
        else:
            merged.append(tokens[i])
            i += 1
    return merged


def ngram_tagger(tokens, quadgram_vocab=None, trigram_vocab=None, bigram_vocab=None):
    """Replace known multi-word phrases in a token list by single '_'-joined tokens.

    Three passes are run in order -- four-word phrases, then three-word, then
    two-word -- each one working on the output of the previous pass.

    Parameters
    ----------
    tokens : list of str
        Tokens of one sentence/document. The list is not modified.
    quadgram_vocab, trigram_vocab, bigram_vocab : container of str, optional
        Phrases written as '_'-joined strings (e.g. ``'climate_change'``).
        When left as None the notebook-level ``quadgrams`` / ``trigrams`` /
        ``bigrams`` are used, which is what the original one-argument call
        did. Any container supporting ``in`` works; a set makes lookups
        cheaper than a list.

    Returns
    -------
    list of str
        New token list with matched phrases merged.
    """
    if quadgram_vocab is None:
        quadgram_vocab = quadgrams
    if trigram_vocab is None:
        trigram_vocab = trigrams
    if bigram_vocab is None:
        bigram_vocab = bigrams

    # Longest phrases first, so a shorter phrase cannot break up a longer one.
    for size, vocab in ((4, quadgram_vocab), (3, trigram_vocab), (2, bigram_vocab)):
        tokens = _merge_ngrams(tokens, size, vocab)
    return tokens

In [45]:
def sent_tokenize(word_list, model=nlp):
    """Split a text into sentences with the spaCy pipeline `model`.

    `word_list` is passed to `model` as-is; the stripped text of every
    sentence span in the resulting document is returned as a list of str.
    """
    sentences = []
    for span in model(word_list).sents:
        sentences.append(span.text.strip())
    return sentences

In [39]:
# Sub-folder of ../../Data/ that holds the n-gram tables.
path = 'Environmental Discourse'

# Four-word phrases are listed by hand and stored as '_'-joined strings,
# the form that ngram_tagger looks up.
quadgrams = ['_'.join(parts) for parts in [
    ('intergovernmental', 'panel', 'climate', 'change'),
    ('natural', 'resources', 'defense', 'council'),
    ('coal', 'fired', 'power', 'plants'),
    ('national', 'oceanic', 'atmospheric', 'administration'),
]]

# Three- and two-word phrases come from CSV tables. The first column
# ('Unnamed: 0') is parsed with ast.literal_eval (presumably into tuples of
# words) and only rows with tag == 1 are kept.
tr = pd.read_csv('../../Data/' + path + '/trigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
tr.columns = ['trigram', 'freq', 'tag']
trigrams = ['_'.join(parts) for parts in tr.loc[tr.tag == 1, 'trigram']]

b = pd.read_csv('../../Data/' + path + '/bigrams.csv', converters={'Unnamed: 0': ast.literal_eval})
b.columns = ['bigram', 'freq', 'tag']
bigrams = ['_'.join(parts) for parts in b.loc[b.tag == 1, 'bigram']]

In [31]:
import dask.dataframe as dd
from gensim.utils import effective_n_jobs





In [46]:
def _abstract_to_tokens(abstract):
    """Sentence-split one abstract, then tokenize, normalize and n-gram-tag each sentence into one flat token list."""
    return [tok
            for sentence in sent_tokenize(abstract)
            for tok in ngram_tagger(normalizeTokens(word_tokenize(sentence)))]


# Spread the frame over one partition per available job and tokenize lazily.
d_df = dd.from_pandas(df, npartitions=effective_n_jobs(-1))
# Every cell of the new column is a Python list, so the meta is declared with
# the column's real name and `object` dtype (it used to be ('x', str)).
d_df['tokens'] = d_df.abstract.map(_abstract_to_tokens, meta=('tokens', object))
d_df['text_reconstructed'] = d_df.tokens.map(lambda toks: ' '.join(toks))
# Run the computation and collect the result back into a pandas frame.
df = d_df.compute()

In [48]:
# Save the processed frame (with the `tokens` and `text_reconstructed` columns)
# so the tokenisation step does not have to be repeated.
df.to_pickle('../../Data/WoS/wos_processed.pkl')

## Explore

In [50]:
# Flag abstracts whose token list contains the token 'climate_change'.
df['climate_change'] = df.tokens.map(lambda toks: 'climate_change' in toks)

In [54]:
# Number of flagged abstracts per journal.
df.loc[df.climate_change, 'source'].value_counts()

JOURNAL OF PEASANT STUDIES               22
SOCIAL SCIENCE QUARTERLY                 11
BRITISH JOURNAL OF POLITICAL SCIENCE      8
AMERICAN ECONOMIC REVIEW                  7
PERSPECTIVES ON PSYCHOLOGICAL SCIENCE     6
PSYCHOLOGICAL SCIENCE                     5
AMERICAN JOURNAL OF POLITICAL SCIENCE     5
SOCIAL SCIENCE RESEARCH                   3
SOCIAL FORCES                             3
JOURNAL OF POLITICAL ECONOMY              2
POLITICAL ANALYSIS                        1
AMERICAN JOURNAL OF SOCIOLOGY             1
ANTHROPOLOGICAL THEORY                    1
HUMAN COMMUNICATION RESEARCH              1
ECONOMETRICA                              1
Name: source, dtype: int64

In [55]:
# Flag abstracts whose token list contains the token 'global_warming'.
df['global_warming'] = df.tokens.map(lambda toks: 'global_warming' in toks)

In [56]:
# Number of flagged abstracts per journal.
df.loc[df.global_warming, 'source'].value_counts()

SOCIAL SCIENCE QUARTERLY                 3
PSYCHOLOGICAL SCIENCE                    3
AMERICAN ECONOMIC REVIEW                 2
JOURNAL OF POLITICAL ECONOMY             2
SOCIAL FORCES                            2
AMERICAN JOURNAL OF POLITICAL SCIENCE    1
BRITISH JOURNAL OF POLITICAL SCIENCE     1
Name: source, dtype: int64

Very few of these abstracts mention climate change or global warming: 77 contain the `climate_change` token and 14 contain `global_warming`, out of 27,657 articles.

In [57]:
# Flag abstracts whose token list contains the exact token 'environment'
# (a whole-token match, whatever sense the word is used in).
df['environment'] = df.tokens.map(lambda toks: 'environment' in toks)

In [58]:
# Number of flagged abstracts per journal.
df.loc[df.environment, 'source'].value_counts()

PSYCHOLOGICAL SCIENCE                    225
SOCIAL SCIENCE QUARTERLY                 112
SOCIAL SCIENCE RESEARCH                   84
REVIEW OF ECONOMIC STUDIES                84
ECONOMETRICA                              80
AMERICAN ECONOMIC REVIEW                  76
AMERICAN JOURNAL OF POLITICAL SCIENCE     76
AMERICAN SOCIOLOGICAL REVIEW              63
SOCIAL FORCES                             62
JOURNAL OF CONSUMER RESEARCH              60
QUARTERLY JOURNAL OF ECONOMICS            51
AMERICAN POLITICAL SCIENCE REVIEW         51
PSYCHOLOGICAL REVIEW                      49
PERSPECTIVES ON PSYCHOLOGICAL SCIENCE     47
JOURNAL OF POLITICAL ECONOMY              43
AMERICAN JOURNAL OF SOCIOLOGY             43
HUMAN COMMUNICATION RESEARCH              36
PSYCHOLOGICAL BULLETIN                    30
BRITISH JOURNAL OF POLITICAL SCIENCE      27
JOURNAL OF PEASANT STUDIES                20
POLITICAL ANALYSIS                         6
ANNUAL REVIEW OF POLITICAL SCIENCE         4
ANTHROPOLO