In [48]:
import pandas as pd
import numpy as np

Read in topic modeling data

In [4]:
# Load the processed-text CSV; its first column is used as the row index.
env = pd.read_csv('../Data/Environmental Discourse/env_processed_text.csv', index_col=0)

In [5]:
# Peek at the first five rows to sanity-check the load.
env.head()

Unnamed: 0,url,title,date,text_processed
4,https://www.resilience.org/stories/2021-07-12/...,The show is over,2021-07-12,piece originally publish july tomorrow week pa...
5,https://www.resilience.org/stories/2021-07-12/...,The Energy Bulletin Weekly 12 July 2021,2021-07-12,tom whipple steve andrews editor quote week sa...
7,https://www.resilience.org/stories/2021-07-12/...,On the Existence of A Man,2021-07-12,philosopher mystic time show connect human uni...
9,https://www.resilience.org/stories/2021-07-08/...,"Climate change, domination and the temporality...",2021-07-08,master possesor nature meaning enslave society...
11,https://www.resilience.org/stories/2021-07-07/...,Least worst politics,2021-07-07,part ii small farm future build argument local...


In [6]:
# (n_rows, n_columns) of the loaded frame.
env.shape

(46825, 4)

In [9]:
# Tokenize on whitespace. `.str.split()` with no pattern behaves like `str.split()`.
# `fillna('')` maps missing text (empty CSV cells are read back as NaN) to an
# empty token list instead of raising AttributeError on a float.
env['tokens'] = env['text_processed'].fillna('').str.split()

In [10]:
# Fixed seed so the 500-row subsample (and everything derived from it) is
# reproducible across kernel restarts.
env_small = env.sample(500, random_state=0)

Read in word embedding data

In [23]:
# Load one pickled frame per file suffix 05..21 (zero-padded to two digits).
# Later cells label dfs[i] as year 2005 + i.
# NOTE: pd.read_pickle can execute arbitrary code; only load trusted files.
dfs = []
for suffix in range(5, 22):
    padded = str(suffix).zfill(2)
    path = '../Data/Environmental Discourse/sent_processing/env_processed_sent_{}.pkl'.format(padded)
    dfs.append(pd.read_pickle(path))

[['autism',
  'linked',
  'environmental',
  'factors',
  'research',
  'says',
  'new',
  'study',
  'looking',
  'possible',
  'environmental',
  'causes',
  'autism',
  'neurological',
  'disorder',
  'affects',
  'communication',
  'social',
  'interaction',
  'abilities',
  'demonstrates',
  'suite',
  'pollutants',
  'working',
  'combination',
  'critically',
  'affect',
  'developing',
  'embryo'],
 ['research',
  'focused',
  'chemicals',
  'found',
  'decade',
  'ago',
  'brick',
  'n.j',
  'area',
  'unusually',
  'high',
  'rate',
  'disorder'],
 ['federal',
  'study',
  'dismissed',
  'connection',
  'pollutants',
  'autism',
  'found',
  'children',
  'brick',
  'residents',
  'pressed',
  'study',
  'story',
  'reached',
  'team',
  'scientists',
  'decided',
  'look',
  'effects',
  'contaminants',
  'combination'],
 ['said',
  'carol',
  'l.',
  'reinisch',
  'marine',
  'biological',
  'laboratory',
  'woods',
  'hole',
  'mass',
  'led',
  'research',
  'community',


In [50]:
def get_wc(word, data):
    '''
    Count how many times `word` occurs in `data`.

    `data` is an iterable of token sequences (a list of lists); every token
    equal to `word`, across all inner sequences, adds one to the total.
    '''
    return sum(1 for tokens in data for token in tokens if token == word)

In [19]:
def get_adjacent_words(word, data):
    '''
    Count the tokens that appear immediately before and after `word`.

    Data to be provided as a list of lists (one inner list of tokens per
    sentence/document). Neighbours are only looked up inside the same inner
    list, so adjacency never crosses list boundaries.

    Parameters
    ----------
    word : token to search for.
    data : iterable of indexable token sequences.

    Returns
    -------
    (before, after) : tuple of pd.Series
        `value_counts()` of the preceding / following tokens: the index holds
        the neighbouring token, the values its count, in descending-count
        order. Both Series are empty when `word` has no neighbours.
    '''
    after = []
    before = []

    for word_list in data:
        last = len(word_list) - 1
        for i, w in enumerate(word_list):
            if w != word:
                continue
            if i < last:
                after.append(word_list[i + 1])
            if i > 0:
                before.append(word_list[i - 1])

    # An explicit dtype keeps the no-match case well defined: without it an
    # empty list falls back to pandas' default empty-Series dtype (float64 plus
    # a DeprecationWarning on pandas 1.x).
    return (
        pd.Series(before, dtype=object).value_counts(),
        pd.Series(after, dtype=object).value_counts(),
    )

In [20]:
# Neighbour counts for 'environmental' over the token lists of the 500-row sample.
before, after = get_adjacent_words('environmental', env_small.tokens)

In [27]:
# Same lookup on a random 500-row sample of the first pickled frame.
# NOTE(review): `.sents.sum()` presumably concatenates per-row lists of tokenized
# sentences into one list of lists — confirm against the pickled schema.
before, after = get_adjacent_words('environmental', dfs[0].sample(500).sents.sum())

In [43]:
# For every yearly frame, report how prominent 'justice' is among the tokens
# that directly follow 'environmental'. dfs[i] is labelled as year 2005 + i.
# The 500-row sample is unseeded, so the numbers vary between runs.
for i, year_df in enumerate(dfs):
    before, after = get_adjacent_words('environmental', year_df.sample(500).sents.sum())

    if 'justice' in after.index:
        # Rank is the 0-based position in the descending-count ordering.
        rank = after.index.get_loc('justice')
        freq = after.loc['justice']
        pct = 100 * freq / after.sum()
    else:
        # A random sample may contain no 'environmental justice' at all;
        # report that instead of letting get_loc/.loc raise KeyError.
        rank = np.nan
        freq = 0
        pct = 0

    print("Year:", 2005 + i)
    print("Rank:", rank)
    print("Freqeuncy:", freq)
    print("Percentage:", pct)
    print("")

Year: 2005
Rank: 32
Freqeuncy: 4
Percentage: 0.6791171477079796

Year: 2006
Rank: 32
Freqeuncy: 3
Percentage: 0.746268656716418

Year: 2007
Rank: 33
Freqeuncy: 3
Percentage: 0.7109004739336493

Year: 2008
Rank: 5
Freqeuncy: 6
Percentage: 2.3622047244094486

Year: 2009
Rank: 56
Freqeuncy: 2
Percentage: 0.5524861878453039

Year: 2010
Rank: 28
Freqeuncy: 5
Percentage: 1.152073732718894

Year: 2011
Rank: 12
Freqeuncy: 8
Percentage: 1.5238095238095237

Year: 2012
Rank: 71
Freqeuncy: 1
Percentage: 0.3546099290780142

Year: 2013
Rank: 1
Freqeuncy: 13
Percentage: 3.812316715542522

Year: 2014
Rank: 1
Freqeuncy: 21
Percentage: 4.805491990846682

Year: 2015
Rank: 0
Freqeuncy: 47
Percentage: 9.021113243761997

Year: 2016
Rank: 1
Freqeuncy: 25
Percentage: 4.520795660036167

Year: 2017
Rank: 0
Freqeuncy: 58
Percentage: 7.561929595827901

Year: 2018
Rank: 0
Freqeuncy: 30
Percentage: 3.8412291933418694

Year: 2019
Rank: 0
Freqeuncy: 73
Percentage: 10.5643994211288

Year: 2020
Rank: 0
Freqeuncy: 162
P

In [49]:
# Per-year prominence of `word2` among the tokens that directly follow `word1`.
# dfs[i] is labelled as year 2005 + i; rank is the 0-based position in the
# descending-count ordering, NaN when the pair never occurs in the sample.
word1 = 'environmental'
word2 = 'racism'

for i in range(17):
    before, after = get_adjacent_words(word1, dfs[i].sample(500).sents.sum())

    if word2 in after.index:
        rank = after.index.get_loc(word2)
        freq = after.loc[word2]
        pct = 100 * freq / after.sum()
    else:
        # Pair absent from this sample: same fallback values as a KeyError.
        rank, freq, pct = np.nan, 0, 0

    report = (
        ("Year:", 2005 + i),
        ("Rank:", rank),
        ("Freqeuncy:", freq),
        ("Percentage:", pct),
    )
    for label, value in report:
        print(label, value)
    print("")

Year: 2005
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2006
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2007
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2008
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2009
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2010
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2011
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2012
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2013
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2014
Rank: 55
Freqeuncy: 2
Percentage: 0.4672897196261682

Year: 2015
Rank: 14
Freqeuncy: 7
Percentage: 1.2089810017271156

Year: 2016
Rank: 27
Freqeuncy: 5
Percentage: 0.9107468123861566

Year: 2017
Rank: 43
Freqeuncy: 3
Percentage: 0.3931847968545216

Year: 2018
Rank: nan
Freqeuncy: 0
Percentage: 0

Year: 2019
Rank: 40
Freqeuncy: 5
Percentage: 0.6426735218508998

Year: 2020
Rank: 40
Freqeuncy: 5
Percentage: 0.5500550055005501

Year: 2021
Rank: 13
Freqeuncy: 16
Percentage: 1.2759170653907497



In [55]:
# Occurrences of 'extraction' in a random 500-row sample of each of the 17 frames.
for i in range(17):
    sampled_sents = dfs[i].sample(500).sents.sum()
    extraction_count = get_wc('extraction', sampled_sents)
    print(extraction_count)

45
23
18
6
28
16
53
30
34
39
65
68
72
52
47
79
68


In [41]:
# Total number of follower tokens counted in `after` (the denominator used for
# the percentage figures).
after.sum()

621