In [1]:
from gensim.models import Word2Vec
from scipy.spatial.distance import cosine
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [2]:
with open ('./Guardian_Pre.pkl', 'rb') as p:
    guardian_pre = pickle.load(p)
    guardianpre_transmat = guardian_pre[2]
    guardian_pre = dict(zip(guardian_pre[0], guardian_pre[1]))

with open ('./Guardian_Post.pkl', 'rb') as p:
    guardian_post = pickle.load(p)
    guardianpost_transmat = guardian_post[2]
    guardian_post = dict(zip(guardian_post[0], guardian_post[1]))
    
with open ('./Daily Mail_Pre.pkl', 'rb') as p:
    dm_pre = pickle.load(p)
    dmpre_transmat = dm_pre[2]
    dm_pre = dict(zip(dm_pre[0], dm_pre[1]))

with open ('./Daily Mail_Post.pkl', 'rb') as p:
    dm_post = pickle.load(p)
    dmpost_transmat = dm_post[2]
    dm_post = dict(zip(dm_post[0], dm_post[1]))

In [3]:
common_words = list(
    set.intersection(
    set(guardian_pre.keys()), 
    set(guardian_post.keys()),
    set(dm_pre.keys()),
    set(dm_post.keys())
    )
)
len(common_words)

2979

In [4]:
guardian_change = dict(zip(common_words, np.array([guardian_post[w] - guardian_pre[w] for w in common_words])))
dm_change = dict(zip(common_words, np.array([dm_post[w] - dm_pre[w] for w in common_words])))

In [5]:
change_df = pd.DataFrame({
    'word': common_words,
    'change_similarity': [1 - cosine(guardian_change[w], dm_change[w]) for w in common_words]
})

### Most Polarising Words

In [22]:
change_df.sort_values('change_similarity', ascending = True).head(50)

Unnamed: 0,word,change_similarity
405,honest,-0.36592
1104,profile,-0.353575
1189,forest,-0.345167
1250,sleep,-0.340956
2441,attach,-0.315799
2093,worth,-0.305722
171,constantly,-0.299763
2044,numerous,-0.298154
2202,friend,-0.286876
2021,equal,-0.281106


### Least Polarising Words

In [23]:
change_df.sort_values('change_similarity', ascending = False).head(50)

Unnamed: 0,word,change_similarity
895,2016,0.522402
385,2015,0.509888
2779,2013,0.439129
728,2014,0.381713
1047,legislation,0.363693
2582,become,0.346639
2592,importance,0.336429
2461,north korea,0.330585
1205,principle,0.330336
1868,philippines,0.328946


# Foreign Policy

# Similar changes

A lot of countries seem to have changed in similar direction between the two outlets. Lets look at Russia for example.

In [24]:
1 - cosine(guardian_change['russia'], dm_change['russia']) 

0.2939436435699463

In [28]:
simchan = [(w, 1 - cosine(guardian_change['russia'], guardian_post[w])) for w in common_words]
simchan.sort(key = lambda x: x[1], reverse = True)
print('Top 10 Closest Words to Change')
simchan[0:10]

Top 10 Closest Words to Change


[('raid', 0.24807405471801758),
 ('investigate', 0.2474696934223175),
 ('assault', 0.24744580686092377),
 ('arrest', 0.2356753945350647),
 ('guilty', 0.23482714593410492),
 ('escape', 0.23182569444179535),
 ('overturn', 0.22936154901981354),
 ('trace', 0.22883063554763794),
 ('perpetrator', 0.22597208619117737),
 ('hide', 0.2254197597503662)]

In [30]:
print('Top 10 Distant Words to Change')
simchan[-10:]

Top 10 Distant Words to Change


[('import', -0.3016241490840912),
 ('euro', -0.30785349011421204),
 ('gas', -0.3084356188774109),
 ('steel', -0.30867668986320496),
 ('industry', -0.31018462777137756),
 ('gold', -0.31270831823349),
 ('jet', -0.3280877470970154),
 ('energy', -0.3385530710220337),
 ('oil', -0.34803643822669983),
 ('market', -0.356217622756958)]

# Changes in Opposite Directions

## Beijing

### Daily Mail

In [32]:
simchan = [(w, 1 - cosine(dm_change['beijing'], dm_post[w])) for w in common_words]
simchan.sort(key = lambda x: x[1], reverse = True)
print('Top 10 Closest Words to Change')
simchan[0:10]

Top 10 Closest Words to Change


[('undertake', 0.2703135311603546),
 ('beijing', 0.24956800043582916),
 ('service', 0.24872776865959167),
 ('assistance', 0.24673257768154144),
 ('export', 0.23308145999908447),
 ('investment', 0.22976772487163544),
 ('application', 0.22653239965438843),
 ('organisation', 0.2263706624507904),
 ('commander', 0.22329914569854736),
 ('supplier', 0.22234591841697693)]

In [33]:
print('Top 10 Distant Words to Change')
simchan[-10:]

Top 10 Distant Words to Change


[('constituency', -0.25596514344215393),
 ('mouth', -0.2589796483516693),
 ('scare', -0.2589893937110901),
 ('texas', -0.2614404857158661),
 ('old', -0.2661895155906677),
 ('trouble', -0.2663823962211609),
 ('string', -0.27048492431640625),
 ('rarely', -0.27249789237976074),
 ('afraid', -0.27864041924476624),
 ('scared', -0.2791215777397156)]

### Guardian

In [34]:
simchan = [(w, 1 - cosine(guardian_change['beijing'], guardian_post[w])) for w in common_words]
simchan.sort(key = lambda x: x[1], reverse = True)
print('Top 10 Closest Words to Change')
simchan[0:10]

Top 10 Closest Words to Change


[('beijing', 0.4008871614933014),
 ('somebody', 0.3350100815296173),
 ('unfair', 0.32837921380996704),
 ('everything', 0.32667508721351624),
 ('lot_of_people', 0.3201741874217987),
 ('someone', 0.30441147089004517),
 ("n't_think", 0.2978893518447876),
 ('ordinary', 0.29530584812164307),
 ('honest', 0.29243141412734985),
 ('nice', 0.2917780578136444)]

In [35]:
print('Top 10 Distant Words to Change')
simchan[-10:]

Top 10 Distant Words to Change


[('resume', -0.1954725831747055),
 ('partnership', -0.19810833036899567),
 ('emergency', -0.1984114795923233),
 ('service', -0.20034588873386383),
 ('unit', -0.21551457047462463),
 ('attend', -0.2208581566810608),
 ('rank', -0.2291947901248932),
 ('three_year', -0.24972888827323914),
 ('session', -0.2639145255088806),
 ('schedule', -0.27402547001838684)]

## China

In [36]:
1 - cosine(guardian_change['china'], dm_change['china']) 

0.052781831473112106

### Daily Mail

In [38]:
simchan = [(w, 1 - cosine(dm_change['china'], dm_post[w])) for w in common_words]
simchan.sort(key = lambda x: x[1], reverse = True)
print('Top 10 Closest Words to Change')
simchan[0:10]

Top 10 Closest Words to Change


[('ally', 0.3023122549057007),
 ('leader', 0.28770801424980164),
 ('adviser', 0.280696302652359),
 ('attacker', 0.2754685878753662),
 ('ball', 0.2751913070678711),
 ('sanction', 0.2679229974746704),
 ('intervention', 0.2655593454837799),
 ('commander', 0.26430821418762207),
 ('aide', 0.2626778483390808),
 ('weapon', 0.26155105233192444)]

In [41]:
print('Top 10 Distant Words to Change')
simchan[-10:]

Top 10 Distant Words to Change


[('hong kong', -0.2294284999370575),
 ('2,000', -0.22972124814987183),
 ('thousand', -0.23895806074142456),
 ('100,000', -0.24575449526309967),
 ('65', -0.24616773426532745),
 ('80', -0.24673137068748474),
 ('mostly', -0.2601843774318695),
 ('every_year', -0.2660401165485382),
 ('handful', -0.28748205304145813),
 ('rarely', -0.34868142008781433)]

### Guardian

In [42]:
simchan = [(w, 1 - cosine(guardian_change['china'], guardian_post[w])) for w in common_words]
simchan.sort(key = lambda x: x[1], reverse = True)
print('Top 10 Closest Words to Change')
simchan[0:10]

Top 10 Closest Words to Change


[('deserve', 0.37707704305648804),
 ('take_action', 0.36520788073539734),
 ('make_sure', 0.3183402717113495),
 ('pursue', 0.30869999527931213),
 ('seriously', 0.3000405430793762),
 ('come_forward', 0.2922962009906769),
 ('punish', 0.28934183716773987),
 ('fight', 0.2882879078388214),
 ('assure', 0.2869657576084137),
 ('accusation', 0.2866135239601135)]