In [2]:
import json
import os
import pickle
from abc import ABC

import gensim
import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
from nltk.corpus import stopwords
from sklearn.cluster import KMeans
from sklearn.cross_decomposition import CCA
from sklearn.metrics.pairwise import cosine_similarity

#nltk.download('punkt')
#nltk.download('stopwords')

In [5]:
stopWords_nltk = set(stopwords.words('english'))
stopWords_spc = {'those', 'on', 'own', '’ve', 'yourselves', 'around', 'between', 'four', 'been', 'alone', 'off', 'am',
                 'then', 'other', 'can', 'regarding', 'hereafter', 'front', 'too', 'used', 'wherein', '‘ll', 'doing',
                 'everything', 'up', 'onto', 'never', 'either', 'how', 'before', 'anyway', 'since', 'through', 'amount',
                 'now', 'he', 'was', 'have', 'into', 'because', 'not', 'therefore', 'they', 'n’t', 'even', 'whom', 'it',
                 'see', 'somewhere', 'thereupon', 'nothing', 'whereas', 'much', 'whenever', 'seem', 'until', 'whereby',
                 'at', 'also', 'some', 'last', 'than', 'get', 'already', 'our', 'once', 'will', 'noone', "'m", 'that',
                 'what', 'thus', 'no', 'myself', 'out', 'next', 'whatever', 'although', 'though', 'which', 'would',
                 'therein', 'nor', 'somehow', 'whereupon', 'besides', 'whoever', 'ourselves', 'few', 'did', 'without',
                 'third', 'anything', 'twelve', 'against', 'while', 'twenty', 'if', 'however', 'herself', 'when', 'may',
                 'ours', 'six', 'done', 'seems', 'else', 'call', 'perhaps', 'had', 'nevertheless', 'where', 'otherwise',
                 'still', 'within', 'its', 'for', 'together', 'elsewhere', 'throughout', 'of', 'others', 'show', '’s',
                 'anywhere', 'anyhow', 'as', 'are', 'the', 'hence', 'something', 'hereby', 'nowhere', 'latterly', 'say',
                 'does', 'neither', 'his', 'go', 'forty', 'put', 'their', 'by', 'namely', 'could', 'five', 'unless',
                 'itself', 'is', 'nine', 'whereafter', 'down', 'bottom', 'thereby', 'such', 'both', 'she', 'become',
                 'whole', 'who', 'yourself', 'every', 'thru', 'except', 'very', 'several', 'among', 'being', 'be',
                 'mine', 'further', 'n‘t', 'here', 'during', 'why', 'with', 'just', "'s", 'becomes', '’ll', 'about',
                 'a', 'using', 'seeming', "'d", "'ll", "'re", 'due', 'wherever', 'beforehand', 'fifty', 'becoming',
                 'might', 'amongst', 'my', 'empty', 'thence', 'thereafter', 'almost', 'least', 'someone', 'often',
                 'from', 'keep', 'him', 'or', '‘m', 'top', 'her', 'nobody', 'sometime', 'across', '‘s', '’re',
                 'hundred', 'only', 'via', 'name', 'eight', 'three', 'back', 'to', 'all', 'became', 'move', 'me', 'we',
                 'formerly', 'so', 'i', 'whence', 'under', 'always', 'himself', 'in', 'herein', 'more', 'after',
                 'themselves', 'you', 'above', 'sixty', 'them', 'your', 'made', 'indeed', 'most', 'everywhere',
                 'fifteen', 'but', 'must', 'along', 'beside', 'hers', 'side', 'former', 'anyone', 'full', 'has',
                 'yours', 'whose', 'behind', 'please', 'ten', 'seemed', 'sometimes', 'should', 'over', 'take', 'each',
                 'same', 'rather', 'really', 'latter', 'and', 'ca', 'hereupon', 'part', 'per', 'eleven', 'ever', '‘re',
                 'enough', "n't", 'again', '‘d', 'us', 'yet', 'moreover', 'mostly', 'one', 'meanwhile', 'whither',
                 'there', 'toward', '’m', "'ve", '’d', 'give', 'do', 'an', 'quite', 'these', 'everyone', 'towards',
                 'this', 'cannot', 'afterwards', 'beyond', 'make', 'were', 'whether', 'well', 'another', 'below',
                 'first', 'upon', 'any', 'none', 'many', 'serious', 'various', 're', 'two', 'less', '‘ve'}
stopWords_s = stopWords_spc | stopWords_nltk | STOPWORDS

In [3]:
model_general = gensim.models.Word2Vec.load(os.path.join('./models', 'general.model'))

FASTEXTW2V = os.path.join('./models', 'wiki-news-300d-1M-subword.vec')
groundw2v = KeyedVectors.load_word2vec_format(FASTEXTW2V)

In [107]:
vocab = list(sorted(list(model_general.wv.index_to_key)))
vocab = [w for w in vocab if w not in stopWords_s]
for idx, w in enumerate(vocab):
    if w not in groundw2v.key_to_index.keys():
        vocab[idx] = w.title()
mtx = np.vstack([groundw2v[w] for w in vocab if w in groundw2v.key_to_index.keys()])
drop_words = [w for w in vocab if w not in groundw2v.key_to_index.keys()]
print(drop_words)

['Aclu', 'Aew', 'Aoc', 'Arbery', 'Backgrid', 'Bankman', 'Bhr', 'Blm', 'Bobulinski', 'Bodycam', 'Boebert', 'Bolsonaro', 'Breonna', 'Cefc', 'Cnbc', 'Cnnmoney', 'Cnnopinion', 'Cnp', 'Covid', 'Crimo', 'Crumbley', 'Dcpi', 'Depape', 'Desantis', 'Doj', 'Fdny', 'Filmmagic', 'Ftx', 'Gofundme', 'Heastie', 'Hhs', 'Iaea', 'Ioc', 'Istockphoto', 'Jpmorgan', 'Keivom', 'Kohberger', 'Kuleba', 'Kyrsten', 'Lapd', 'Laundrie', 'Lgbtq', 'Lightrocket', 'Malliotakis', 'Martinka', 'Mcauliffe', 'Mccabe', 'Mccaul', 'Mcconnell', 'Mcdermott', 'Mcmaster', 'Mcmichael', 'Metoo', 'Murdaugh', 'Naacp', 'Nbcu', 'Nurphoto', 'Nypost', 'Prigozhin', 'Qanon', 'Realdonaldtrump', 'Rnc', 'Saipov', 'Snl', 'Somodevilla', 'Sondland', 'Spacex', 'Splashnews', 'Swns', 'Tiktok', 'Tlaib', 'Tmz', 'Vindman', 'Vucci', 'Wenzelberg', 'Wireimage', 'Wnba', 'Youngkin', 'Yovanovitch', 'Zelensky', 'Zumapress']


In [108]:
clustering = KMeans(n_clusters=300).fit(mtx)
res = {}
for c, w in zip(clustering.labels_, vocab):
    c = str(c)
    if c not in res:
        res[c] = []
    res[c].append(w)
json.dump(res, open(os.path.join('./models', 'ground_clustering.tpc'), 'w'))

KeyboardInterrupt: 

In [38]:
t_num = []
t_words = []
t_name = []
for k in res.keys():
    t_num.append(k)
    t_words.append(res[k])
    t_name.append([])
pd.DataFrame({'id': t_num, 'name': t_name, 'words': t_words}).to_csv('./ground_topics.csv')

In [None]:
vocab = list(sorted(list(model_general.wv.index_to_key)))
vocab = [w for w in vocab if w not in stopWords_s]
mtx = np.vstack([model_general.wv[w] for w in vocab])

clustering = KMeans(n_clusters=300).fit(mtx)
res = {}
t_align = {}
for c, w in zip(clustering.labels_, vocab):
    t_align[w] = c
    c = str(c)
    if c not in res:
        res[c] = []
    res[c].append(w)

In [43]:
#json.dump(t_align, open(os.path.join('./models', 'news_clustering.ali'), 'w'))
json.dump(res, open(os.path.join('./models', 'news_clustering.tpc'), 'w'))

t_num = []
t_words = []
t_name = []
for k in res.keys():
    t_num.append(k)
    t_words.append(res[k])
    t_name.append([])
pd.DataFrame({'id': t_num, 'name': t_name, 'words': t_words}).to_csv('./news_topics.csv')

In [3]:
model_general=gensim.models.Word2Vec.load(os.path.join('./models', 'general.model'))
model_nypost = gensim.models.Word2Vec.load(os.path.join('./models', 'nypost.model'))
model_cnn = gensim.models.Word2Vec.load(os.path.join('./models', 'cnn.model'))

In [4]:
general_counts = json.load(open(os.path.join('./models', 'general_word_freq.json'), 'r'))
nypost_counts = json.load(open(os.path.join('./models', 'nypost_word_freq.json'), 'r'))
cnn_counts = json.load(open(os.path.join('./models', 'cnn_word_freq.json'), 'r'))
shared_vocab = set.intersection(set(nypost_counts.keys()), set(cnn_counts.keys()))

sorted_w_nypost = sorted(shared_vocab, key=lambda x: nypost_counts[x], reverse=True)
sorted_w_cnn = sorted(shared_vocab, key=lambda x: cnn_counts[x], reverse=True)
[(i, sorted_w_nypost[i], sorted_w_cnn[i]) for i in range(6000) if sorted_w_cnn[i] != sorted_w_nypost[i]]

[(3, 'and', 'of'),
 (4, 'of', 'and'),
 (7, 'for', 'on'),
 (8, 'on', 'for'),
 (10, 'was', 'said'),
 (11, 'with', 'it'),
 (13, 'it', 'was'),
 (14, 'said', 'with'),
 (16, 'his', 'at'),
 (17, 'at', 'his'),
 (18, 'by', 'have'),
 (19, 'from', 'are'),
 (20, 'have', 'has'),
 (21, 'be', 'from'),
 (22, 'has', 'by'),
 (23, 'are', 'be'),
 (24, 'who', 'not'),
 (25, 'an', 'this'),
 (26, 'this', 'an'),
 (27, 'they', 'but'),
 (28, 'not', 'they'),
 (29, 'her', 'we'),
 (30, 'but', 'trump'),
 (31, 'we', 'who'),
 (32, 'she', 'cnn'),
 (33, 'you', 'will'),
 (34, 'their', 'were'),
 (35, 'after', 'their'),
 (36, 'new', 'been'),
 (37, 'were', 'people'),
 (39, 'had', 'us'),
 (40, 'will', 'more'),
 (41, 'one', 'president'),
 (42, 'been', 'she'),
 (44, 'up', 'had'),
 (45, 'biden', 'after'),
 (46, 'more', 'one'),
 (47, 'when', 'you'),
 (48, 'out', 'her'),
 (49, 'president', 'police'),
 (50, 'year', 'would'),
 (51, 'all', 'there'),
 (52, 'people', 'new'),
 (53, 'would', 'what'),
 (55, 'which', 'when'),
 (56, 'us', 

In [15]:
#shared_vocab = list(sorted(list(set.intersection(set(model_nypost.wv.vocab), set(model_cnn.wv.vocab)))))
counts = json.load(open(os.path.join('./models', 'word_freq.json'), 'r'))

# get the anchors
shared_vocab = set(model_nypost.wv.key_to_index).intersection(set(model_cnn.wv.key_to_index)).intersection(set(model_general.wv.key_to_index))
w_counts = [(w, counts[w]) for w in list(shared_vocab)]
sorted_w = sorted(shared_vocab, key=lambda x: counts[x], reverse=True)
anchors = sorted_w[:1000]

stopWords = list(stopWords_s & shared_vocab)
len(stopWords)

311

In [5]:
class Aligner(ABC):
    def __init__(self, method, source, target, w2id, id2w, mtxA, mtxB, trainvoc):
        self.method = method
        self.src = source
        self.tgt = target
        self.w2idA = w2id
        self.id2wB = id2w
        self.mtxA = mtxA
        self.mtxB = mtxB
        self.anchors = trainvoc

    def translate_mtx(self, mtx):
        """
        MTX -> MTX
        """
        pass

    def encode_input(self, words):
        """
        [STRING] -> MTX
        """
        embs = [self.mtxA[self.w2idA[w], :] for w in words]
        return np.vstack(embs)

    def decode_output(self, mtx, k=1):
        """
        MTX -> [[STRING]]
        """
        similarities = cosine_similarity(mtx, self.mtxB)
        most_similar = np.argsort(similarities, axis=1)[:, ::-1]
        topsims = np.sort(similarities, axis=1)[:, ::-1][:, :k]
        res = [[self.id2wB[i] for i in row[:k]] for row in most_similar]
        return res, topsims

    def translate_word(self, word, k=1):
        """
        STRING -> STRING
        """
        encoding = self.encode_input([word])
        translated = self.translate_mtx(encoding)
        decoded = self.decode_output(translated, k=k)
        return decoded[0][:k]

    def translate_words(self, words, k=1):
        """
        [STRING] -> [STRING]
        """
        encoding = self.encode_input(words)
        translated = self.translate_mtx(encoding)
        decoded, simscores = self.decode_output(translated, k=k)
        return decoded, simscores


class CCAAligner(Aligner):
    def set_params(self, cca):
        self.cca = cca

    def translate_mtx(self, mtx):
        return mtx

    def translate_word(self, word, k=1):
        tmpA = self.mtxA
        tmpB = self.mtxB
        self.mtxA, self.mtxB = self.cca.transform(tmpA, tmpB)
        res = super().translate_word(word, k=k)
        self.mtxA = tmpA
        self.mtxB = tmpB
        return res

    def translate_words(self, words, k=1):
        tmpA = self.mtxA
        tmpB = self.mtxB
        self.mtxA, self.mtxB = self.cca.transform(tmpA, tmpB)
        res, simscores = super().translate_words(words, k=k)
        self.mtxA = tmpA
        self.mtxB = tmpB
        return res, simscores


class SVDAligner(Aligner):
    def set_params(self, T):
        self.T = T

    def translate_mtx(self, mtx):
        return mtx.dot(self.T)

In [6]:
def align_cca(source, target):
    N_dims = source.shape[1]
    cca = CCA(n_components=N_dims, max_iter=2000)
    cca.fit(source, target)
    return cca


def align_svd(source, target):
    product = np.matmul(source.transpose(), target)
    U, s, V = np.linalg.svd(product)
    T = np.matmul(U, V)
    return T


def get_cca_aligner(model_a, model_b, anchorlist):
    # get wordmaps
    awords = list(sorted(list(model_a.wv.key_to_index)))
    bwords = list(sorted(list(model_b.wv.key_to_index)))
    w2idA = {w: i for i, w in enumerate(awords)}
    id2wA = {i: w for i, w in enumerate(awords)}
    w2idB = {w: i for i, w in enumerate(bwords)}
    id2wB = {i: w for i, w in enumerate(bwords)}

    # build the base matrices
    a_mtx = np.vstack([model_a.wv[w] for w in awords])
    b_mtx = np.vstack([model_b.wv[w] for w in bwords])

    # get the anchors
    a_anchor = np.vstack([a_mtx[w2idA[w], :] for w in anchorlist])
    b_anchor = np.vstack([b_mtx[w2idB[w], :] for w in anchorlist])

    # compute CCA
    cca = align_cca(a_anchor, b_anchor)

    # build and return the aligner
    aligner = CCAAligner('cca', model_a, model_b, w2idA, id2wB, a_mtx, b_mtx, anchorlist)
    aligner.set_params(cca)
    return aligner


def get_svd_aligner(model_a, model_b, anchorlist):
    # get wordmaps
    awords = list(sorted(list(model_a.wv.vocab)))
    bwords = list(sorted(list(model_b.wv.vocab)))
    w2idA = {w: i for i, w in enumerate(awords)}
    w2idB = {w: i for i, w in enumerate(bwords)}
    id2wB = {i: w for i, w in enumerate(bwords)}

    # build the base matrices
    a_mtx = np.vstack([model_a.wv[w] for w in awords])
    b_mtx = np.vstack([model_b.wv[w] for w in bwords])
    print(a_mtx.shape, b_mtx.shape)

    # get the anchors
    a_anchor = np.vstack([a_mtx[w2idA[w], :] for w in anchorlist])
    b_anchor = np.vstack([b_mtx[w2idB[w], :] for w in anchorlist])

    # get the translation matrix
    T = align_svd(a_anchor, b_anchor)

    # build and return the aligner
    aligner = SVDAligner('svd', model_a, model_b, w2idA, id2wB, a_mtx, b_mtx, anchorlist)
    aligner.set_params(T)
    return aligner

In [18]:
aligner_cnn = get_cca_aligner(model_cnn, model_general, stopWords)
pickle.dump(aligner_cnn, open(os.path.join('./models', 'align_cnn.pkl'), 'wb'))
aligner_nypost = get_cca_aligner(model_nypost, model_general, stopWords)
pickle.dump(aligner_nypost, open(os.path.join('./models', 'align_nypost.pkl'), 'wb'))

In [7]:
forward_cnn = pickle.load(open(os.path.join('./models', 'align_cnn.pkl'), 'rb'))
forward_nypost = pickle.load(open(os.path.join('./models', 'align_nypost.pkl'), 'rb'))
check = 'obama'
print(forward_cnn.translate_word(check, k=10), forward_nypost.translate_word(check, k=10))
ny_map=forward_nypost.translate_mtx(model_nypost.wv[check])

[['prosecutor', 'younger', 'hugely', 'mysterious', 'spin', 'realizing', 'toe', 'berlin', 'bowl', 'dealings']] [['aligned', 'president', 'oprah', 'relentless', 'cefc', 'mayors', 'disclosure', 'traveling', 'david', 'iran']]


In [14]:
check='Trump'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,uphold,institutions,psychological
1,buildings,militarily,fueling
2,accidentally,won,afternoon
3,christina,comey,arresting
4,cameron,curious,payment
5,test,willingness,prices
6,labor,rioting,argue
7,brennan,grown,pakistani
8,thousands,restrictive,vest
9,grab,determine,function


In [8]:
check='democratic'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,masking,federation,kenya
1,section,heinous,arpaio
2,absence,computers,mccarthy
3,clerk,ended,required
4,angeles,systemic,knicks
5,offenders,employed,authors
6,limits,rocket,email
7,invite,exercise,lawmakers
8,collection,manufacturing,harden
9,olive,sides,chip


In [9]:
check='war'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,paramount,provoked,balls
1,culture,duke,mulvaney
2,causes,states,shock
3,organizing,megan,james
4,answers,deliberately,interested
5,championship,fields,expression
6,colleague,stark,supporting
7,fantasy,reopening,subcommittee
8,euro,emailed,pennsylvania
9,lincoln,suing,presentation


In [10]:
check='terror'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,kimmel,properly,taiwan
1,saw,tass,votes
2,bent,applause,vacuum
3,mcauliffe,ours,thirty
4,nominated,foley,carriers
5,alexis,scoring,cruelty
6,coordination,hook,largely
7,rallying,offices,dayton
8,corporations,zelensky,goldman
9,shutterstock,seed,happiness


In [11]:
check='ukraine'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,sentences,afghan,genuine
1,louis,dodge,petersburg
2,line,omar,protester
3,lucrative,unfortunately,bail
4,gabriel,cdc,tower
5,shoplifting,democratic,bergen
6,apology,collar,disappointed
7,me,boris,lady
8,nightmare,shouted,compassion
9,aew,oxygen,foreigners


In [12]:
check='violence'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,richmond,seizing,everything
1,more,keys,moderates
2,rare,deals,brush
3,elites,storming,describes
4,arm,boarded,grand
5,overall,resulted,dutch
6,ashley,approach,jerry
7,sisters,pacific,issue
8,staggering,punch,prevail
9,hacking,marginalized,equality


In [13]:
check='gun'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,acquire,tonight,remarks
1,berman,provisions,newspaper
2,vermont,passionate,name
3,abramovich,expect,extended
4,actors,stood,false
5,career,victim,paralyzed
6,expire,investigations,gig
7,dirty,runners,dedicated
8,suffering,vulnerable,peterson
9,divorce,robbery,processed


In [14]:
check='trump'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,uphold,institutions,psychological
1,buildings,militarily,fueling
2,accidentally,won,afternoon
3,christina,comey,arresting
4,cameron,curious,payment
5,test,willingness,prices
6,labor,rioting,argue
7,brennan,grown,pakistani
8,thousands,restrictive,vest
9,grab,determine,function


In [15]:
check='republic'.lower()
pd.DataFrame({
    'general':[x[0] for x in model_general.wv.most_similar(check,topn=10)],
    'cnn':forward_cnn.translate_word(check, k=10)[0],
    'nypost':forward_nypost.translate_word(check, k=10)[0]
})

Unnamed: 0,general,cnn,nypost
0,capabilities,framework,intimidation
1,utility,boko,seventh
2,withdrawal,text,congressman
3,eagles,nra,chosen
4,east,sub,ideal
5,slash,alliance,invest
6,group,categories,population
7,talent,chips,permanent
8,coastal,declaring,god
9,studies,speculated,bulger


In [56]:
cn_map=forward_cnn.translate_mtx(model_cnn.wv[check])
cosine_similarity(np.array(ny_map).reshape([1,-1]),np.array(cn_map).reshape([1,-1]))

array([[-0.10958398]], dtype=float32)

In [88]:
model_general.wv[check]

array([-4.8772991e-04,  1.1522899e-03,  2.4335664e-03, -4.7192629e-03,
       -6.4384425e-04,  1.1733546e-03, -1.5236151e-04, -7.1559887e-04,
       -4.3369326e-04, -7.0619880e-04,  3.0788688e-03, -2.7435201e-03,
       -3.1751040e-03,  1.6124884e-03,  2.7763851e-03,  3.6839894e-05,
        1.2720450e-03, -3.2315950e-03, -1.7085986e-03,  2.5071877e-03,
        2.8880301e-03,  7.6024956e-04,  4.0266859e-05, -2.4150165e-03,
       -3.1555486e-03,  1.2078570e-03,  1.0000559e-03, -9.0697251e-04,
        3.9538718e-03, -2.3819534e-03, -2.1351140e-03,  1.0470392e-03,
       -3.8495401e-03, -3.8961619e-03, -2.8486853e-03,  3.3731149e-03,
       -3.0843297e-03,  3.1627556e-03,  3.1393261e-03,  4.3007167e-04,
       -2.5811319e-03, -2.4852389e-04,  4.3256641e-03,  2.3485398e-05,
       -2.7441173e-03,  1.2207074e-03,  1.5250307e-03, -2.7754339e-03,
       -1.4661993e-03, -3.1597859e-03,  4.6035620e-05,  1.9733724e-03,
        9.4898103e-04,  4.0502232e-03, -3.6525528e-03, -4.1148660e-04,
      

In [85]:
list(model_cnn.wv.index_to_key)

['the',
 'to',
 'of',
 'and',
 'in',
 'that',
 'on',
 'for',
 'is',
 'said',
 'it',
 'he',
 'was',
 'with',
 'as',
 'at',
 'his',
 'have',
 'are',
 'has',
 'from',
 'by',
 'be',
 'not',
 'this',
 'an',
 'but',
 'they',
 'we',
 'trump',
 'who',
 'cnn',
 'will',
 'were',
 'their',
 'been',
 'people',
 'about',
 'us',
 'more',
 'president',
 'she',
 'or',
 'had',
 'after',
 'one',
 'you',
 'her',
 'police',
 'would',
 'there',
 'new',
 'what',
 'also',
 'when',
 'which',
 'up',
 'out',
 'its',
 'all',
 'if',
 'year',
 'our',
 'than',
 'can',
 'no',
 'two',
 'told',
 'other',
 'over',
 'some',
 'state',
 'time',
 'so',
 'into',
 'first',
 'according',
 'could',
 'last',
 'country',
 'him',
 'them',
 'house',
 'against',
 'just',
 'war',
 'like',
 'government',
 'during',
 'now',
 'do',
 'years',
 'those',
 'how',
 'many',
 'says',
 'while',
 'russia',
 'states',
 'because',
 'before',
 'world',
 'white',
 'where',
 'most',
 'day',
 'united',
 'any',
 'ukraine',
 'get',
 'may',
 'only',
 'e

In [None]:
cnn_counts = json.load(open(os.path.join('./models', 'cnn_word_freq.json'), 'r'))
sorted_w = sorted(shared_vocab, key=lambda x: cnn_counts[x], reverse=True)

model_nypost.wv.save(os.path.join('./models','nypost.wv'))
model_cnn.wv.save(os.path.join('./models','cnn.model'))