In [1]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from gensim.models import KeyedVectors
from util import load_metric, read_util
import json

In [None]:
#import gensim

In [2]:
# Load the root strings that are later used to pick target words out of each
# model's vocabulary.
dwt_roots_file = Path("../data/utils/dwts.txt")
dwt_roots = read_util(dwt_roots_file)

In [3]:
# Show the loaded roots as the cell output so the list can be checked by eye.
dwt_roots

['förortsgäng',
 'återvandr',
 'berika',
 'kulturberika',
 'ordning_och_reda_i_flyktingpolitiken',
 'globalist',
 'hjälpa_på_plats',
 'självständig_utrikespolitik']

In [4]:
# Directories holding the yearly word2vec models and the matching vocabulary
# files. The defaults are the original machine-specific locations; set
# FB_POL_MODELS_DIR / FB_POL_VOCAB_DIR in the environment to run the notebook
# on another machine without editing this cell.
models_at = Path(
    os.environ.get(
        "FB_POL_MODELS_DIR",
        "/home/max/Results/fb_pol-yearly-rad3/models",
    )
)
vocabs_at = Path(
    os.environ.get(
        "FB_POL_VOCAB_DIR",
        "/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3-v0/vocab",
    )
)

In [22]:
def neighbors(
    mod_path,
    voc_path,
    roots,
    restriction=("N1", "N2", "V1", "V2", "A1", "P1"),
    k=10
):
    """Collect nearest neighbors of root-matching words across yearly models.

    Every file in ``mod_path`` whose name ends in ``.w2v`` is processed in
    sorted file-name order. The part of the name before the suffix must parse
    as an integer and is reported as the year. For each model, the file
    ``<year>.txt`` under ``voc_path`` is read with ``load_metric`` and looked
    up per neighbor (presumably corpus frequencies -- confirm against
    ``util.load_metric``).

    Args:
        mod_path: Directory with the ``.w2v`` model files (``str`` or ``Path``).
        voc_path: Directory with the matching ``.txt`` files (``str`` or
            ``Path``).
        roots: Iterable of substrings; a vocabulary word is selected when it
            contains at least one of them.
        restriction: A prefix string or an iterable of prefix strings that a
            selected word must start with. ``None`` disables the prefix
            filter.
        k: Number of neighbors requested per selected word.

    Returns:
        A list of dicts with the keys ``"Word"``, ``"Year"``, ``"Neighbor"``,
        ``"Score"`` and ``"Count"``, one per (word, neighbor) pair.

    Raises:
        ValueError: If the stem of a ``.w2v`` file name is not an integer.
    """
    suffix = ".w2v"

    # Accept plain strings as well as Path objects for both directories.
    mod_path = Path(mod_path)
    voc_path = Path(voc_path)

    # str.startswith only accepts a str or a tuple of str, so any other
    # iterable (e.g. a list) is converted once up front.
    if restriction is None or isinstance(restriction, str):
        prefixes = restriction
    else:
        prefixes = tuple(restriction)

    lst = []

    for model in sorted(os.listdir(mod_path)):
        if not model.endswith(suffix):
            continue

        print(model)

        # Strip only the trailing suffix; str.replace would also rewrite any
        # earlier occurrence of ".w2v" inside the file name.
        stem = model[: -len(suffix)]
        year = int(stem)

        wv = KeyedVectors.load_word2vec_format(mod_path / model)
        counts = load_metric(voc_path / (stem + ".txt"))

        words = [
            word for word in wv.index_to_key
            if any(root in word for root in roots)
        ]
        if prefixes is not None:
            words = [word for word in words if word.startswith(prefixes)]

        for word in words:
            for neighbor, score in wv.most_similar(positive=[word], topn=k):
                lst.append({
                    "Word": word,
                    "Year": year,
                    "Neighbor": neighbor,
                    "Score": score,
                    "Count": int(counts[neighbor])
                })

    return lst

In [23]:
# Gather the 15 nearest neighbors of every root-matching word in each model.
data = neighbors(
    mod_path=models_at,
    voc_path=vocabs_at,
    roots=dwt_roots,
    k=15,
)

2022.wv.vectors.npy1neg.npy

In [24]:
# Persist the raw neighbor records as pretty-printed JSON.
json_path = Path("../../dw_results/neighbors.json")
with json_path.open("w") as out_file:
    json.dump(data, out_file, indent=4)

In [25]:
# Tabulate the neighbor records, ordered by word, then year, then ascending
# similarity score, and preview the first 30 rows.
df = pd.DataFrame(data).sort_values(by=["Word", "Year", "Score"])
df.head(30)

Unnamed: 0,Word,Year,Neighbor,Score,Count
464,A1_globalistisk,2008,multikultipropaganda,0.656081,11
463,A1_globalistisk,2008,upplösandet,0.659412,10
462,A1_globalistisk,2008,oligarkiska,0.661036,12
461,A1_globalistisk,2008,hegemoniska,0.66284,12
460,A1_globalistisk,2008,konsumtionskultur,0.663449,14
459,A1_globalistisk,2008,teknokratiska,0.667888,12
458,A1_globalistisk,2008,kälkborgerliga,0.668871,11
457,A1_globalistisk,2008,själlösa,0.672961,24
456,A1_globalistisk,2008,maktcentra,0.67424,15
455,A1_globalistisk,2008,utbildningsväsen,0.675529,16


In [26]:
# Export the sorted table. Note that the file is tab-separated even though it
# carries a .csv extension.
table_path = "../../dw_results/neighbors.csv"
df.to_csv(table_path, sep="\t")