# Finds neighbors of DWEs in the SGNS models

In [None]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from gensim.models import KeyedVectors
from util import load_metric, read_util
import json

In [None]:
#import gensim

In [None]:
# Load the root strings from ../data/utils/dwts.txt with the project helper
# read_util; they are passed to neighbors() as `roots` further down.
# NOTE(review): the return type of read_util is not visible here -- presumably
# an iterable of strings, since neighbors() tests `root in word`; confirm.
dwt_roots = read_util(Path("../data/utils/dwts.txt"))

In [None]:
# Show the loaded roots as the cell output for a quick visual check.
dwt_roots

In [None]:
# Directory scanned by neighbors() for model files ending in ".w2v"; the file
# stem is parsed as an integer year there.
models_at = Path("/home/max/Results/fb_pol-yearly-rad3/models")
# Directory holding the per-model count files: neighbors() looks up the file
# with the same name as the model but a ".txt" extension.
vocabs_at = Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3-v0/vocab")

In [None]:
def neighbors(
    mod_path,
    voc_path,
    roots,
    restriction=("N1", "N2", "V1", "V2", "A1", "P1"),
    k=10,
    min_freq=None,
):
    """Collect nearest neighbors of root-matching words in every ".w2v" model.

    Every file in ``mod_path`` whose name ends in ".w2v" is loaded with
    ``KeyedVectors.load_word2vec_format``; the file name without ".w2v" is
    parsed as an integer year.  The counts for the same model are read with
    ``load_metric`` from ``voc_path`` (same file name, ".txt" extension).

    Args:
        mod_path: Directory (``Path``) containing the ".w2v" model files.
        voc_path: Directory (``Path``) containing the matching ".txt" files.
        roots: Iterable of substrings; a vocabulary word is selected when it
            contains at least one of them.
        restriction: Prefix or collection of prefixes a selected word must
            start with, or ``None`` to skip the prefix filter.
        k: Maximum number of neighbors recorded per word.
        min_freq: If given, neighbors whose count is below this value are
            skipped, and the search continues down the similarity ranking
            (limited to the 10000 most similar words) until ``k`` neighbors
            have been kept.  Fewer than ``k`` may be returned if that window
            does not contain enough frequent words.

    Returns:
        A list of dicts with the keys "Word", "Year", "Neighbor", "Score" and
        "Count", in model-name order and, per word, in the order returned by
        ``most_similar``.
    """
    # str.startswith only accepts a str or a tuple of str, so normalise any
    # other collection (e.g. a list) instead of failing with a TypeError.
    if restriction is not None and not isinstance(restriction, str):
        restriction = tuple(restriction)

    rows = []

    for model in sorted(os.listdir(mod_path)):
        if not model.endswith(".w2v"):
            continue

        print(model)

        wv = KeyedVectors.load_word2vec_format(mod_path / model)
        counts = load_metric(voc_path / model.replace(".w2v", ".txt"))
        words = [
            word
            for word in wv.index_to_key
            if any(root in word for root in roots)
        ]
        if restriction is not None:
            words = [word for word in words if word.startswith(restriction)]

        year = int(model.replace(".w2v", ""))

        # Without a frequency floor the k best hits are exactly what is
        # needed; with one, over-fetch so that rare neighbors can be skipped.
        topn = k if min_freq is None else 10000

        for word in words:
            kept = 0
            for neighbor, score in wv.most_similar(positive=[word], topn=topn):
                # `>=` (not `>`): stop as soon as k neighbors are recorded.
                # The previous `> k` test let a (k+1)-th neighbor through.
                if kept >= k:
                    break
                count = int(counts[neighbor])
                if min_freq is not None and count < min_freq:
                    continue
                rows.append({
                    "Word": word,
                    "Year": year,
                    "Neighbor": neighbor,
                    "Score": score,
                    "Count": count,
                })
                kept += 1

    return rows

In [None]:
# Run the neighbor search over all models, asking for 15 neighbors per word;
# min_freq is left at its default (None), so no frequency filter is applied.
data = neighbors(models_at, vocabs_at, dwt_roots, k=15)

In [None]:
# Persist the raw neighbor records as JSON indented by four spaces.
with open(Path("../../dw_results/neighbors.json"), "w") as f:
    json.dump(data, f, indent=4)

In [None]:
# Tabulate the records and order them by word, then year, then ascending
# similarity score; preview the first 30 rows as the cell output.
df = pd.DataFrame(data).sort_values(by=["Word", "Year", "Score"])
df.head(30)

In [None]:
# Export the sorted table as a tab-separated file. The DataFrame index is
# written as the first column because to_csv's `index` default is kept.
df.to_csv("../../dw_results/neighbors.csv", sep="\t")

In [None]:
# ---------- Scratch: effect of the count threshold on neighbor sets ----------

In [None]:
def jaccard(a, b):
    """Return the Jaccard similarity of two sets: |a & b| / |a | b|.

    Args:
        a: A set (anything providing ``union`` and ``intersection``).
        b: A second set or iterable to compare against ``a``.

    Returns:
        A float in [0, 1].  Two empty sets are treated as identical and give
        1.0, rather than raising ZeroDivisionError on the empty union.
    """
    union = a.union(b)
    if not union:
        # Both inputs are empty: the ratio is undefined (0 / 0).
        return 1.0
    return len(a.intersection(b)) / len(union)

In [None]:
# Load the single model file "2001.w2v" for the ad-hoc comparison below.
wv = KeyedVectors.load_word2vec_format(Path("/home/max/Results/fb_pol-yearly-rad3/models") / "2001.w2v")


In [None]:
# Load the matching "2001.txt" file with the project helper load_metric; the
# result is indexed by word and compared against count thresholds below.
voc = load_metric(Path("/home/max/Corpora/flashback-pol-time/yearly/fb-pt-radical3-v0/vocab") / "2001.txt")

In [None]:
# Collect the 20 most similar words to N1_kulturberikare whose value in `voc`
# exceeds 100, scanning at most the 1000 nearest candidates and printing each
# word that is kept.
a = set()
k = 0
for w, score in wv.most_similar(positive = ["N1_kulturberikare"], topn = 1000):
    # `>=` rather than `>`: the check runs before the increment, so `k > 20`
    # let a 21st word through.
    if k >= 20:
        break
    if voc[w] > 100:
        print(w)
        a.add(w)
        k += 1
    

In [None]:
# Collect the 20 most similar words to N1_kulturberikare whose value in `voc`
# exceeds 10, scanning at most the 1000 nearest candidates and printing each
# word that is kept.
b = set()
k = 0
for w, score in wv.most_similar(positive = ["N1_kulturberikare"], topn = 1000):
    # `>=` rather than `>`: the check runs before the increment, so `k > 20`
    # let a 21st word through.
    if k >= 20:
        break
    if voc[w] > 10:
        print(w)
        b.add(w)
        k += 1
    

In [None]:
# Overlap between the two neighbor sets built with thresholds 100 and 10.
jaccard(a,b)