In [8]:
# Identifier for this model: names the output folder under db/ and the
# MongoDB collection the vectors are stored in.
MODEL_ID='COHA_byhalfcentury_nonf_smpl'
# Path of the word2vec-format vector file loaded further down.
MODEL_FN='static/data/db/models/COHA_byhalfcentury_nonf/chained_combined/1800-1999.min=500.run=01.txt'

In [9]:
# Default for the n_top parameter of get_dists / build_dist_db: how many
# nearest neighbours are kept per word when the similarity network is built.
SAVE_N_TOP = 100

# Build DB

In [24]:
import os,gensim,networkx as nx
from tqdm import tqdm
from scipy.spatial.distance import squareform,pdist
import pandas as pd

In [11]:
# Per-model output directory under db/; create it if it is not there yet.
MODEL_DIR = os.path.join('db',MODEL_ID)
if not os.path.exists(MODEL_DIR):
    os.makedirs(MODEL_DIR)

## Load gensim

In [12]:
import gensim  # repeated here; gensim is also imported in the imports cell
# Load the word vectors from the word2vec-format file MODEL_FN.
M = gensim.models.KeyedVectors.load_word2vec_format(MODEL_FN)

In [13]:
# Vocabulary in vector-row order: index2word[i] is the word whose vector is
# M.vectors[i]. Taking the words from this list (rather than from iterating
# the vocab dict) keeps WORDS and VECS aligned whatever the dict's order is.
WORDS = list(M.index2word)
# Matrix of all word vectors, one row per entry of WORDS.
VECS = M.vectors

### Add fields/etc?

## Build dist db

In [14]:
def get_dists(vecs,words,n_top=SAVE_N_TOP):
    """Return the full pairwise cosine-distance matrix as a labelled DataFrame.

    vecs  -- 2-D array-like holding one vector per row.
    words -- labels used for both the index and the columns of the result;
             one label per row of vecs.
    n_top -- accepted for signature compatibility; not used by this function.
    """
    print('>> getting distance matrix...')
    # pdist produces the condensed form (one value per unordered pair);
    # squareform expands it into the symmetric square matrix.
    condensed = pdist(vecs, metric='cosine')
    square = squareform(condensed)
    print('>> done getting distance matrix...')
    return pd.DataFrame(data=square, index=words, columns=words)

In [15]:
# Full pairwise distance table over the whole vocabulary: len(WORDS) x len(WORDS).
# NOTE(review): for the 31228-word run recorded in the output below that is
# ~975M cells, i.e. several GB if stored as float64 -- confirm memory headroom.
dfdist = get_dists(VECS,WORDS)

>> getting distance matrix...
>> done getting distance matrix...


In [27]:
def build_dist_db(n_top=SAVE_N_TOP):
    """Write each word's nearest neighbours to db/<MODEL_ID>/data.simnet.csv.

    For every word in WORDS, the n_top other words with the smallest distance
    in dfdist are linked to it in an undirected graph. The graph's edges are
    then written as CSV rows ``source,target,weight,sim_rank``, where weight
    is the distance and sim_rank the 1-based neighbour rank.

    n_top -- number of neighbours kept per word (defaults to SAVE_N_TOP).

    Reads the module-level WORDS, dfdist and MODEL_ID. Returns None.
    """
    import csv  # local import, in the same way build_vec_db imports pymongo

    print('>> graphing...')
    ofn='db/'+MODEL_ID+'/data.simnet.csv'

    # Make sure the target folder exists before the long graph-building loop,
    # so a missing directory cannot waste the whole computation.
    outdir=os.path.dirname(ofn)
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    G=nx.Graph()

    for word in tqdm(WORDS):
        # Remove the word's own entry by label rather than assuming it sorts
        # first: another word at distance 0 can tie with it, which would
        # otherwise leave a self-loop and push out a real neighbour.
        row = dfdist.loc[word].drop(word).sort_values().iloc[:n_top]
        for sim_rank,(word2,result) in enumerate(zip(row.index,row),1):
            # G is undirected: if word2 was already linked to word, this call
            # updates that same edge, so sim_rank ends up being the rank as
            # seen from whichever endpoint was processed last.
            G.add_edge(word,word2,weight=result,sim_rank=sim_rank)

    with open(ofn,'w') as of:
        # csv.writer quotes fields containing commas or quotes, which the
        # previous manual ','.join could not do; '\n' keeps the line endings
        # the file had before.
        writer = csv.writer(of,lineterminator='\n')
        writer.writerow(['source','target','weight','sim_rank'])
        for a,b,d in tqdm(G.edges(data=True)):
            writer.writerow([a,b,d['weight'],d['sim_rank']])

    print('>> saved: '+ofn)

In [28]:
# Build the nearest-neighbour network and write it to db/<MODEL_ID>/data.simnet.csv.
build_dist_db()


  0%|          | 0/31228 [00:00<?, ?it/s][A
  0%|          | 21/31228 [00:00<02:33, 203.64it/s][A

>> graphing...



  0%|          | 41/31228 [00:00<02:34, 202.08it/s][A
  0%|          | 62/31228 [00:00<02:32, 204.10it/s][A
  0%|          | 84/31228 [00:00<02:29, 208.51it/s][A
  0%|          | 106/31228 [00:00<02:27, 211.03it/s][A
  0%|          | 128/31228 [00:00<02:26, 212.50it/s][A
  0%|          | 150/31228 [00:00<02:25, 213.87it/s][A
  1%|          | 171/31228 [00:00<02:28, 209.68it/s][A
  1%|          | 192/31228 [00:00<02:28, 209.07it/s][A
  1%|          | 214/31228 [00:01<02:27, 210.94it/s][A
  1%|          | 236/31228 [00:01<02:26, 211.19it/s][A
  1%|          | 257/31228 [00:01<02:28, 208.40it/s][A
  1%|          | 278/31228 [00:01<02:32, 202.42it/s][A
  1%|          | 299/31228 [00:01<02:33, 201.23it/s][A
  1%|          | 320/31228 [00:01<02:34, 200.28it/s][A
  1%|          | 340/31228 [00:01<02:35, 198.90it/s][A
  1%|          | 360/31228 [00:01<02:37, 196.17it/s][A
  1%|          | 381/31228 [00:01<02:34, 200.00it/s][A
  1%|▏         | 402/31228 [00:01<02:33, 200.58it/

>> saved:


## Build vec db

In [29]:
def build_vec_db():
    """Store every word's vector in the MongoDB collection vecdb.<MODEL_ID>.

    One document ``{'word': <word>, 'vecs': [<float>, ...]}`` is inserted for
    each entry of WORDS, using the vector M[word]. The connection is opened
    with MongoClient()'s default settings and closed before returning.
    Returns None.

    NOTE(review): existing documents are not removed first, so running this
    again adds a second copy of every word unless the collection enforces
    uniqueness -- confirm whether that is intended.
    """
    from pymongo import MongoClient
    client = MongoClient()
    try:
        vecdb = client['vecdb'][MODEL_ID]

        for word in tqdm(WORDS):
            # Convert to built-in floats (presumably because numpy scalars
            # are not accepted by the BSON encoder).
            wvecs = [float(x) for x in M[word]]
            doc = {'word':word, 'vecs':wvecs}
            # insert_one replaces Collection.insert, which is deprecated
            # since PyMongo 3.0 and removed in 4.0.
            vecdb.insert_one(doc)
    finally:
        # Release the connection even if an insert fails part-way through.
        client.close()

In [30]:
# Insert one document per word into the MongoDB collection vecdb.<MODEL_ID>.
build_vec_db()


  if __name__ == '__main__':

  0%|          | 145/31228 [00:00<00:21, 1448.48it/s][A
  1%|          | 296/31228 [00:00<00:21, 1464.81it/s][A
  1%|▏         | 438/31228 [00:00<00:21, 1449.36it/s][A
  2%|▏         | 587/31228 [00:00<00:20, 1460.03it/s][A
  2%|▏         | 734/31228 [00:00<00:20, 1461.83it/s][A
  3%|▎         | 879/31228 [00:00<00:20, 1455.57it/s][A
  3%|▎         | 1028/31228 [00:00<00:20, 1463.39it/s][A
  4%|▍         | 1179/31228 [00:00<00:20, 1474.82it/s][A
  4%|▍         | 1329/31228 [00:00<00:20, 1481.12it/s][A
  5%|▍         | 1481/31228 [00:01<00:19, 1492.40it/s][A
  5%|▌         | 1629/31228 [00:01<00:19, 1488.36it/s][A
  6%|▌         | 1778/31228 [00:01<00:19, 1488.27it/s][A
  6%|▌         | 1929/31228 [00:01<00:19, 1492.56it/s][A
  7%|▋         | 2077/31228 [00:01<00:19, 1483.39it/s][A
  7%|▋         | 2228/31228 [00:01<00:19, 1490.32it/s][A
  8%|▊         | 2378/31228 [00:01<00:19, 1492.77it/s][A
  8%|▊         | 2531/31228 [00:01<00:19, 1501.

In [33]:
# !wc -l db/COHA_byhalfcentury_nonf_smpl/data.simnet.csv
# !ls -ltrh db/COHA_byhalfcentury_nonf_smpl/data.simnet.csv