In [1]:
import numpy as np
from gensim.models import KeyedVectors
import pickle
import pandas as pd

In [77]:
def shrink(root):
    '''
    Translate HistWords keyed vector files to word2vec_format
    
    Based on gist by @chengjun
    https://github.com/williamleif/histwords/issues/8#issuecomment-400155072
    
    Note: removes vocabulary without vectors (all 0s in original)

    Parameters
    ----------
    root : str
        Path prefix of the input files. Reads ``{root}-w.npy`` (the vector
        matrix, one row per vocabulary entry) and ``{root}-vocab.pkl`` (the
        pickled vocabulary, in row order).

    Returns
    -------
    None
        Writes ``{root}-sm.txt`` (word2vec text format) and ``{root}-sm.bin``
        (word2vec binary format) and prints a one-line summary.

    Raises
    ------
    ValueError
        If the matrix is not 2-D or its row count differs from the
        vocabulary length.
    '''

    # Load the original files
    mat = np.load(f"{root}-w.npy",
                  mmap_mode="c")

    # NOTE(security): pickle can execute arbitrary code while loading;
    # only run this on vocabulary files from a trusted source.
    with open(f'{root}-vocab.pkl', 'rb') as vocab_file:
        iw = pickle.load(vocab_file)

    if mat.ndim != 2 or mat.shape[0] != len(iw):
        raise ValueError(
            f"vector matrix shape {mat.shape} does not match "
            f"vocabulary of length {len(iw)}"
        )

    # Keep a row if ANY component is non-zero. Testing the row sum instead
    # would also discard real vectors whose components happen to cancel out.
    keep = np.any(mat != 0, axis=1)
    words = [word for word, kept in zip(iw, keep) if kept]
    vectors = np.asarray(mat[keep])

    vocab_size = len(words)
    # Take the dimensionality from the data so the header is always correct.
    vector_size = mat.shape[1]

    print(f'{vocab_size} words with {vector_size} vectors.')
    # gensim reads the text format back as UTF-8, so write it as UTF-8
    # rather than relying on the platform default encoding.
    with open(f"{root}-sm.txt", "w", encoding="utf-8") as fp:
        # write header
        fp.write(f"{vocab_size} {vector_size}\n")
        # write vectors
        for word, vector in zip(words, vectors):
            # tolist() yields plain Python floats, so str() emits their
            # shortest round-trippable representation.
            values = " ".join(map(str, vector.tolist()))
            fp.write(word + " " + values + "\n")

    # load in gensim
    m = KeyedVectors.load_word2vec_format(f"{root}-sm.txt",
                                          binary=False)
    # save in word2vec C format
    m.save_word2vec_format(f'{root}-sm.bin', binary=True)

In [79]:
# Convert the embeddings for every decade: 1800, 1810, ..., 1990.
decade_starts = range(1800, 2000, 10)

for year in decade_starts:
    # Each decade's files share the prefix eng-all_sgns/<year>.
    root = f'eng-all_sgns/{year}'
    print(year)
    shrink(root)

1800
13045 words with 300 vectors.
1810
15771 words with 300 vectors.
1820
20312 words with 300 vectors.
1830
21691 words with 300 vectors.
1840
23818 words with 300 vectors.
1850
29035 words with 300 vectors.
1860
27191 words with 300 vectors.
1870
29320 words with 300 vectors.
1880
34081 words with 300 vectors.
1890
37729 words with 300 vectors.
1900
41551 words with 300 vectors.
1910
36553 words with 300 vectors.
1920
35643 words with 300 vectors.
1930
34477 words with 300 vectors.
1940
34226 words with 300 vectors.
1950
41807 words with 300 vectors.
1960
54332 words with 300 vectors.
1970
60344 words with 300 vectors.
1980
64934 words with 300 vectors.
1990
71097 words with 300 vectors.
