# Summary

Trains a word2vec model with [fastText](https://fasttext.cc/) on n days of reading sessions.

### Imports etc.

In [1]:
import copy
import csv
import datetime
import math

import numpy as np
import pandas as pd
import os,sys
import json
import time

import fasttext
print('ready')

ready


## Prepare data

In [14]:
## select which days to train on
## (day_start is included, day_end is excluded from the range)
day_start = datetime.date(2020,4,27)
day_end = datetime.date(2020,4,28)
## Materialised as a list rather than a generator: a generator would be
## exhausted after the first pass, so re-running any cell that iterates
## over date_array would silently see no days at all.
date_array = [
    day_start + datetime.timedelta(days=x)
    for x in range((day_end-day_start).days)
]

## which project's reading sessions to use (part of the input/output file names)
lang = 'wikidata'
# lang = 'enwiki'

In [15]:
## collect the input file of every selected day
PATH_FILES = '../output/reading-sessions-filtered'
PATH_OUT = '../output/models'
list_filenames = []
for date_object in date_array:
    ## day formatted as YYYY-MM-DD, as it appears in the file names
    day_str = date_object.strftime("%Y-%m-%d")
    basename = 'reading-sessions-filtered_%s_%s'%(lang,day_str)
    filename = os.path.join(PATH_FILES, basename)
    list_filenames.append(filename)
## number of files that will go into the training corpus
print(len(list_filenames))

1


In [16]:
## create a single training file for fasttext
## this file will be deleted after training
from multiprocessing import Pool
n_cores_max = 20 ## use at most this many cores to read the individual files when creating a single file

## directory and file holding the temporary combined corpus
PATH_tmp = '../tmp/'
## exist_ok avoids the check-then-create race of a separate os.path.exists test
os.makedirs(PATH_tmp, exist_ok=True)
FILE_tmp = os.path.join(PATH_tmp,'tmp_corpus.train')
limit = -1 ## number of lines per file (-1 if all)
def make_tmp_corpus_from_files(filename_in):
    """Append the lines of one input file to the temporary corpus file.

    Lines are copied unchanged from ``filename_in`` to the end of
    ``FILE_tmp``. Copying stops after ``limit`` lines; a ``limit`` below 1
    (e.g. -1) never matches the running line count, so the whole file is
    copied.

    Parameters
    ----------
    filename_in : str
        Path of the file whose lines are appended.

    Returns
    -------
    None
    """
    ## both files are managed by the with-statement so the input handle
    ## is closed as well (it was previously left open)
    with open(FILE_tmp,'a') as fout, open(filename_in) as fin:
        for i, line in enumerate(fin, start=1):
            fout.write(line)
            if i==limit:
                break
    return None

## append files into single file for fasttext
n_files = len(list_filenames)
## start from an empty corpus file so that re-running this cell
## does not append the same data twice
with open(FILE_tmp,'w') as fout:
    pass
## The files are appended one after another instead of from a process pool:
## several processes appending to the same file through buffered writes can
## flush in the middle of a line, so lines of different files may get
## interleaved and corrupted. Sequential appends also keep the line order
## deterministic and work when list_filenames is empty (Pool(0) raises).
for filename_in in list_filenames:
    make_tmp_corpus_from_files(filename_in)

In [17]:
## count the lines of the combined training file
## (the with-statement closes the file handle once counting is done)
with open(FILE_tmp) as fin:
    i = sum(1 for _ in fin)
i

53922695

## Train

In [18]:
## Training parameters (handed to fasttext.train_unsupervised in the training cell)
mode      = 'cbow'  ## passed as fasttext's `model` argument: 'cbow' or 'skipgram'
size      = 50      ## number of embedding dimensions
window    = 5       ## size of the context window
sample    = 1e-3    ## threshold for downsampling high-frequency words
negative  = 5       ## number of negative samples (noise words)
min_count = 20      ## words with fewer total occurrences are ignored
epochs    = 5       ## number of passes over the training file
workers   = 20      ## number of threads to train with


In [19]:
## train the model on the combined corpus file and report the wall-clock time
t1=time.time()
model = fasttext.train_unsupervised(
    FILE_tmp,
    dim = size, epoch=epochs, model=mode,minCount=min_count,ws=window,neg=negative, t=sample,
    thread=workers,
    maxn=0 ## no subword
)
t2=time.time()
print(t2-t1)

## remove tmp corpus file
## (os.remove instead of shelling out to `rm -f`: no shell interpolation of
## the path and no dependency on a Unix shell; a missing file is still ignored)
try:
    os.remove(FILE_tmp)
except FileNotFoundError:
    pass

356.15539026260376


0

In [20]:
## make sure the output directory exists, otherwise saving fails
## only after the (long) training has already finished
os.makedirs(PATH_OUT, exist_ok=True)
## the file name encodes the project, the date range and all training parameters
filename_save = os.path.join(
    PATH_OUT,
    'word2vec_%s_%s--%s_params-%s-%s-%s-%s-%s-%s-%s.bin'%(
        lang,str(day_start),str(day_end),
        mode,size,window,sample,negative,min_count,epochs
    )
)
model.save_model(filename_save)

In [21]:
## size of the vocabulary of the trained model
vocabulary = model.get_words()
len(vocabulary)

1174834