In [8]:
import multiprocessing
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from os import walk
from gensim.models.word2vec import Text8Corpus
from datetime import datetime
import time
import os

import logging


def train_and_test(root_train, root_test , output_name, params):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    directory = os.path.join("/home/snu/data/Test_data/models", datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(directory)
    test=open(directory+"/test results.txt","w") ##replaced the 'directory' into real one
    startTime = time.time()
      
    print("\nRetrieving the corpus...")   
    #if using another corpus then use LineSentence() which will itterate over the corpus in root 
    #sentences = LineSentence(root)
    sentences = Text8Corpus(root_train)
    
    print("Training the model...")
    model = Word2Vec(sentences, **params)
    
    print("Freeing the memory...")
    model.init_sims(replace=True)
    
    print("saving the model...")
    model.save(directory+"/"+output_name)
    
    print("Testing the model...")
    test.write("Model " + output_name +  " at " + directory + ".\n")
    test.write("Training from : " + root_train + "\n")
    test.write("\n")
    endTime = time.time()
    test.write("Parameters : " + "\n\tVector size = " + repr(params["size"]) + ",\n\tWindow size = " + repr(params["window"]) + ",\n\tMin count = " + repr(params["min_count"]) + ",\n\tskip-gram/CBOW = " + ("skip-gram" if params["sg"]==1 else "CBOW") + ",\n\tHierarchical softmax/Negative sampling = " + ("Hierarchical softmax" if params["hs"]==1 else "Negative sampling \n\n"))
    test.write("The model took " + repr((endTime - startTime)/60)+ " to train." + "\n")
    test.write("Vocabulary length : " + repr(len(model.wv.vocab)) + "\n")
    test.write("\n\n") 
    test.write("Testing from : " + root_test + "\n\n")
    

    print(filenames, path)
    for (dirpath, dirnames, filenames) in walk(root_test):
        filenames = filenames
        
        break
    
    sim=0   
    sim2=0
    num_tests=len(filenames)   
    mw=0
    total_num_pairs=0
    
    for file in filenames:
        similarity = model.wv.evaluate_word_pairs(root_test+file, dummy4unknown=False)
        num_pairs=round(len(open(root_test+file,"r").readlines()))
        total_num_pairs=total_num_pairs + num_pairs
        sim=sim+similarity[0][0]
        sim2=sim2+similarity[1][0]
        mw=mw+similarity[2]*num_pairs/100
        test.write("Test results on " + file + ": \n")
        test.write("Pearson correlation coefficient = %.2f\n" % similarity[0][0])
        test.write("Spearman rank-order correlation coefficient = %.2f\n" % similarity[1][0])
        test.write("Number of missing words = " + repr(round(similarity[2]*num_pairs/100)) + "/" + repr(num_pairs)+ "\n")
        test.write("\n")
    
    test.write("Average test results: \n")    
    test.write("Average Pearson Correlation  = %.2f\n" % (sim/num_tests))
    test.write("Average Pearson Spearman rank-order correlation = %.2f\n" % (sim2/num_tests))
    test.write("Total number of missing words : "+repr(round(mw))+"/"+repr(total_num_pairs)+ "\n")

    test.close()
    
    return (sim/num_tests), (sim2/num_tests), (round(mw)), (total_num_pairs)

if __name__ == "__main__":
    
    
    output_name = "output_0"
    root_train = "/home/snu/data/Training_data/text8"
    root_test = "/home/snu/data/Test_data/WordSimilarity-353-EN"
    
    params = {
        'size': 100,
        'window': 5,
        'min_count': 10,
        'sg' : 1,
        'hs' : 0,
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
        }
    
    results = train_and_test(root_train, root_test, output_name, params)
    
    print("\nAverage test results: \n")    
    print("Average Pearson Correlation  = %.2f" % results[0])
    print("Average Pearson Spearman rank-order correlation = %.2f" % results[1])
    print("Total number of missing words : " + repr(results[2])+"/" + repr(results[3]))

2017-07-23 19:13:20,806 : INFO : collecting all words and their counts
2017-07-23 19:13:20,808 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



Retrieving the corpus...
Training the model...


2017-07-23 19:13:28,394 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-07-23 19:13:28,395 : INFO : Loading a fresh vocabulary
2017-07-23 19:13:28,654 : INFO : min_count=10 retains 47134 unique words (18% of original 253854, drops 206720)
2017-07-23 19:13:28,655 : INFO : min_count=10 leaves 16561031 word corpus (97% of original 17005207, drops 444176)
2017-07-23 19:13:28,783 : INFO : deleting the raw counts dictionary of 253854 items
2017-07-23 19:13:28,792 : INFO : sample=0.001 downsamples 38 most-common words
2017-07-23 19:13:28,793 : INFO : downsampling leaves estimated 12333563 word corpus (74.5% of prior 16561031)
2017-07-23 19:13:28,794 : INFO : estimated required memory for 47134 words and 100 dimensions: 61274200 bytes
2017-07-23 19:13:28,978 : INFO : resetting layer weights
2017-07-23 19:13:29,506 : INFO : training model with 7 workers on 47134 vocabulary and 100 features, using sg=1 hs=0 sample=0.001 negative=5 window=5
2017-07

2017-07-23 19:14:42,491 : INFO : PROGRESS: at 37.00% examples, 312733 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:43,514 : INFO : PROGRESS: at 37.37% examples, 311431 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:44,535 : INFO : PROGRESS: at 37.71% examples, 310028 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:45,539 : INFO : PROGRESS: at 38.04% examples, 308623 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:46,599 : INFO : PROGRESS: at 38.39% examples, 307143 words/s, in_qsize 12, out_qsize 1
2017-07-23 19:14:47,612 : INFO : PROGRESS: at 38.75% examples, 306006 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:48,681 : INFO : PROGRESS: at 39.11% examples, 304642 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:49,689 : INFO : PROGRESS: at 39.44% examples, 303314 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:50,712 : INFO : PROGRESS: at 39.75% examples, 301876 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:14:51,724 : INFO : PROGRESS: at 40.08% examples, 3

2017-07-23 19:16:03,929 : INFO : PROGRESS: at 63.75% examples, 254516 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:04,936 : INFO : PROGRESS: at 64.10% examples, 254263 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:05,940 : INFO : PROGRESS: at 64.46% examples, 254027 words/s, in_qsize 12, out_qsize 0
2017-07-23 19:16:06,989 : INFO : PROGRESS: at 64.77% examples, 253614 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:08,072 : INFO : PROGRESS: at 65.13% examples, 253254 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:09,155 : INFO : PROGRESS: at 65.46% examples, 252819 words/s, in_qsize 14, out_qsize 1
2017-07-23 19:16:10,255 : INFO : PROGRESS: at 65.81% examples, 252463 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:11,271 : INFO : PROGRESS: at 66.14% examples, 252151 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:12,300 : INFO : PROGRESS: at 66.46% examples, 251775 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:16:13,318 : INFO : PROGRESS: at 66.82% examples, 2

2017-07-23 19:17:25,754 : INFO : PROGRESS: at 91.09% examples, 237854 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:26,819 : INFO : PROGRESS: at 91.44% examples, 237704 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:27,876 : INFO : PROGRESS: at 91.80% examples, 237596 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:28,880 : INFO : PROGRESS: at 92.15% examples, 237484 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:29,889 : INFO : PROGRESS: at 92.50% examples, 237391 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:30,901 : INFO : PROGRESS: at 92.83% examples, 237242 words/s, in_qsize 14, out_qsize 0
2017-07-23 19:17:31,926 : INFO : PROGRESS: at 93.18% examples, 237136 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:32,955 : INFO : PROGRESS: at 93.51% examples, 236969 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:33,976 : INFO : PROGRESS: at 93.89% examples, 236934 words/s, in_qsize 13, out_qsize 0
2017-07-23 19:17:34,995 : INFO : PROGRESS: at 94.23% examples, 2

Freeing the memory...


2017-07-23 19:17:52,959 : INFO : saving Word2Vec object under /home/snu/data/Test_data/models/2017-07-23_19-13-20/output_0, separately None
2017-07-23 19:17:52,960 : INFO : not storing attribute syn0norm
2017-07-23 19:17:52,961 : INFO : not storing attribute cum_table


saving the model...


2017-07-23 19:17:53,558 : INFO : saved /home/snu/data/Test_data/models/2017-07-23_19-13-20/output_0


Testing the model...


UnboundLocalError: local variable 'filenames' referenced before assignment