In [2]:
import multiprocessing
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from os import walk
from gensim.models.word2vec import Text8Corpus
from datetime import datetime
import time
import os

import logging


def _list_top_level_files(root):
    """Return the names of the files directly inside *root* (no recursion)."""
    # walk() yields nothing when *root* cannot be listed; fall back to an
    # empty file list instead of leaving the name unbound.
    _, _, filenames = next(walk(root), (root, [], []))
    return filenames


def _count_lines(path):
    """Return the number of lines in the text file at *path*."""
    with open(path, "r") as handle:
        return sum(1 for _ in handle)


def train_and_test(root_train, root_test, output_name, params,
                   models_root="/home/snu/data/Test_data/models"):
    """Train a Word2Vec model, save it, and evaluate it on word-pair files.

    A new timestamped directory is created under *models_root*; the model is
    saved there as *output_name* together with a "test results.txt" report.

    Args:
        root_train: Path of the training corpus, read with ``Text8Corpus``.
        root_test: Directory whose top-level files are each passed to
            ``model.wv.evaluate_word_pairs``.
        output_name: File name used when saving the model.
        params: Keyword arguments forwarded to ``Word2Vec``. The keys
            ``size``, ``window``, ``min_count``, ``sg`` and ``hs`` are also
            read for the report and must be present.
        models_root: Directory under which the timestamped output directory
            is created. Defaults to the location used by the original script.

    Returns:
        A 4-tuple ``(average Pearson, average Spearman, missing, total)``
        where ``missing`` is the rounded sum over all test files of
        ``unknown_ratio * line_count / 100`` and ``total`` is the summed line
        count of the test files. When *root_test* contains no files the two
        averages are NaN and both counts are 0.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    directory = os.path.join(models_root, datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(directory)
    start_time = time.time()

    print("\nRetrieving the corpus...")
    # For a different corpus format, LineSentence(root_train) can be used
    # instead; it iterates over the corpus one line at a time.
    sentences = Text8Corpus(root_train)

    print("Training the model...")
    model = Word2Vec(sentences, **params)

    print("Freeing the memory...")
    model.init_sims(replace=True)

    print("saving the model...")
    model.save(os.path.join(directory, output_name))

    print("Testing the model...")
    # Measured after saving, so the reported duration (in minutes) covers
    # corpus setup, training and saving.
    end_time = time.time()

    architecture = "skip-gram" if params["sg"] == 1 else "CBOW"
    # Trailing space kept so the report for the negative-sampling case is
    # unchanged from earlier runs.
    objective = "Hierarchical softmax" if params["hs"] == 1 else "Negative sampling "

    test_files = _list_top_level_files(root_test)
    num_tests = len(test_files)
    pearson_sum = 0
    spearman_sum = 0
    missing_words = 0
    total_num_pairs = 0

    # The context manager guarantees the report is flushed and closed even if
    # one of the evaluations raises.
    with open(os.path.join(directory, "test results.txt"), "w") as report:
        report.write("Model " + output_name + " at " + directory + ".\n")
        report.write("Training from : " + root_train + "\n")
        report.write("\n")
        report.write(
            "Parameters : "
            + "\n\tVector size = " + repr(params["size"])
            + ",\n\tWindow size = " + repr(params["window"])
            + ",\n\tMin count = " + repr(params["min_count"])
            + ",\n\tskip-gram/CBOW = " + architecture
            + ",\n\tHierarchical softmax/Negative sampling = " + objective
            # The paragraph break is written for both settings of "hs";
            # previously it was missing for hierarchical softmax.
            + "\n\n"
        )
        report.write("The model took " + repr((end_time - start_time) / 60) + " to train." + "\n")
        report.write("Vocabulary length : " + repr(len(model.wv.vocab)) + "\n")
        report.write("\n\n")
        report.write("Testing from : " + root_test + "\n\n")

        for name in test_files:
            # os.path.join works whether or not root_test ends with a slash.
            path = os.path.join(root_test, name)
            similarity = model.wv.evaluate_word_pairs(path, dummy4unknown=False)
            # NOTE(review): every line of the file is counted as a pair,
            # including any header/comment lines - confirm against the data.
            num_pairs = _count_lines(path)
            total_num_pairs += num_pairs
            pearson_sum += similarity[0][0]
            spearman_sum += similarity[1][0]
            # similarity[2] is a percentage; convert it to a (fractional) count.
            missing_in_file = similarity[2] * num_pairs / 100
            missing_words += missing_in_file
            report.write("Test results on " + name + ": \n")
            report.write("Pearson correlation coefficient = %.2f\n" % similarity[0][0])
            report.write("Spearman rank-order correlation coefficient = %.2f\n" % similarity[1][0])
            report.write("Number of missing words = " + repr(round(missing_in_file)) + "/" + repr(num_pairs) + "\n")
            report.write("\n")

        if num_tests:
            avg_pearson = pearson_sum / num_tests
            avg_spearman = spearman_sum / num_tests
        else:
            # Nothing to average: report NaN rather than dividing by zero
            # after the (expensive) training has already succeeded.
            logging.warning("No test files found in %s", root_test)
            avg_pearson = avg_spearman = float("nan")

        report.write("Average test results: \n")
        report.write("Average Pearson Correlation  = %.2f\n" % avg_pearson)
        report.write("Average Pearson Spearman rank-order correlation = %.2f\n" % avg_spearman)
        report.write("Total number of missing words : " + repr(round(missing_words)) + "/" + repr(total_num_pairs) + "\n")

    return avg_pearson, avg_spearman, round(missing_words), total_num_pairs

if __name__ == "__main__":
    # Training corpus, directory of word-pair test files, and saved-model name.
    root_train = "/home/snu/data/Training_data/text8"
    root_test = "/home/snu/data/Test_data/"
    output_name = "output_0"

    # Use all but one CPU core, but never fewer than one worker.
    worker_count = max(1, multiprocessing.cpu_count() - 1)

    # Keyword arguments for Word2Vec; train_and_test reports sg=0 as CBOW and
    # hs=0 as negative sampling.
    params = dict(
        size=100,
        window=8,
        min_count=10,
        sg=0,
        hs=0,
        workers=worker_count,
        sample=1E-3,
    )

    results = train_and_test(root_train, root_test, output_name, params)
    avg_pearson, avg_spearman, missing_words, total_pairs = results

    print("\nAverage test results: \n")
    print("Average Pearson Correlation  = %.2f" % avg_pearson)
    print("Average Pearson Spearman rank-order correlation = %.2f" % avg_spearman)
    print("Total number of missing words : " + repr(missing_words) + "/" + repr(total_pairs))

2017-07-20 09:53:21,229 : INFO : collecting all words and their counts
2017-07-20 09:53:21,230 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



Retrieving the corpus...
Training the model...


2017-07-20 09:53:26,806 : INFO : collected 253854 word types from a corpus of 17005207 raw words and 1701 sentences
2017-07-20 09:53:26,807 : INFO : Loading a fresh vocabulary
2017-07-20 09:53:26,994 : INFO : min_count=10 retains 47134 unique words (18% of original 253854, drops 206720)
2017-07-20 09:53:26,995 : INFO : min_count=10 leaves 16561031 word corpus (97% of original 17005207, drops 444176)
2017-07-20 09:53:27,108 : INFO : deleting the raw counts dictionary of 253854 items
2017-07-20 09:53:27,114 : INFO : sample=0.001 downsamples 38 most-common words
2017-07-20 09:53:27,115 : INFO : downsampling leaves estimated 12333563 word corpus (74.5% of prior 16561031)
2017-07-20 09:53:27,116 : INFO : estimated required memory for 47134 words and 100 dimensions: 61274200 bytes
2017-07-20 09:53:27,285 : INFO : resetting layer weights
2017-07-20 09:53:27,739 : INFO : training model with 7 workers on 47134 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=8
2017-07

Freeing the memory...


2017-07-20 09:54:27,438 : INFO : saving Word2Vec object under /home/snu/data/Test_data/models/2017-07-20_09-53-21/output_0, separately None
2017-07-20 09:54:27,439 : INFO : not storing attribute syn0norm
2017-07-20 09:54:27,439 : INFO : not storing attribute cum_table


saving the model...


2017-07-20 09:54:27,756 : INFO : saved /home/snu/data/Test_data/models/2017-07-20_09-53-21/output_0
2017-07-20 09:54:27,849 : INFO : Pearson correlation coefficient against /home/snu/data/Test_data/WS353-english-rel.txt: 0.5111
2017-07-20 09:54:27,849 : INFO : Spearman rank-order correlation coefficient against /home/snu/data/Test_data/WS353-english-rel.txt: 0.5122
2017-07-20 09:54:27,850 : INFO : Pairs with unknown words ratio: 0.4%
2017-07-20 09:54:27,909 : INFO : Pearson correlation coefficient against /home/snu/data/Test_data/WS-353-EN.txt: 0.5812
2017-07-20 09:54:27,910 : INFO : Spearman rank-order correlation coefficient against /home/snu/data/Test_data/WS-353-EN.txt: 0.6265
2017-07-20 09:54:27,910 : INFO : Pairs with unknown words ratio: 6.2%
2017-07-20 09:54:28,005 : INFO : Pearson correlation coefficient against /home/snu/data/Test_data/WS353-english-sim.txt: 0.6370
2017-07-20 09:54:28,006 : INFO : Spearman rank-order correlation coefficient against /home/snu/data/Test_data/WS

Testing the model...


2017-07-20 09:54:28,076 : INFO : Pearson correlation coefficient against /home/snu/data/Test_data/SE17-EN.txt: 0.6170
2017-07-20 09:54:28,077 : INFO : Spearman rank-order correlation coefficient against /home/snu/data/Test_data/SE17-EN.txt: 0.6067
2017-07-20 09:54:28,077 : INFO : Pairs with unknown words ratio: 32.6%
2017-07-20 09:54:28,175 : INFO : Pearson correlation coefficient against /home/snu/data/Test_data/WordSimilarity-353-EN.txt: 0.6589
2017-07-20 09:54:28,176 : INFO : Spearman rank-order correlation coefficient against /home/snu/data/Test_data/WordSimilarity-353-EN.txt: 0.6799
2017-07-20 09:54:28,177 : INFO : Pairs with unknown words ratio: 0.8%



Average test results: 

Average Pearson Correlation  = 0.60
Average Pearson Spearman rank-order correlation = 0.61
Total number of missing words : 179/1389
