In [2]:
import multiprocessing
from gensim.models.word2vec import LineSentence
from gensim.models import Word2Vec
from os import walk
from gensim.models.word2vec import Text8Corpus
from datetime import datetime
import time
import os

import logging


def train_and_test(root_train, root_test , output_name, params):
    
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    directory = os.path.join("/home/snu/data/Kor_Test_data/models", datetime.now().strftime('%Y-%m-%d_%H-%M-%S'))
    os.makedirs(directory)
    test=open(directory+"/Kor test results.txt","w") ##replaced the 'directory' into real one
    startTime = time.time()
      
    print("\nRetrieving the corpus...")   
    #if using another corpus then use LineSentence() which will itterate over the corpus in root 
    #sentences = LineSentence(root)
    sentences = Text8Corpus(root_train)
    
    print("Training the model...")
    model = Word2Vec(sentences, **params)
    
    print("Freeing the memory...")
    model.init_sims(replace=True)
    
    print("saving the model...")
    model.save(directory+"/"+output_name)
    
    print("Testing the model...")
    test.write("Model " + output_name +  " at " + directory + ".\n")
    test.write("Training from : " + root_train + "\n")
    test.write("\n")
    endTime = time.time()
    test.write("Parameters : " + "\n\tVector size = " + repr(params["size"]) + ",\n\tWindow size = " + repr(params["window"]) + ",\n\tMin count = " + repr(params["min_count"]) + ",\n\tskip-gram/CBOW = " + ("skip-gram" if params["sg"]==1 else "CBOW") + ",\n\tHierarchical softmax/Negative sampling = " + ("Hierarchical softmax" if params["hs"]==1 else "Negative sampling \n\n"))
    test.write("The model took " + repr((endTime - startTime)/60)+ " to train." + "\n")
    test.write("Vocabulary length : " + repr(len(model.wv.vocab)) + "\n")
    test.write("\n\n") 
    test.write("Testing from : " + root_test + "\n\n")
    

    
    for (dirpath, dirnames, filenames) in walk(root_test):
        filenames = filenames
        break
    
    sim=0   
    sim2=0
    num_tests=len(filenames)   
    mw=0
    total_num_pairs=0
    
    for file in filenames:
        similarity = model.wv.evaluate_word_pairs(root_test+file, dummy4unknown=False)
        num_pairs=round(len(open(root_test+file,"r").readlines()))
        total_num_pairs=total_num_pairs + num_pairs
        sim=sim+similarity[0][0]
        sim2=sim2+similarity[1][0]
        mw=mw+similarity[2]*num_pairs/100
        test.write("Test results on " + file + ": \n")
        test.write("Pearson correlation coefficient = %.2f\n" % similarity[0][0])
        test.write("Spearman rank-order correlation coefficient = %.2f\n" % similarity[1][0])
        test.write("Number of missing words = " + repr(round(similarity[2]*num_pairs/100)) + "/" + repr(num_pairs)+ "\n")
        test.write("\n")
    
    test.write("Average test results: \n")    
    test.write("Average Pearson Correlation  = %.2f\n" % (sim/num_tests))
    test.write("Average Pearson Spearman rank-order correlation = %.2f\n" % (sim2/num_tests))
    test.write("Total number of missing words : "+repr(round(mw))+"/"+repr(total_num_pairs)+ "\n")

    test.close()
    
    return (sim/num_tests), (sim2/num_tests), (round(mw)), (total_num_pairs)

if __name__ == "__main__":
    
    
    output_name = "output_0"
    root_train = "/home/snu/data/Kor_Training_data/full.txt"
    root_test = "/home/snu/data/Kor_Test_data/kor_ws353.txt"
    
    params = {
        'size': 100,
        'window': 5,
        'min_count': 10,
        'sg' : 0,
        'hs' : 0,
        'workers': max(1, multiprocessing.cpu_count() - 1),
        'sample': 1E-3,
        }
    
    results = train_and_test(root_train, root_test, output_name, params)
    
    print("\nAverage test results: \n")    
    print("Average Pearson Correlation  = %.2f" % results[0])
    print("Average Pearson Spearman rank-order correlation = %.2f" % results[1])
    print("Total number of missing words : " + repr(results[2])+"/" + repr(results[3]))

2017-07-23 16:43:00,909 : INFO : collecting all words and their counts
2017-07-23 16:43:00,912 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types



Retrieving the corpus...
Training the model...


2017-07-23 16:43:23,168 : INFO : collected 3220828 word types from a corpus of 44740290 raw words and 4475 sentences
2017-07-23 16:43:23,168 : INFO : Loading a fresh vocabulary
2017-07-23 16:43:24,780 : INFO : min_count=10 retains 266937 unique words (8% of original 3220828, drops 2953891)
2017-07-23 16:43:24,781 : INFO : min_count=10 leaves 39565473 word corpus (88% of original 44740290, drops 5174817)
2017-07-23 16:43:25,395 : INFO : deleting the raw counts dictionary of 3220828 items
2017-07-23 16:43:25,490 : INFO : sample=0.001 downsamples 19 most-common words
2017-07-23 16:43:25,491 : INFO : downsampling leaves estimated 34202262 word corpus (86.4% of prior 39565473)
2017-07-23 16:43:25,492 : INFO : estimated required memory for 266937 words and 100 dimensions: 347018100 bytes
2017-07-23 16:43:26,510 : INFO : resetting layer weights
2017-07-23 16:43:29,041 : INFO : training model with 7 workers on 266937 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5

2017-07-23 16:44:41,030 : INFO : PROGRESS: at 30.69% examples, 730995 words/s, in_qsize 11, out_qsize 2
2017-07-23 16:44:42,042 : INFO : PROGRESS: at 31.12% examples, 730977 words/s, in_qsize 14, out_qsize 1
2017-07-23 16:44:43,077 : INFO : PROGRESS: at 31.56% examples, 730825 words/s, in_qsize 11, out_qsize 2
2017-07-23 16:44:44,101 : INFO : PROGRESS: at 32.00% examples, 730731 words/s, in_qsize 12, out_qsize 1
2017-07-23 16:44:45,109 : INFO : PROGRESS: at 32.44% examples, 731008 words/s, in_qsize 14, out_qsize 0
2017-07-23 16:44:46,114 : INFO : PROGRESS: at 32.87% examples, 730852 words/s, in_qsize 13, out_qsize 0
2017-07-23 16:44:47,122 : INFO : PROGRESS: at 33.30% examples, 730817 words/s, in_qsize 10, out_qsize 1
2017-07-23 16:44:48,123 : INFO : PROGRESS: at 33.72% examples, 730629 words/s, in_qsize 11, out_qsize 3
2017-07-23 16:44:49,141 : INFO : PROGRESS: at 34.15% examples, 730589 words/s, in_qsize 10, out_qsize 3
2017-07-23 16:44:50,157 : INFO : PROGRESS: at 34.60% examples, 7

2017-07-23 16:46:01,042 : INFO : PROGRESS: at 65.13% examples, 733403 words/s, in_qsize 14, out_qsize 1
2017-07-23 16:46:02,043 : INFO : PROGRESS: at 65.55% examples, 733405 words/s, in_qsize 13, out_qsize 2
2017-07-23 16:46:03,068 : INFO : PROGRESS: at 65.98% examples, 733377 words/s, in_qsize 13, out_qsize 0
2017-07-23 16:46:04,093 : INFO : PROGRESS: at 66.40% examples, 733132 words/s, in_qsize 11, out_qsize 2
2017-07-23 16:46:05,125 : INFO : PROGRESS: at 66.85% examples, 733211 words/s, in_qsize 12, out_qsize 1
2017-07-23 16:46:06,137 : INFO : PROGRESS: at 67.28% examples, 733309 words/s, in_qsize 13, out_qsize 0
2017-07-23 16:46:07,150 : INFO : PROGRESS: at 67.71% examples, 733291 words/s, in_qsize 12, out_qsize 1
2017-07-23 16:46:08,151 : INFO : PROGRESS: at 68.13% examples, 733207 words/s, in_qsize 14, out_qsize 1
2017-07-23 16:46:09,154 : INFO : PROGRESS: at 68.56% examples, 733171 words/s, in_qsize 12, out_qsize 1
2017-07-23 16:46:10,167 : INFO : PROGRESS: at 68.98% examples, 7

2017-07-23 16:47:21,004 : INFO : PROGRESS: at 99.70% examples, 735057 words/s, in_qsize 14, out_qsize 0
2017-07-23 16:47:21,594 : INFO : worker thread finished; awaiting finish of 6 more threads
2017-07-23 16:47:21,605 : INFO : worker thread finished; awaiting finish of 5 more threads
2017-07-23 16:47:21,608 : INFO : worker thread finished; awaiting finish of 4 more threads
2017-07-23 16:47:21,616 : INFO : worker thread finished; awaiting finish of 3 more threads
2017-07-23 16:47:21,619 : INFO : worker thread finished; awaiting finish of 2 more threads
2017-07-23 16:47:21,628 : INFO : worker thread finished; awaiting finish of 1 more threads
2017-07-23 16:47:21,630 : INFO : worker thread finished; awaiting finish of 0 more threads
2017-07-23 16:47:21,630 : INFO : training on 223701450 raw words (171012810 effective words) took 232.6s, 735264 effective words/s
2017-07-23 16:47:21,631 : INFO : precomputing L2-norms of word weight vectors


Freeing the memory...


2017-07-23 16:47:23,578 : INFO : saving Word2Vec object under /home/snu/data/Kor_Test_data/models/2017-07-23_16-43-00/output_0, separately None
2017-07-23 16:47:23,578 : INFO : storing np array 'syn0' to /home/snu/data/Kor_Test_data/models/2017-07-23_16-43-00/output_0.wv.syn0.npy
2017-07-23 16:47:23,626 : INFO : not storing attribute syn0norm
2017-07-23 16:47:23,627 : INFO : storing np array 'syn1neg' to /home/snu/data/Kor_Test_data/models/2017-07-23_16-43-00/output_0.syn1neg.npy
2017-07-23 16:47:23,677 : INFO : not storing attribute cum_table


saving the model...


2017-07-23 16:47:24,316 : INFO : saved /home/snu/data/Kor_Test_data/models/2017-07-23_16-43-00/output_0


Testing the model...


UnboundLocalError: local variable 'filenames' referenced before assignment