In [1]:

import numpy as np

class Word2Vec :
    """Lookup and similarity queries over a pickled word-embedding model.

    The model file is a pickle holding a mapping with the keys
    'WORD_EMBEDDINGS', 'WORD_TO_ID' and 'ID_TO_WORD'. String queries are
    lower-cased before they are looked up in the word-to-id map.
    """

    def __init__ ( self , model_file_path ) :
        """Load the embeddings and both index maps from `model_file_path`.

        SECURITY NOTE: `pickle.load` can execute arbitrary code while
        unpickling, so only open model files that come from a trusted source.
        """
        import pickle
        self.__model_path__ = model_file_path
        self.__word_embeddings__ = None
        self.__word_to_id_map__ = None
        self.__id_to_word_map__ = None
        with open(model_file_path, 'rb') as model_file :
            word2vec_model = pickle.load(model_file)
        self.__word_embeddings__ = word2vec_model['WORD_EMBEDDINGS']
        self.__word_to_id_map__ = word2vec_model['WORD_TO_ID']
        self.__id_to_word_map__ = word2vec_model['ID_TO_WORD']

    def __getitem__ ( self , query ) :
        """Return the embedding of the word `query`.

        Returns None when `query` is not a string or when its lower-cased
        form is not in the vocabulary.
        """
        if not isinstance(query, str) :
            return None
        query = query.lower()
        if query not in self.__word_to_id_map__ :
            return None
        return self.__word_embeddings__[self.__word_to_id_map__[query]]

    def _resolve_vector ( self , item ) :
        """Turn a word (str) or a ready-made vector (np.ndarray) into a vector, else None."""
        if isinstance(item, str) :
            return self[item]
        if isinstance(item, np.ndarray) :
            return item
        return None

    @staticmethod
    def _cosine ( vec1 , vec2 ) :
        """Signed cosine similarity of two vectors, rounded to 4 decimals."""
        norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)
        # The cosine is undefined for a zero-length vector; report "no
        # similarity" instead of propagating NaN into rankings.
        if norm_product == 0 :
            return 0.0
        # The sign is kept on purpose: taking abs() of the dot product would
        # rank vectors pointing in opposite directions as highly similar.
        return round(float(vec1.dot(vec2) / norm_product), 4)

    def CosineSimilarity ( self , word1 , word2 ) :
        """Cosine similarity between two words and/or vectors.

        Each argument may be a word (str) or a vector (np.ndarray).

        Returns a float in [-1, 1] rounded to 4 decimals, or None when an
        argument has an unsupported type or names a word that is not in the
        vocabulary.
        """
        vec1 = self._resolve_vector(word1)
        vec2 = self._resolve_vector(word2)
        if vec1 is None or vec2 is None :
            return None
        return self._cosine(vec1, vec2)

    def GetMostSimilar ( self , query , limit = 10 ) :
        """Return the `limit` vocabulary words most similar to `query`.

        `query` may be a word (str) or a vector (np.ndarray). The result is a
        list of (word, similarity) tuples sorted by decreasing similarity; it
        is empty when the query word is unknown or the query type is
        unsupported.
        """
        query_vec = self._resolve_vector(query)
        if query_vec is None :
            return []
        similarities = []
        for word in self.__word_to_id_map__ :
            word_vec = self[word]
            # Skip keys that cannot be resolved back to an embedding rather
            # than letting a None score break the sort below.
            if word_vec is None :
                continue
            similarities.append((word, self._cosine(word_vec, query_vec)))
        similarities.sort(key = lambda pair : pair[1], reverse = True)
        return similarities[:limit]
    

In [2]:
model = Word2Vec('MODELS/WORD_2_VEC_MODEL_1')

### Analysis On Analogical Reasoning Tasks
These tasks check the ability of the model to automatically organize concepts and learn implicitly the relationships between them, as during the training we did not provide any supervised information about what a capital city means or how a father-mother semantic relationship is related to a son-daughter semantic relationship.
The calibre of the model is analysed on three types of analogical tasks.
<ol>
 <li> &nbsp;&nbsp;word1&nbsp; +&nbsp; word2&nbsp; ≈&nbsp; word3
 <li> &nbsp;&nbsp;word1&nbsp; -&nbsp; word2&nbsp; +&nbsp; word3&nbsp; ≈&nbsp; word4
 <li> &nbsp;&nbsp;word1&nbsp; ≈&nbsp; word2
</ol>

In [3]:
vec = model['Switzerland'] + model['language']
model.GetMostSimilar(vec, 15)
# Romansh is the official language of Switzerland

[('switzerland', 0.9061),
 ('romansh', 0.6188),
 ('language', 0.5997),
 ('sweden', 0.4396),
 ('netherlands', 0.4315),
 ('estonia', 0.4269),
 ('neue', 0.3873),
 ('ambassador', 0.3857),
 ('iceland', 0.3848),
 ('hayti', 0.3801),
 ('resorts', 0.3736),
 ('germany', 0.3722),
 ('belgium', 0.3697),
 ('bern', 0.3619),
 ('primates', 0.3609)]

In [4]:
vec = model['Bangladesh'] + model['language']
model.GetMostSimilar(vec, 15)
# Bengali is the official language of Bangladesh

[('bangladesh', 0.9494),
 ('language', 0.5986),
 ('importers', 0.4201),
 ('ul', 0.3928),
 ('bengali', 0.3886),
 ('enhanced', 0.3849),
 ('cleric', 0.3711),
 ('arnett', 0.3625),
 ('tamil', 0.3564),
 ('labs', 0.3522),
 ('graffiti', 0.3486),
 ('insensitivity', 0.3462),
 ('parthenos', 0.3444),
 ('infiltration', 0.3429),
 ('tabletop', 0.342)]

In [5]:
vec = model['India'] + model['river']
model.GetMostSimilar(vec, 15)
# Yamuna is a river in India

[('river', 0.8196),
 ('india', 0.7531),
 ('yamuna', 0.5334),
 ('madras', 0.5002),
 ('haidarabad', 0.4877),
 ('peninsula', 0.483),
 ('n', 0.4797),
 ('persia', 0.473),
 ('e', 0.4647),
 ('mysore', 0.4634),
 ('siberia', 0.4618),
 ('berar', 0.458),
 ('china', 0.4574),
 ('rajputana', 0.4508),
 ('central', 0.4468)]

In [6]:
vec = model['Greece'] + model['river']
model.GetMostSimilar(vec, 15)
# Haliacmon is a river in Greece

[('greece', 0.8148),
 ('river', 0.7718),
 ('haliacmon', 0.5801),
 ('magnesia', 0.4651),
 ('adige', 0.4567),
 ('haidarabad', 0.441),
 ('amazonas', 0.4399),
 ('yangtze', 0.4382),
 ('ebro', 0.4376),
 ('corrientes', 0.4346),
 ('provs', 0.4334),
 ('berar', 0.4282),
 ('baluchistan', 0.4226),
 ('warta', 0.422),
 ('indo-china', 0.4133)]

In [7]:
vec = model['Spanish'] + model['singer']
model.GetMostSimilar(vec, 15)
# Shakira, Ricky Martin, Iglesias are Spanish singers.

[('singer', 0.8331),
 ('spanish', 0.7789),
 ('shakira', 0.6714),
 ('ricky-martin', 0.6524),
 ('iglesias', 0.56),
 ('beyonce', 0.5461),
 ('justin-bieber', 0.5093),
 ('taylor-swift', 0.4968),
 ('chicanos', 0.4369),
 ('rael', 0.436),
 ('folk', 0.4332),
 ('mexicans', 0.4238),
 ('sargent', 0.4194),
 ('tamale', 0.41),
 ('bbq', 0.4058)]

In [8]:
vec = model['American'] + model['singer']
model.GetMostSimilar(vec, 15)
# Beyonce, Justin Bieber, Taylor Swift, Lady Gaga are American singers.

[('singer', 0.9018),
 ('american', 0.6391),
 ('beyonce', 0.5936),
 ('justin-bieber', 0.5756),
 ('shakira', 0.5636),
 ('taylor-swift', 0.562),
 ('ricky-martin', 0.5311),
 ('bashevis', 0.5012),
 ('sargent', 0.4752),
 ('iglesias', 0.4663),
 ('isaac', 0.4429),
 ('nine', 0.4326),
 ('gaga', 0.432),
 ('straus', 0.4248),
 ('d.d.s', 0.4181)]

In [9]:
vec = model['Moscow'] - model['Russia'] + model['Spain']
model.GetMostSimilar(vec, 15)
# Moscow is the capital of Russia. Madrid is the capital of Spain.

[('spain', 0.735),
 ('moscow', 0.5464),
 ('ebro', 0.4454),
 ('madrid', 0.4057),
 ('rosas', 0.397),
 ('franco', 0.3749),
 ('plata', 0.3659),
 ('revolted', 0.3634),
 ('valencia', 0.3544),
 ('ecclesiastical', 0.3539),
 ('spanish', 0.3475),
 ('condado', 0.3452),
 ('rosario', 0.3446),
 ('pizarro', 0.3418),
 ('palacio', 0.3367)]

In [10]:
vec = model['Madrid'] - model['capital'] + model['river']
model.GetMostSimilar(vec, 15)
# Madrid is the capital of Spain. Ebro is a river in Spain.

[('madrid', 0.6781),
 ('river', 0.6539),
 ('spain', 0.4251),
 ('n', 0.4171),
 ('west', 0.4011),
 ('el', 0.3923),
 ('indies', 0.3814),
 ('ebro', 0.3767),
 ('siberia', 0.3662),
 ('cortes', 0.3662),
 ('north', 0.3639),
 ('peru', 0.355),
 ('nebraska', 0.354),
 ('amazonas', 0.3531),
 ('ki', 0.3481)]

In [11]:
vec = model['Warsaw'] - model['capital'] + model['river']
model.GetMostSimilar(vec, 15)
# Warsaw is the capital of Poland. Warta is a river in Poland.

[('warsaw', 0.7173),
 ('river', 0.6149),
 ('krak', 0.4158),
 ('warta', 0.3983),
 ('poland', 0.393),
 ('poznan', 0.3782),
 ('haliacmon', 0.3745),
 ('peninsula', 0.3637),
 ('seine', 0.3585),
 ('ebro', 0.352),
 ('lima', 0.3497),
 ('goyaz', 0.3496),
 ('n.y', 0.3451),
 ('plata', 0.3426),
 ('ul', 0.3418)]

In [12]:
vec = model['Chile'] - model['Valparaiso'] + model['Valencia']
model.GetMostSimilar(vec, 15)
# Valparaiso is a city in Chile. Valencia is a city in Venezuela.

[('valencia', 0.6156),
 ('chile', 0.5261),
 ('valparaiso', 0.3929),
 ('desert', 0.2923),
 ('swam', 0.2866),
 ('venezuela', 0.2837),
 ('healed', 0.2694),
 ('qiagen', 0.2674),
 ('paella', 0.2476),
 ('invitrogen', 0.2456),
 ('git', 0.2452),
 ('kit', 0.2437),
 ('lahore', 0.2434),
 ('aquincum', 0.2346),
 ('locally', 0.2342)]

In [13]:
vec = model['Yamuna'] - model['India'] + model['Greece']
model.GetMostSimilar(vec, 15)
# Yamuna is a river in India. Haliacmon is a river in Greece.

[('yamuna', 0.7809),
 ('greece', 0.6608),
 ('haliacmon', 0.5679),
 ('magnesia', 0.4748),
 ('ebro', 0.4728),
 ('warta', 0.4505),
 ('yangtze', 0.4281),
 ('adige', 0.4279),
 ('turn-off', 0.4231),
 ('henriques', 0.4172),
 ('weel', 0.4098),
 ('cost-per-case', 0.4044),
 ('mycenaean', 0.4024),
 ('dioramas', 0.4009),
 ('henequen', 0.4005)]

In [14]:
vec = model['Dhoni'] - model['cricket'] + model['football']
model.GetMostSimilar(vec, 15)
# Dhoni is a cricket player. Messi, Ronaldo, Neymar are football players.

[('dhoni', 0.7686),
 ('football', 0.5217),
 ('serena-williams', 0.4607),
 ('messi', 0.4488),
 ('coastal', 0.4342),
 ('zidane', 0.4317),
 ('dinghy', 0.4301),
 ('ronaldo', 0.4261),
 ('neymar', 0.4182),
 ('buoy', 0.4055),
 ('djokovic', 0.3915),
 ('ioannidis', 0.3888),
 ('embo', 0.3874),
 ('straddling', 0.386),
 ('inlets', 0.3843)]

In [15]:
vec = model['Dhoni'] - model['cricket'] + model['tennis']
model.GetMostSimilar(vec, 15)
# Dhoni is a cricket player. Serena Williams, Federer, Djokovic are tennis players.

[('dhoni', 0.7401),
 ('tennis', 0.6251),
 ('serena-williams', 0.5404),
 ('federer', 0.5275),
 ('djokovic', 0.4942),
 ('huatulco', 0.4587),
 ('sod', 0.4273),
 ('buoy', 0.4237),
 ('m.g', 0.4088),
 ('coastal', 0.4063),
 ('dinghy', 0.4036),
 ('slacks', 0.4008),
 ('fishing', 0.3966),
 ('parque', 0.3931),
 ('windsurfing', 0.3908)]

In [16]:
vec = model['Dhoni'] - model['cricket'] + model['singer']
model.GetMostSimilar(vec, 15)
# Dhoni is a cricket player. Beyonce, Justin Bieber, Taylor Swift are singers.

[('dhoni', 0.7444),
 ('singer', 0.5684),
 ('beyonce', 0.4885),
 ('justin-bieber', 0.4701),
 ('dinghy', 0.4566),
 ('taylor-swift', 0.4553),
 ('coastal', 0.4408),
 ('microarray', 0.4302),
 ('m.g', 0.4209),
 ('microbial', 0.4101),
 ('southerly', 0.4065),
 ('huatulco', 0.4013),
 ('sycamore', 0.3997),
 ('truscott', 0.3993),
 ('tetramin', 0.3975)]

In [17]:
vec = model['Beyonce'] - model['American'] + model['Spanish']
model.GetMostSimilar(vec, 15)
# Beyonce is an American singer. Ricky Martin, Shakira, Iglesias are Spanish singers.

[('beyonce', 0.9043),
 ('ricky-martin', 0.6254),
 ('shakira', 0.6176),
 ('spanish', 0.5903),
 ('singer', 0.5635),
 ('justin-bieber', 0.5401),
 ('taylor-swift', 0.493),
 ('iglesias', 0.4683),
 ('chicanos', 0.4348),
 ('californios', 0.4342),
 ('burritos', 0.4327),
 ('mixteco', 0.4318),
 ('corrientes', 0.4258),
 ('mestizos', 0.4247),
 ('frijoles', 0.422)]

In [18]:
vec = model['father'] - model['man'] + model['woman']
model.GetMostSimilar(vec, 15)

[('father', 0.7147),
 ('woman', 0.713),
 ('live-in', 0.3987),
 ('daughter', 0.3925),
 ('mother', 0.3762),
 ('lady', 0.3592),
 ('husband', 0.3512),
 ('half-sister', 0.3403),
 ('elder', 0.3336),
 ('maternity', 0.3334),
 ('doctor', 0.3314),
 ('lucie', 0.3292),
 ('after-school', 0.3251),
 ('sister', 0.323),
 ('betty', 0.3213)]

In [19]:
vec = model['boy'] - model['man'] + model['woman']
model.GetMostSimilar(vec, 15)

[('boy', 0.7399),
 ('woman', 0.6875),
 ('girl', 0.3658),
 ('ingraham', 0.3448),
 ('parents', 0.3404),
 ('live-in', 0.3331),
 ('girls', 0.3312),
 ('sari', 0.3279),
 ('elder', 0.3235),
 ('infirmities', 0.3176),
 ('cuddling', 0.3172),
 ('jinks', 0.3168),
 ('chubby', 0.3139),
 ('kids', 0.3136),
 ('lady', 0.3133)]

In [20]:
vec = model['walking'] - model['walk'] + model['run']
model.GetMostSimilar(vec, 15)

[('walking', 0.6299),
 ('run', 0.5617),
 ('fastball', 0.2952),
 ('ill-timed', 0.2885),
 ('runaways', 0.2777),
 ('shortstop', 0.2771),
 ('planters', 0.2671),
 ('away', 0.264),
 ('birthday', 0.263),
 ('running', 0.2597),
 ('shuttling', 0.2587),
 ('thoughtfully', 0.2576),
 ('braund', 0.2555),
 ('thor', 0.255),
 ('bowler', 0.254)]

In [21]:
model.GetMostSimilar('singer', 15)

[('singer', 1.0),
 ('beyonce', 0.5809),
 ('taylor-swift', 0.5696),
 ('shakira', 0.5559),
 ('justin-bieber', 0.5555),
 ('ricky-martin', 0.5318),
 ('bashevis', 0.4569),
 ('iglesias', 0.4454),
 ('gayle', 0.4185),
 ('sargent', 0.4045),
 ('straus', 0.3956),
 ('coraghessan', 0.3871),
 ('bbq', 0.3856),
 ('songwriters', 0.3803),
 ('isaac', 0.3788)]

In [22]:
model.GetMostSimilar('plays', 15)

[('plays', 1.0),
 ('serena-williams', 0.5738),
 ('messi', 0.5581),
 ('tendulkar', 0.5512),
 ('neymar', 0.5372),
 ('djokovic', 0.5166),
 ('federer', 0.486),
 ('gayle', 0.4588),
 ('cricket', 0.436),
 ('dhoni', 0.4329),
 ('football', 0.4262),
 ('dramatist', 0.4008),
 ('ronaldo', 0.3913),
 ('endocytosis', 0.3888),
 ('zidane', 0.3875)]

In [23]:
model.GetMostSimilar('Paraguay', 15)
# Asuncion is the capital of Paraguay. 
# Bolivia, Uruguay, Venezuela, Peru, Brazil, Ecuador are all countries in South America, like Paraguay.

[('paraguay', 1.0),
 ('asuncion', 0.5381),
 ('bolivia', 0.5298),
 ('oviedo', 0.4938),
 ('uruguay', 0.4691),
 ('venezuela', 0.4583),
 ('peru', 0.4538),
 ('belgrano', 0.4532),
 ('brazil', 0.4449),
 ('guiana', 0.443),
 ('rosas', 0.4221),
 ('s.a', 0.4142),
 ('ecuador', 0.412),
 ('ayres', 0.4088),
 ('mendoza', 0.4087)]

In [24]:
model.GetMostSimilar('India', 15)
# Yamuna is a river in India.
# Berar is an ancient province of India.
# Mysore and Madras are cities in India.
# Rupee is the currency of India.
# Hindu is an Indian religion.

[('india', 1.0),
 ('yamuna', 0.4626),
 ('malleson', 0.4117),
 ('hindu', 0.3851),
 ('mysore', 0.3826),
 ('persia', 0.3739),
 ('pakistan', 0.3706),
 ('madras', 0.3687),
 ('central', 0.3677),
 ('coinage', 0.3562),
 ('battles', 0.3549),
 ('berar', 0.3528),
 ('rupee', 0.3465),
 ('turtons', 0.344),
 ('coins', 0.3408)]

In [25]:
model.GetMostSimilar('Canada', 15)
# Athabasca is a town in Canada.
# Quebec is a city in Canada.
# Anticosti is an island in Canada.
# Ontario, Manitoba, Yukon, Alberta, Newfoundland are all Canadian provinces.

[('canada', 1.0),
 ('whoa', 0.4497),
 ('athabasca', 0.4456),
 ('u.s.a', 0.4418),
 ('w', 0.438),
 ('ontario', 0.4368),
 ('manitoba', 0.4259),
 ('anticosti', 0.4193),
 ('yukon', 0.4116),
 ('canadian', 0.4102),
 ('inlet', 0.4077),
 ('alberta', 0.4025),
 ('newfoundland', 0.4001),
 ('isthmus', 0.3991),
 ('quebec', 0.3971)]

In [26]:
model.GetMostSimilar('Monday', 15)

[('monday', 1.0),
 ('tuesday', 0.6061),
 ('sessions', 0.5851),
 ('biennial', 0.5775),
 ('elections', 0.5716),
 ('meeting', 0.5598),
 ('legislature', 0.5532),
 ('wednesday', 0.5527),
 ('january', 0.5526),
 ('odd-numbered', 0.547),
 ('even-numbered', 0.5417),
 ('quadrennially', 0.5217),
 ('session', 0.5212),
 ('november', 0.5201),
 ('representatives', 0.5049)]

In [27]:
model.GetMostSimilar('December', 15)

[('december', 1.0),
 ('january', 0.4891),
 ('even-numbered', 0.4281),
 ('biennial', 0.4244),
 ('d-del', 0.4214),
 ('month', 0.4159),
 ('monday', 0.4036),
 ('meeting', 0.3918),
 ('september', 0.3886),
 ('tuesday', 0.3814),
 ('nine', 0.3772),
 ('atta', 0.3693),
 ('october', 0.3689),
 ('d.c', 0.3666),
 ('haig', 0.3616)]

In [28]:
model.GetMostSimilar('zero', 15)

[('zero', 1.0),
 ('nine', 0.6839),
 ('five', 0.6709),
 ('four', 0.6341),
 ('seven', 0.6272),
 ('approximately', 0.5852),
 ('million', 0.5828),
 ('six', 0.5706),
 ('eight', 0.5652),
 ('percent', 0.5599),
 ('billion', 0.4853),
 ('year', 0.4818),
 ('mctwo', 0.4766),
 ('km', 0.4745),
 ('clin', 0.4672)]

In [29]:
model.GetMostSimilar('language', 15)

[('language', 1.0),
 ('romansh', 0.5165),
 ('languages', 0.3893),
 ('specialized', 0.3878),
 ('phonological', 0.3858),
 ('vocabulary', 0.3753),
 ('onomatopoeic', 0.3715),
 ('familiarity', 0.3679),
 ('microarrays', 0.3668),
 ('linguistics', 0.3659),
 ('synonymy', 0.3641),
 ('arabic', 0.3568),
 ('bengali', 0.3568),
 ('kanbun', 0.3564),
 ('re-examine', 0.356)]

In [30]:
model.GetMostSimilar('terrorist', 15)

[('terrorist', 1.0),
 ('bin', 0.5735),
 ('osama', 0.5588),
 ('ladin', 0.5467),
 ('pakistani', 0.5395),
 ('qaeda', 0.5385),
 ('jihad', 0.5349),
 ('terrorists', 0.5235),
 ('terrorism', 0.5019),
 ('ubl', 0.491),
 ('hamas', 0.4749),
 ('wmd', 0.4724),
 ('cia', 0.4703),
 ('laden', 0.4683),
 ('ressam', 0.4648)]

In [31]:
model.GetMostSimilar('nasa', 15)
# Foale is an astrophysicist and former NASA astronaut.
# Tsibliev is a cosmonaut.

[('nasa', 1.0),
 ('aeronautics', 0.4482),
 ('lander', 0.445),
 ('foale', 0.4352),
 ('nih', 0.4207),
 ('shuttle', 0.4184),
 ('navfac', 0.4027),
 ('astronauts', 0.4007),
 ('orbiter', 0.4004),
 ('cosmonaut', 0.3996),
 ('tsibliev', 0.3976),
 ('align', 0.3902),
 ('mars', 0.3846),
 ('usace', 0.3828),
 ('astronaut', 0.3808)]

### Evaluation
A *Gensim Word2Vec Model* is trained on the same training corpus that was used for training our model. The performance metrics of the two models are then computed on the training corpus to get a reference point (to save time, this evaluation uses only the first 1000 sentences of the training corpus). Finally, the performance of both models is monitored on different test corpora with increasing token counts.

In [36]:
from gensim.models import KeyedVectors, Word2Vec
from nltk.tokenize import word_tokenize, sent_tokenize
from math import log, exp
from collections import Counter

def TrainGensimModel ( train_corpus_path ) :
    """Train the reference Gensim model and build the shared vocabulary.

    Reads the corpus at `train_corpus_path`, lower-cases it, splits it into
    tokenized sentences and trains a Gensim Word2Vec model on them
    (min_count=6, 300 dimensions).

    Side effects (module-level globals):
        gensim_model -- the trained Gensim model.
        VOCAB        -- set of words known to both the Gensim model and the
                        notebook's own `model`.
    """
    global gensim_model, VOCAB
    # The context manager closes the file even when read() raises.
    with open(train_corpus_path, 'r') as corpus_file :
        text = corpus_file.read().lower()
    sents = [ word_tokenize(sentence) for sentence in sent_tokenize(text) ]
    # `Word2Vec` here is the gensim class imported in this cell; that import
    # shadows the notebook's own Word2Vec wrapper class defined earlier.
    # NOTE(review): `size=` and `wv.vocab` are the gensim < 4.0 spellings
    # (gensim 4 uses `vector_size=` and `wv.key_to_index`) -- confirm the
    # installed gensim version before re-running.
    gensim_model = Word2Vec(sents, min_count=6, size=300)
    gensim_vocab = set(gensim_model.wv.vocab.keys())
    VOCAB = gensim_vocab.intersection(set(model.__word_to_id_map__))
    
def GensimModelConditionalProbabilityDenominator ( center_word ) :
    """Softmax denominator of the Gensim model for `center_word`.

    Sums exp(v_center . v_word) over every word in the global VOCAB and
    memoizes the result in the global GMCPD dict, keyed by `center_word`.
    """
    if center_word in GMCPD :
        return GMCPD[center_word]
    cwv = gensim_model.wv[center_word]
    # Accumulate locally and store only the finished sum: if exp() or a
    # lookup raises midway, the cache must not keep a partial value that
    # later calls would return as if it were complete.
    total = 0.0
    for word in VOCAB :
        total += exp(cwv.dot(gensim_model.wv[word]))
    GMCPD[center_word] = total
    return total

def GensimModelConditionalProbability ( center , context ) :
    """Softmax probability of `context` given `center` under the Gensim model.

    Computes exp(v_context . v_center) divided by the denominator returned
    by GensimModelConditionalProbabilityDenominator(center).
    """
    vectors = gensim_model.wv
    score = vectors[context].dot(vectors[center])
    return exp(score) / GensimModelConditionalProbabilityDenominator(center)

def MyModelConditionalProbabilityDenominator ( center_word ) :
    """Softmax denominator of the notebook's own model for `center_word`.

    Sums exp(v_center . v_word) over every word in the global VOCAB (the
    intersection of both models' vocabularies) and memoizes the result in
    the global MMCPD dict, keyed by `center_word`.
    """
    if center_word in MMCPD :
        return MMCPD[center_word]
    cwv = model[center_word]
    # Accumulate locally and store only the finished sum: if exp() or a
    # lookup raises midway, the cache must not keep a partial value that
    # later calls would return as if it were complete.
    total = 0.0
    for word in VOCAB :
        total += exp(cwv.dot(model[word]))
    MMCPD[center_word] = total
    return total

def MyModelConditionalProbability ( center , context ) :
    """Softmax probability of `context` given `center` under the notebook's own model.

    Computes exp(v_context . v_center) divided by the denominator returned
    by MyModelConditionalProbabilityDenominator(center).
    """
    score = model[context].dot(model[center])
    return exp(score) / MyModelConditionalProbabilityDenominator(center)

def GensimModelPerformanceMetric ( center_context_pairs , token_count ) :
    """Score the Gensim model on a list of ((center, context), freq) items.

    The cost is the frequency-weighted sum of log conditional probabilities
    multiplied by -1 / token_count. The returned score is the logistic
    sigmoid of 1 / cost.
    """
    weighted_log_prob = 0.0
    for pair, freq in center_context_pairs :
        center, context = pair
        weighted_log_prob += freq * log( GensimModelConditionalProbability(center, context) )
    cost = weighted_log_prob * ( -1 / token_count )
    inverse_cost = 1 / cost
    return 1 / ( 1 + exp(-inverse_cost) )

def MyModelPerformanceMetric ( center_context_pairs , token_count ) :
    """Score the notebook's own model on a list of ((center, context), freq) items.

    The cost is the frequency-weighted sum of log conditional probabilities
    multiplied by -1 / token_count. The returned score is the logistic
    sigmoid of 1 / cost.
    """
    weighted_log_prob = 0.0
    for pair, freq in center_context_pairs :
        center, context = pair
        probability = MyModelConditionalProbability(center, context)
        weighted_log_prob += freq * log( probability )
    cost = weighted_log_prob * ( -1 / token_count )
    inverse_cost = 1 / cost
    return 1 / ( 1 + exp(-inverse_cost) )

def GenerateCenterContextPairs ( filepath , window , start = None , end = None ) :
    """Count (center, context) word pairs in a corpus file.

    The text is lower-cased and split into sentences; only sentences
    [start:end] are used (None leaves that side unbounded). Within each
    sentence, tokens that are not in the global VOCAB are dropped, and every
    remaining token is paired with each other token at most `window`
    positions away.

    Parameters:
        filepath -- path of the text corpus.
        window   -- maximum distance (integer, in filtered tokens) between
                    center and context.
        start    -- index of the first sentence to use, or None.
        end      -- index one past the last sentence to use, or None.

    Returns:
        (pairs, T) where `pairs` is a list of ((center, context), count)
        items in first-seen order and T is the number of in-vocabulary
        tokens processed.
    """
    # The context manager closes the file even when read() raises.
    with open(filepath, 'r') as corpus_file :
        testtext = corpus_file.read().lower()
    # A None bound in a slice means "unbounded", which covers all four
    # start/end combinations at once.
    sents = sent_tokenize(testtext)[start:end]
    pair_counts = Counter()
    T = 0
    for sent in sents :
        refined = [x for x in word_tokenize(sent) if x in VOCAB]
        sent_len = len(refined)
        T += sent_len
        for ceni, center in enumerate(refined) :
            # Visit only the positions inside the window instead of scanning
            # the whole sentence for every center word.
            first = max(0, ceni - window)
            last = min(sent_len, ceni + window + 1)
            for coni in range(first, last) :
                if coni != ceni :
                    pair_counts[(center, refined[coni])] += 1
    return list(pair_counts.items()), T

def InitializeMaps ( ) :
    """Reset the two global denominator caches (MMCPD and GMCPD) to fresh empty dicts."""
    global MMCPD, GMCPD
    MMCPD = {}
    GMCPD = {}

In [33]:
TrainGensimModel('TRAIN_CORPORA/CORPUS.txt')

In [37]:
pairs, token_count = GenerateCenterContextPairs('TRAIN_CORPORA/CORPUS.txt', 5, end = 1000)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(16625, 0.502063831703339, 0.5001855730001165)

In [38]:
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST1.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(1611, 0.5017167134921486, 0.5001708895156772)

In [39]:
# Evaluate both models on the TEST2 corpus. The dangling `end = ` keyword
# (a syntax error) is dropped, so every sentence is used, as in the other
# test-corpus cells.
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST2.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(3301, 0.5016301419199041, 0.5001786122073575)

In [40]:
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST3.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(3929, 0.5016616919100826, 0.5001710064483414)

In [41]:
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST4.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(5110, 0.50176414643688, 0.5001733606332893)

In [42]:
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST5.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(6026, 0.501642867857437, 0.500176334821602)

In [43]:
pairs, token_count = GenerateCenterContextPairs('TEST_CORPORA/TEST6.txt', 5)
InitializeMaps()
my_model_perf_metr = MyModelPerformanceMetric(pairs, token_count)
gensim_model_perf_metr = GensimModelPerformanceMetric(pairs, token_count)
token_count, my_model_perf_metr, gensim_model_perf_metr

(7035, 0.5017005127324882, 0.500176469930449)