# <span style='color:red'> Gensim </span>
- Gensim - Package for Topic Modeling, text processing
- uses LDA, LSI 
- Much wider and deeper usage than text package available in Scikit Learn , R etc
- Gensim - handles large text file efficiently without having to load all texts in one go into RAM
- Dictionary : Convert texts to tokens and creates unique ids for each of these tokens/words
- Token : Words
- Documents : Sentence, Paragraph
- Corpus: Collection of documents as collection of words => word id+frequency in each DOCUMENT.   
__For topic modeling we require DIctionary (for unique word) and corpus (bag of words fo each document = word_id,Freq of word_id for each document__

## <span style='color:brown'>GENSIM allows to read senteneces line by line without loading entire text file into RAM, yet read line by line and update DICTIONARY, as and when encounters new word</brown>

In [1]:
import warnings

In [2]:
import gensim



In [3]:
from gensim import corpora
from pprint import pprint

In [4]:
# Create dictionary from list of texts 
documents = ["The Saudis are preparing a report that will acknowledge that", 
             "Saudi journalist Jamal Khashoggi's death was the result of an", 
             "interrogation that went wrong, one that was intended to lead", 
             "to his abduction from Turkey, according to two sources."]

In [5]:
texts = [[text for text in doc.split()] for doc in documents]

In [6]:
dictionary=corpora.Dictionary(texts)

In [7]:
print(dictionary)

Dictionary(33 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [8]:
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32}


In [9]:
#Append new dictionary
documents_2 = ["One source says the report will likely conclude that", 
                "the operation was carried out without clearance and", 
                "transparency and that those involved will be held", 
                "responsible. One of the sources acknowledged that the", 
                "report is still being prepared and cautioned that", 
                "things could change."]

In [10]:
texts2=[[text for text in doc.split()] for doc in documents_2]

In [11]:
print(texts2)

[['One', 'source', 'says', 'the', 'report', 'will', 'likely', 'conclude', 'that'], ['the', 'operation', 'was', 'carried', 'out', 'without', 'clearance', 'and'], ['transparency', 'and', 'that', 'those', 'involved', 'will', 'be', 'held'], ['responsible.', 'One', 'of', 'the', 'sources', 'acknowledged', 'that', 'the'], ['report', 'is', 'still', 'being', 'prepared', 'and', 'cautioned', 'that'], ['things', 'could', 'change.']]


In [12]:
dictionary.add_documents(texts2)

In [13]:
print(dictionary)

Dictionary(60 unique tokens: ['Saudis', 'The', 'a', 'acknowledge', 'are']...)


In [14]:
print(dictionary.token2id)

{'Saudis': 0, 'The': 1, 'a': 2, 'acknowledge': 3, 'are': 4, 'preparing': 5, 'report': 6, 'that': 7, 'will': 8, 'Jamal': 9, "Khashoggi's": 10, 'Saudi': 11, 'an': 12, 'death': 13, 'journalist': 14, 'of': 15, 'result': 16, 'the': 17, 'was': 18, 'intended': 19, 'interrogation': 20, 'lead': 21, 'one': 22, 'to': 23, 'went': 24, 'wrong,': 25, 'Turkey,': 26, 'abduction': 27, 'according': 28, 'from': 29, 'his': 30, 'sources.': 31, 'two': 32, 'One': 33, 'conclude': 34, 'likely': 35, 'says': 36, 'source': 37, 'and': 38, 'carried': 39, 'clearance': 40, 'operation': 41, 'out': 42, 'without': 43, 'be': 44, 'held': 45, 'involved': 46, 'those': 47, 'transparency': 48, 'acknowledged': 49, 'responsible.': 50, 'sources': 51, 'being': 52, 'cautioned': 53, 'is': 54, 'prepared': 55, 'still': 56, 'change.': 57, 'could': 58, 'things': 59}


# <span style='çolor:blue'> Create dictionary from one or more files </span>
- Created one notepad as Sample Txt , stored in the same path where am executing this script
- use gensim util simple process to read file line by line without loading into RAM

In [15]:
from gensim.utils import simple_preprocess
from smart_open import smart_open
import os

In [16]:
import numpy as np

In [17]:
import pandas as pd

In [18]:
dictionary=corpora.Dictionary(simple_preprocess(line,deacc=True) for line in open('sample.txt'))

In [19]:
print(dictionary)

Dictionary(93 unique tokens: ['army', 'china', 'chinese', 'force', 'liberation']...)


## Read one line at a time from multiple files

In [20]:
class ReadTxtFiles(object):
    def __init__(self,dirname):
        self.dirname = dirname
        
    def __iter__(self):
        for fname in os.listdir(self.dirname):
            for line in open(os.path.join(self.dirname,fname),encoding='latin'):
                yield simple_preprocess(line)

In [21]:
path_to_text_directory ='E:/Niraj/Niraj Personal/Learnings/Machine Learning/Data/datasets/lda_sports_politics_docs'

In [22]:
dictionary = corpora.Dictionary(ReadTxtFiles(path_to_text_directory))

In [23]:
print(dictionary)

Dictionary(959 unique tokens: ['across', 'activity', 'although', 'and', 'are']...)


# <span style='color:purple'>Generator vs Iteration vs Iterables</span>
- Generator : Data gets exhausted once burned thru Gen
- Iterable  : Data gets repeated after ever iteration , its like new for loop

In [24]:
generator = range(1,10,2)

In [25]:
generator

range(1, 10, 2)

In [26]:
for val in (generator):
    print(val),

1
3
5
7
9


In [27]:
generator = (word + '!' for word in (("baby let me iterate ya").split()))

In [28]:
for i in generator:
    print(i)

baby!
let!
me!
iterate!
ya!


In [29]:
for i in generator:
    print(i)

In [30]:
class Beyonceiterable(object):
    def __iter__(self):
        for word in "baby let me iterate over ya".split():
            yield word +"!"

In [31]:
iterable =Beyonceiterable()

In [32]:
for i in iterable:
    print(i)

baby!
let!
me!
iterate!
over!
ya!


In [33]:
listname=('nir','pan','pagla')

In [34]:
myiter=iter(listname)

In [35]:
print(next(myiter))

nir


# <span style='color:red'> Create BOW (Bag of Words) </span>
- collect tokens
- pass them to doc2bow function of Dictionary method by corpoa
- Creating 'BOW' is Gensim's Document-Term matrix => It creates matrix with information about word, frequncy and document number

In [36]:
#test
my_docs =["who let's the dog out but who","Who?Who?Who?Who?"]

In [37]:
#Tokenize documents
my_tokens=[simple_preprocess(doc) for doc in my_docs]

In [38]:
my_tokens

[['who', 'let', 'the', 'dog', 'out', 'but', 'who'],
 ['who', 'who', 'who', 'who']]

In [39]:
# Create the corpus
# Create object for dictionary
mydict=corpora.Dictionary()

In [40]:
mycorpus=[mydict.doc2bow(doc,allow_update=True) for doc in my_tokens]

In [41]:
print(my_tokens)

[['who', 'let', 'the', 'dog', 'out', 'but', 'who'], ['who', 'who', 'who', 'who']]


In [42]:
## Testing
print(corpora.Dictionary(my_tokens))

Dictionary(6 unique tokens: ['but', 'dog', 'let', 'out', 'the']...)


In [43]:
mydict1=corpora.Dictionary(my_tokens)

In [44]:
for i in range(len(mydict)):
    print(mydict[i])

but
dog
let
out
the
who


In [45]:
pprint(mycorpus)

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 2)], [(5, 4)]]


In [46]:
#How to intepret and decode
[[(mydict[id],count)for id,count in doc]for doc in mycorpus]


[[('but', 1), ('dog', 1), ('let', 1), ('out', 1), ('the', 1), ('who', 2)],
 [('who', 4)]]

# <span style='color:red'>Create BOW corpus from TEXT Files</span>
- Design a class 
- init method with path and dictionary as objects
- iter as iterative to read file and lines iteratively , one line at a time, meaning no loading to the RAM in one GO
- instantiate the class and create corpus

In [47]:
class BoWCorpus(object):
    def __init__(self,path,dictionary):
        self.filepath=path
        self.dictionary=dictionary
    def __iter__(self):
        global mydict #OPTIONAl ; only if updating source dictionary
        for line in smart_open(self.filepath):
            # Tokenize
            tokenized_list=simple_preprocess(line,deacc=True)
            
            #create bag of words
            bow=self.dictionary.doc2bow(tokenized_list,allow_update=True)
            
            #update source dictionary
            mydict.merge_with(self.dictionary)
            
            #return bow
            yield bow

In [48]:
# WhatsApp Chat msg of Sood
tokenlized_wa = [simple_preprocess(line,deacc=True) for line in open('SoodWAChat090520.txt')]

In [49]:
new_doc_wa= [wd for wd in tokenlized_wa]

In [50]:
#Create the Dictionary object
mydict=corpora.Dictionary()
doc_proccessed=[]

In [51]:
# Prepare bag of word corpus of Sood WA Chat
bow_corpus_sood= BoWCorpus('SoodWAChat090520.txt',mydict)

In [52]:
#Create Corpus
bow_corpus= BoWCorpus('sample.txt',mydict)

In [53]:
print(mydict)

Dictionary(0 unique tokens: [])


In [54]:
for line in bow_corpus_sood:
    print(line)

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 2), (14, 1), (15, 1), (16, 2), (17, 1), (18, 3), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 3), (31, 1), (32, 1), (33, 2), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1)]
[(1, 2), (2, 2), (22, 1), (30, 2), (33, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 2), (44, 1), (45, 2), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 1), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1)]
[(18, 1), (19, 1), (30, 5), (31, 1), (33, 2), (35, 1), (45, 1), (52, 1), (56, 2), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 3), (84, 1), (85, 1), (86, 1), (87, 2), (88, 1), (89, 4), (90, 1), (91, 1), (92, 1), (93, 1), (94, 1), (95, 1), (

In [55]:
df6=pd.DataFrame(bow_corpus_sood)

In [56]:
#df6

In [57]:
corpus_name=[[(mydict[id],count) for id,count in line]for line in bow_corpus_sood]
#We can display corpus_name

In [58]:
from gensim import models
import numpy as np

In [59]:
documents=  ["This is the first line",
             "This is the second sentence",
             "This third document"]

In [60]:
mydict=corpora.Dictionary()

In [61]:
token_list= [simple_preprocess(doc) for doc in documents]

In [62]:
dictionary=mydict.from_documents(token_list)

In [63]:
print(dictionary)

Dictionary(9 unique tokens: ['first', 'is', 'line', 'the', 'this']...)


In [64]:
#Corpus
corpus=[dictionary.doc2bow(token) for token in token_list]

In [65]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (7, 1), (8, 1)]]

In [66]:
[[(dictionary[id],count) for id,count in corpus_list] for corpus_list in corpus]

[[('first', 1), ('is', 1), ('line', 1), ('the', 1), ('this', 1)],
 [('is', 1), ('the', 1), ('this', 1), ('second', 1), ('sentence', 1)],
 [('this', 1), ('document', 1), ('third', 1)]]

# <span style ='color:blue'>Enough of exercise, lets sum up the steps </span>

- Documents to token
- token to dictionary
- documents to corpus consisting with dictionary 

In [67]:
# Craete Disctionary and Corpus
mydict= corpora.Dictionary([simple_preprocess(line) for line in documents])
#mydict.add_documents([['hi','bye']])

#mydict=corpora.Dictionary()
corpus = [mydict.doc2bow(simple_preprocess(line)) for line in documents]


In [68]:
#for i in range(len(mydict)):
#    print(mydict[i])

In [69]:
corpus

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(1, 1), (3, 1), (4, 1), (5, 1), (6, 1)],
 [(4, 1), (7, 1), (8, 1)]]

# <span style='color:red'>How to save dictionary and corpus to the disk</span>

In [70]:
mydict.save('my_dictionary')
corpora.MmCorpus.serialize('bow_corpus.nm',corpus)

# <span style='color:green'> TF-IDF Matrix in Gensim </span>
- Multiply local Term Frequency with Inverse Global Term and Normalizes result to Unit LEngth
- Words with high frequency will get down weghed
- Genesim uses <a href ='https://en.wikipedia.org/wiki/SMART_Information_Retrieval_System'>SMART INFORMATION RETRIEVAL SYSTEM</a> to calculate TF IDF weigh
- We can use 'smartirs' parameter in TFIDF model to calculate this term
- Normalizing the CORPUS factoring local component i.e. TF (Term Frequency) and Global component i.e. IDF (Inverse Doc. frequnecy)
- If certain words only come with high frequency in less documents the yield high value
- If words come in all documents then yield low value

In [71]:
# TF-IDF

tfidf=models.TfidfModel(corpus,smartirs='ntc')

In [72]:
for doc in tfidf[corpus]:
    print([[mydict[id],np.around(freq,decimals=2)]for id, freq in doc])

[['first', 0.66], ['is', 0.24], ['line', 0.66], ['the', 0.24]]
[['is', 0.24], ['the', 0.24], ['second', 0.66], ['sentence', 0.66]]
[['document', 0.71], ['third', 0.71]]


# <span style='color:green'>Observation above after TF-IDF</span>
- high frequency words across documents are reduced with value such as 'is','the'
- Words coming always such as 'this' has been removed

# <span style='color:red'>Use GENSIM downloader API to load dataset </span>

In [73]:
import gensim.downloader as api

In [74]:
#download 'glove-wiki-gigaword-50' model

In [75]:
api.info('glove-wiki-gigaword-50')

{'base_dataset': 'Wikipedia 2014 + Gigaword 5 (6B tokens, uncased)',
 'checksum': 'c289bc5d7f2f02c6dc9f2f9b67641813',
 'description': 'Pre-trained vectors based on Wikipedia 2014 + Gigaword, 5.6B tokens, 400K vocab, uncased (https://nlp.stanford.edu/projects/glove/).',
 'file_name': 'glove-wiki-gigaword-50.gz',
 'file_size': 69182535,
 'license': 'http://opendatacommons.org/licenses/pddl/',
 'num_records': 400000,
 'parameters': {'dimension': 50},
 'parts': 1,
 'preprocessing': 'Converted to w2v format with `python -m gensim.scripts.glove2word2vec -i <fname> -o glove-wiki-gigaword-50.txt`.',
 'read_more': ['https://nlp.stanford.edu/projects/glove/',
  'https://nlp.stanford.edu/pubs/glove.pdf'],
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/glove-wiki-gigaword-50/__init__.py'}

In [76]:
#download
w2v_model=api.load('glove-wiki-gigaword-50')

In [77]:
np.version.version

'1.15.4'

In [78]:
w2v_model.most_similar('red')

  if np.issubdtype(vec.dtype, np.int):


[('yellow', 0.8995457887649536),
 ('blue', 0.8901658654212952),
 ('green', 0.8561932444572449),
 ('black', 0.8400583267211914),
 ('purple', 0.8323202133178711),
 ('white', 0.8149363398551941),
 ('pink', 0.8148657083511353),
 ('orange', 0.8042871952056885),
 ('golden', 0.7416437864303589),
 ('colored', 0.7381109595298767)]

# Bigram and Trigram model using Phraser models

In [79]:
api.info('text8')

{'checksum': '68799af40b6bda07dfa47a32612e5364',
 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.',
 'file_name': 'text8.gz',
 'file_size': 33182058,
 'license': 'not found',
 'num_records': 1701,
 'parts': 1,
 'read_more': ['http://mattmahoney.net/dc/textdata.html'],
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py',
 'record_format': 'list of str (tokens)'}

In [80]:
dataset=api.load('text8')

In [81]:
#df1=pd.DataFrame(dataset)

In [82]:
#df1.shape

In [83]:
#df1.tail()

In [84]:
dataset=[wd for wd in dataset]

In [85]:
#np.shape(dataset[1699])

In [86]:
np.shape(dataset)

(1701,)

In [87]:
mydict=corpora.Dictionary(dataset)

In [88]:
#print(mydict)

In [89]:
# mycorpa prepares the list of pair of numbers , 1st is the dictionary id for teh word and 2nd the frequency / occurences of such word in that partiular record
mycorpa=[mydict.doc2bow(line) for line in dataset]

In [90]:
#[[mydict[i],j] for i,j in mycorpa[1]]

## Prepare BIGRAM Model

In [91]:
bigram=gensim.models.phrases.Phrases(dataset,min_count=3,threshold=10)

In [92]:
#print(bigram[dataset[0]])

In [93]:
print(bigram['french','revolution','language','green','revolution'])

['french_revolution', 'language', 'green', 'revolution']




In [94]:
print(bigram['russian','revolution','niraj'])

['russian_revolution', 'niraj']




## Prepare Trigram : pass the o/p of bigram data to the model definition

In [95]:
#build Trigram Model
trigram=gensim.models.phrases.Phrases(bigram[dataset],min_count=5,threshold=10)



In [96]:
print(np.size(bigram[dataset[1]]))

9049




In [97]:
# Construct TRIGRAM
print(np.size(trigram[bigram[dataset[1]]]))



8750


In [98]:
print(np.size(bigram[dataset[1]]))

9049




In [99]:
#print(bigram[dataset[1]])

In [100]:
#print(trigram[bigram[dataset[0]]])

In [101]:
np.size(dataset[1])

10000

In [102]:
#bigram[dataset[0]]

# <span style='color:red'>LDA : Latent Dirichlet Allocation</span>
- reference <a>https://www.youtube.com/watch?v=DWJYZq_fQ2A</a>
- Used for topic modeling, developed by BIES in 2003
- Based upon probabilistic distribution
- Use cases : Topic for text / documentation, semantic analysis, bioinformatiocs, object localization for image
- Document : Topic modeling context , referes to probability distribution of latent topics
- Topic : each topic concerns different probability distribution of words

__ Plate Notation __
- Parameters 

$\alpha :$ Is the parameter of Drichlet Prior on the Per document latent topic distribution  
$\beta  :$ Is the parameter of Drichlet prior on the per-topic word distribution  
$\theta{_m}$: Topic distribution for document (m)  
$z{_m}{_n}$:Topic for nth word in document (m)  
$w{_m}{_n}$:word (document m, word n)

# <span style='color:blue'>Lecture by Prof. David Blei </span>
<a> https://www.youtube.com/watch?v=FkckgwMHP2s</a>
- Annonate Documents
- Organize, Visualize
- Collaborative topic modeling , people read books, texts ...we can learn from their behaviour and what they are reading
- EM Algorithm

- Visualize LDA outputs as distribute documents to the corner of TOPICS


# <span style='color:brown'>Generative process</span>
- generate document with set of words matching certain topic and words distribution
- Deteremine no of words in a document
- Choose a TOPIC mixture for the document ; say Topic A = 20%, B= 30%, C= 50%
- Generate the words in document by (a) Pick up a topic following above multinomial distribution (b) pick up words based upon topic multinomial distribution

# <span style='color:brown'>Working backwards</span>
- Suppose we have copus of documents
- We wish LDA to learn a topic representation of K Topics is each document and word distributio for each topic
- LDA back tracks from document level to identify topics that are likely to have generated the corpus

## <span style='color:red'> Process</red>
- Randomly assign 1 of K topics to each word in each document
- For each document d
    - Assume all topic assigmnents except for the current one is CORRECT
    - Create two proportion (a) Proportion of words currently assigns to topic t = P(topic t|documnet d)
    - (b) Proportion of words in all documents correspond to the topic t = P(word w|topic t)
- Multiple these two Probs and assign w a new topic based upon teh resultant prob
    - P(topic t|document d)*P(word w|topic t)
- Eventuallly we reach a steady state where assignment makes sense

# <span style='color:green'>CONCLUSION:</span>
- document is the PD over topics
- topic is PD over words
- LDA takes number of documents , where words in each document is related
- It then tries to figure out RECIPE as how each document could have been created
- We can tell Model how many topics to create
- Based upon that model/receipe we can fi d out similar documents within corpus

# <span style='color:red'>Topic Models with LDA</span>

In [103]:
# Step 0: Import packages and stopword
from gensim.models import LdaModel, LdaMulticore
import gensim.downloader as api
from gensim.utils import simple_preprocess,lemmatize
from nltk.corpus import stopwords
import re
import logging


In [104]:
logging.basicConfig(format='%(asctime)s :%(levelname)s :%(message)s')
logging.root.setLevel(level=logging.INFO)
stop_words=stopwords.words('english')
stop_words=stop_words+['com', 'edu', 'subject', 'lines', 'organization', 'would', 'article', 'could']

In [105]:
#step 1: import dataset
data = api.load('text8')

In [106]:
api.info('text8')

{'checksum': '68799af40b6bda07dfa47a32612e5364',
 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.',
 'file_name': 'text8.gz',
 'file_size': 33182058,
 'license': 'not found',
 'num_records': 1701,
 'parts': 1,
 'read_more': ['http://mattmahoney.net/dc/textdata.html'],
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py',
 'record_format': 'list of str (tokens)'}

In [107]:
type(data)

text8.Dataset

In [108]:
df2=pd.DataFrame(data)

In [109]:
df2.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
1696,the,format,although,there,are,several,third,party,tools,which,...,aggression,pact,was,signed,with,provisions,that,included,consultation,arbitration
1697,if,either,party,disagreed,neutrality,if,either,went,to,war,...,willie,mccovey,satchel,paige,and,ozzie,smith,new,york,city
1698,and,chicago,are,first,and,second,respectively,the,pop,band,...,two,eight,three,two,three,six,four,zero,four,four
1699,four,five,four,eight,four,nine,five,zero,five,two,...,medal,history,historical,myths,history,of,poland,here,you,can
1700,find,list,of,once,popular,beliefs,or,beliefs,which,are,...,,,,,,,,,,


In [110]:
data =[d for d in data]

In [111]:
np.size(data[3])


10000

In [112]:
np.shape(data[:100])

(100, 10000)

In [113]:
#Test
lemmatize('pitenge',allowed_tags=re.compile('NN'))[0].split(b'/')[0].decode('utf-8')

'pitenge'

In [114]:
#lemmatize('do not terrorize the environment',allowed_tags=re.compile('(NN|JJ|RB)'))[1].split(b'/')[0].decode('utf-8')

In [115]:
data_processed=[]
for i,doc in enumerate(data[:100]+tokenlized_wa):
    doc_out=[]
    for wd in doc:
        if wd not in stop_words:
            lemmatized_word=lemmatize(wd,allowed_tags=re.compile('(NN|JJ|RB)'))
            if lemmatized_word:
                doc_out = doc_out+[lemmatized_word[0].split(b'/')[0].decode('utf-8')]
            else:
                continue
    data_processed.append(doc_out)
            

In [116]:
len(data_processed)

111

In [117]:
np.shape(data_processed)

(111,)

In [118]:
print(data_processed[2][:10])

['aegis', 'zeus', 'battlefield', 'entire', 'trojan', 'army', 'wall', 'troy', 'achille', 'wrath']


In [119]:
lemmatize(tokenlized_wa[0][1],allowed_tags=re.compile('(NN|JJ|RB)'))[0].split(b'/')[0].decode('utf-8')

'hilarious'

# <span style='color:red'>Create dictionary and corpus as input to the LDA model</span>

In [120]:
dict=corpora.Dictionary(data_processed)

2020-08-01 21:25:55,461 :INFO :adding document #0 to Dictionary(0 unique tokens: [])
2020-08-01 21:25:55,952 :INFO :built Dictionary(40130 unique tokens: ['ability', 'able', 'abnormal', 'abolition', 'absence']...) from 111 documents (total 425825 corpus positions)


In [121]:
df3=pd.DataFrame(dict)

In [122]:
[dict[i] for i in df3.index[-10:]]

['raha',
 'rahi',
 'upshot',
 'oaant',
 'oaat',
 'oarn',
 'phaan',
 'phor',
 'shay',
 'uill']

In [123]:
corpus=[dict.doc2bow(line) for line in data_processed]

In [124]:
print(dict)

Dictionary(40130 unique tokens: ['ability', 'able', 'abnormal', 'abolition', 'absence']...)


In [125]:
np.shape(corpus[1])

(1850, 2)

In [126]:
# Test how does corpus - word id from dctionary and frequency per document looks like
# Remove comment to run the command
#[print(dict[i],count) for i,count in corpus[1]]

In [137]:
np.shape(corpus[2])

(1632, 2)

In [127]:
lda_model=LdaMulticore(corpus,id2word=dict,random_state=100,num_topics=7,passes=10,batch=False,alpha='asymmetric',decay=0.5,offset=64,eta=None,eval_every=0,iterations=100,
                   gamma_threshold=0.001,per_word_topics=True)

2020-08-01 21:25:57,563 :INFO :using asymmetric alpha [0.26219156, 0.19027454, 0.14931786, 0.12287004, 0.104381524, 0.090729296, 0.080235206]
2020-08-01 21:25:57,581 :INFO :using symmetric eta at 0.14285714285714285
2020-08-01 21:25:57,646 :INFO :using serial LDA version on this node
2020-08-01 21:25:57,968 :INFO :running online LDA training, 7 topics, 10 passes over the supplied corpus of 111 documents, updating every 6000 documents, evaluating every ~0 documents, iterating 100x with a convergence threshold of 0.001000
2020-08-01 21:25:58,024 :INFO :training LDA model using 3 processes
2020-08-01 21:26:02,166 :INFO :PROGRESS: pass 0, dispatched chunk #0 = documents up to #111/111, outstanding queue size 1
2020-08-01 21:26:37,534 :INFO :topic #6 (0.080): 0.001*"also" + 0.001*"state" + 0.001*"many" + 0.001*"time" + 0.001*"first" + 0.001*"year" + 0.001*"american" + 0.001*"person" + 0.001*"war" + 0.001*"new"
2020-08-01 21:26:37,537 :INFO :topic #5 (0.091): 0.001*"american" + 0.001*"also" 

2020-08-01 21:26:49,305 :INFO :topic #0 (0.262): 0.001*"also" + 0.000*"first" + 0.000*"time" + 0.000*"state" + 0.000*"person" + 0.000*"american" + 0.000*"year" + 0.000*"many" + 0.000*"new" + 0.000*"war"
2020-08-01 21:26:49,310 :INFO :topic diff=0.134342, rho=0.120337
2020-08-01 21:26:50,654 :INFO :-9.326 per-word bound, 641.8 perplexity estimate based on a held-out corpus of 111 documents with 425825 words
2020-08-01 21:26:50,656 :INFO :PROGRESS: pass 6, dispatched chunk #0 = documents up to #111/111, outstanding queue size 1
2020-08-01 21:26:51,467 :INFO :topic #6 (0.080): 0.005*"also" + 0.004*"state" + 0.003*"many" + 0.003*"person" + 0.003*"time" + 0.003*"first" + 0.003*"war" + 0.003*"world" + 0.003*"year" + 0.002*"language"
2020-08-01 21:26:51,470 :INFO :topic #5 (0.091): 0.001*"american" + 0.001*"also" + 0.001*"first" + 0.000*"year" + 0.000*"name" + 0.000*"albert" + 0.000*"world" + 0.000*"new" + 0.000*"however" + 0.000*"war"
2020-08-01 21:26:51,474 :INFO :topic #2 (0.149): 0.004*"a

In [128]:
###lda_model=LdaMulticore(corpus,id2word=dict,random_state=100,num_topics=7,passes=10,chunksize=1000,batch=False,alpha='asymmetric',decay=0.5,offset=64,eta=None,eval_every=0,iterations=100,
###                   gamma_threshold=0.001,per_word_topics=True)

In [129]:
lda_model.save('lda_model.model')

2020-08-01 21:26:59,976 :INFO :saving LdaState object under lda_model.model.state, separately None
2020-08-01 21:27:00,196 :INFO :saved lda_model.model.state
2020-08-01 21:27:00,254 :INFO :saving LdaMulticore object under lda_model.model, separately ['expElogbeta', 'sstats']
2020-08-01 21:27:00,255 :INFO :storing np array 'expElogbeta' to lda_model.model.expElogbeta.npy
2020-08-01 21:27:00,475 :INFO :not storing attribute state
2020-08-01 21:27:00,479 :INFO :not storing attribute dispatcher
2020-08-01 21:27:00,483 :INFO :not storing attribute id2word
2020-08-01 21:27:00,492 :INFO :saved lda_model.model


In [130]:
lda_model.print_topics(-1)

2020-08-01 21:27:00,513 :INFO :topic #0 (0.262): 0.001*"also" + 0.000*"first" + 0.000*"time" + 0.000*"state" + 0.000*"person" + 0.000*"american" + 0.000*"year" + 0.000*"many" + 0.000*"new" + 0.000*"war"
2020-08-01 21:27:00,519 :INFO :topic #1 (0.190): 0.004*"american" + 0.004*"also" + 0.004*"football" + 0.003*"acid" + 0.003*"player" + 0.003*"audi" + 0.003*"ball" + 0.003*"play" + 0.003*"team" + 0.002*"first"
2020-08-01 21:27:00,524 :INFO :topic #2 (0.149): 0.005*"agave" + 0.004*"also" + 0.004*"state" + 0.003*"first" + 0.003*"many" + 0.003*"time" + 0.003*"person" + 0.002*"book" + 0.002*"year" + 0.002*"apollo"
2020-08-01 21:27:00,528 :INFO :topic #3 (0.123): 0.006*"american" + 0.004*"also" + 0.004*"first" + 0.004*"year" + 0.003*"state" + 0.003*"time" + 0.003*"day" + 0.003*"world" + 0.002*"many" + 0.002*"new"
2020-08-01 21:27:00,531 :INFO :topic #4 (0.104): 0.001*"also" + 0.001*"american" + 0.000*"state" + 0.000*"first" + 0.000*"many" + 0.000*"time" + 0.000*"year" + 0.000*"number" + 0.000*

[(0,
  '0.001*"also" + 0.000*"first" + 0.000*"time" + 0.000*"state" + 0.000*"person" + 0.000*"american" + 0.000*"year" + 0.000*"many" + 0.000*"new" + 0.000*"war"'),
 (1,
  '0.004*"american" + 0.004*"also" + 0.004*"football" + 0.003*"acid" + 0.003*"player" + 0.003*"audi" + 0.003*"ball" + 0.003*"play" + 0.003*"team" + 0.002*"first"'),
 (2,
  '0.005*"agave" + 0.004*"also" + 0.004*"state" + 0.003*"first" + 0.003*"many" + 0.003*"time" + 0.003*"person" + 0.002*"book" + 0.002*"year" + 0.002*"apollo"'),
 (3,
  '0.006*"american" + 0.004*"also" + 0.004*"first" + 0.004*"year" + 0.003*"state" + 0.003*"time" + 0.003*"day" + 0.003*"world" + 0.002*"many" + 0.002*"new"'),
 (4,
  '0.001*"also" + 0.001*"american" + 0.000*"state" + 0.000*"first" + 0.000*"many" + 0.000*"time" + 0.000*"year" + 0.000*"number" + 0.000*"world" + 0.000*"day"'),
 (5,
  '0.001*"american" + 0.001*"also" + 0.000*"first" + 0.000*"year" + 0.000*"name" + 0.000*"albert" + 0.000*"world" + 0.000*"new" + 0.000*"however" + 0.000*"war"'),


# <span style='color:blue'>LDA Observation</span>
- Words viz. 'many', 'also' have occured across many topics , may be we can bring these words in teh STOP WORD list
- LDAMulticore () supports multi processors , may be we can run LDAModel()

# <span style='color:purple'>LDA Interpretation</span>
- If we pass a list of words (document) to the model i.e. lda_model
- Three things (a) Topics that document belongs to with % (b) Topic each word in that document belongs to (c) (b) and PHI value
- 

# Run the LDA_MODEL against a set of corpus , say corpus [5:8]
__Output produces the following:-__
- Distribution of topic for each document, i.e. list of topics with its % for each document say 
- List of topic for each word in each document
- PHI value : for each word in each document what is the % of each topic


In [131]:
for c in lda_model[corpus[5:8]]:
    print('Document topics            :',c[0])
    print('Word id, topic             :',c[1][:3])
    print('Word id, topic & PHI val   :',c[2][:3]) #[(word id,[(topic, phi value)])]
    print('word, topic                :',[(dict[wd],topic) for wd,topic in c[1][:2]])
    print('word, PHI Value            :',[(dict[wd],topic) for wd,topic in c[2][:2]])
    print('--------------------------------------------------------------\n')

Document topics            : [(2, 0.97002685), (6, 0.029784068)]
Word id, topic             : [(0, [2, 6]), (7, [2, 6]), (10, [2, 6])]
Word id, topic & PHI val   : [(0, [(2, 2.8966966), (6, 0.10330065)]), (7, [(2, 0.97934216), (6, 0.020654295)]), (10, [(2, 0.96770024), (6, 0.032299228)])]
word, topic                : [('ability', [2, 6]), ('absurdity', [2, 6])]
word, PHI Value            : [('ability', [(2, 2.8966966), (6, 0.10330065)]), ('absurdity', [(2, 0.97934216), (6, 0.020654295)])]
--------------------------------------------------------------

Document topics            : [(2, 0.9615397), (6, 0.03827158)]
Word id, topic             : [(0, [2, 6]), (10, [2, 6]), (16, [2, 6])]
Word id, topic & PHI val   : [(0, [(2, 5.7346497), (6, 0.26534584)]), (10, [(2, 2.8754706), (6, 0.12452758)]), (16, [(2, 0.9751281), (6, 0.024870958)])]
word, topic                : [('ability', [2, 6]), ('academic', [2, 6])]
word, PHI Value            : [('ability', [(2, 5.7346497), (6, 0.26534584)]), ('ac

# Run the LDA_MODEL against a set of corpus , say Atul Sood corpus ,which is last 11 records
__Output produces the following:-__
- Distribution of topic for each document, i.e. list of topics with its % for each document say 
- List of topic for each word in each document
- PHI value : for each word in each document what is the % of each topic


In [141]:
for c in lda_model[corpus[101:111]]:
    print('Document topics            :',c[0])
    print('Word id, topic             :',c[1][:3])
    print('Word id, topic & PHI val   :',c[2][:3]) #[(word id,[(topic, phi value)])]
    print('word, topic                :',[(dict[wd],topic) for wd,topic in c[1][:2]])
    print('word, PHI Value            :',[(dict[wd],topic) for wd,topic in c[2][:2]])
    print('--------------------------------------------------------------\n')

Document topics            : [(0, 0.010456186), (1, 0.9684362)]
Word id, topic             : [(215, [1]), (219, [1]), (984, [1])]
Word id, topic & PHI val   : [(215, [(1, 0.99979466)]), (219, [(1, 0.99985415)]), (984, [(1, 0.99980354)])]
word, topic                : [('certainly', [1]), ('change', [1])]
word, PHI Value            : [('certainly', [(1, 0.99979466)]), ('change', [(1, 0.99985415)])]
--------------------------------------------------------------

Document topics            : [(6, 0.97004974)]
Word id, topic             : [(155, [6]), (227, [6]), (937, [6])]
Word id, topic & PHI val   : [(155, [(6, 0.99985087)]), (227, [(6, 0.9998393)]), (937, [(6, 0.99975485)])]
word, topic                : [('best', [6]), ('chief', [6])]
word, PHI Value            : [('best', [(6, 0.99985087)]), ('chief', [(6, 0.9998393)])]
--------------------------------------------------------------

Document topics            : [(0, 0.024208438), (1, 0.017531896), (2, 0.013670664), (3, 0.011184189), (

In [142]:
#[(dict[i],count) for i,count in corpus[5]]

# <span style='color:blue'>LSI : Latent Semantic Information</span>
- Similar to LDA except using LsiModel

In [143]:
from gensim.models import LsiModel

In [144]:
lsi_model = LsiModel(corpus,num_topics=7,id2word=dict,decay=0.5)

2020-08-02 11:43:22,509 :INFO :using serial LSI version on this node
2020-08-02 11:43:22,513 :INFO :updating model with new documents
2020-08-02 11:43:22,648 :INFO :preparing a new chunk of documents
2020-08-02 11:43:22,803 :INFO :using 100 extra samples and 2 power iterations
2020-08-02 11:43:22,805 :INFO :1st phase: constructing (40130, 107) action matrix
2020-08-02 11:43:22,936 :INFO :orthonormalizing (40130, 107) action matrix
2020-08-02 11:43:26,763 :INFO :2nd phase: running dense svd on (107, 111) matrix
2020-08-02 11:43:27,164 :INFO :computing the final decomposition
2020-08-02 11:43:27,193 :INFO :keeping 7 factors (discarding 62.873% of energy spectrum)
2020-08-02 11:43:27,410 :INFO :processed documents up to #111
2020-08-02 11:43:27,616 :INFO :topic #0(973.796): -0.262*"also" + -0.197*"state" + -0.197*"american" + -0.178*"first" + -0.151*"many" + -0.149*"time" + -0.147*"year" + -0.130*"person" + -0.130*"world" + -0.124*"war"
2020-08-02 11:43:27,620 :INFO :topic #1(572.318): -0

In [145]:
print(lsi_model.print_topics(-1))

2020-08-02 11:43:37,546 :INFO :topic #0(973.796): -0.262*"also" + -0.197*"state" + -0.197*"american" + -0.178*"first" + -0.151*"many" + -0.149*"time" + -0.147*"year" + -0.130*"person" + -0.130*"world" + -0.124*"war"
2020-08-02 11:43:37,551 :INFO :topic #1(572.318): -0.937*"agave" + -0.164*"asia" + -0.100*"aruba" + -0.063*"plant" + -0.053*"var" + -0.052*"state" + -0.045*"east" + -0.044*"congress" + 0.042*"first" + -0.041*"maguey"
2020-08-02 11:43:37,555 :INFO :topic #2(401.785): -0.507*"american" + -0.180*"football" + -0.179*"player" + -0.168*"war" + -0.150*"british" + 0.140*"also" + -0.114*"ball" + -0.110*"day" + 0.107*"atheism" + 0.106*"god"
2020-08-02 11:43:37,559 :INFO :topic #3(334.148): 0.362*"apollo" + -0.248*"lincoln" + -0.211*"state" + 0.172*"player" + 0.151*"football" + -0.127*"union" + 0.125*"ball" + -0.124*"government" + 0.116*"moon" + -0.116*"jews"
2020-08-02 11:43:37,562 :INFO :topic #4(322.190): 0.363*"atheism" + 0.334*"god" + 0.329*"lincoln" + 0.230*"apollo" + 0.215*"ath

[(0, '-0.262*"also" + -0.197*"state" + -0.197*"american" + -0.178*"first" + -0.151*"many" + -0.149*"time" + -0.147*"year" + -0.130*"person" + -0.130*"world" + -0.124*"war"'), (1, '-0.937*"agave" + -0.164*"asia" + -0.100*"aruba" + -0.063*"plant" + -0.053*"var" + -0.052*"state" + -0.045*"east" + -0.044*"congress" + 0.042*"first" + -0.041*"maguey"'), (2, '-0.507*"american" + -0.180*"football" + -0.179*"player" + -0.168*"war" + -0.150*"british" + 0.140*"also" + -0.114*"ball" + -0.110*"day" + 0.107*"atheism" + 0.106*"god"'), (3, '0.362*"apollo" + -0.248*"lincoln" + -0.211*"state" + 0.172*"player" + 0.151*"football" + -0.127*"union" + 0.125*"ball" + -0.124*"government" + 0.116*"moon" + -0.116*"jews"'), (4, '0.363*"atheism" + 0.334*"god" + 0.329*"lincoln" + 0.230*"apollo" + 0.215*"atheist" + 0.143*"abraham" + -0.136*"island" + 0.132*"aristotle" + -0.124*"aluminium" + 0.119*"belief"'), (5, '-0.360*"apollo" + 0.344*"atheism" + -0.326*"lincoln" + 0.226*"god" + 0.205*"atheist" + 0.139*"american" 

# <span style='color:blue'>Word2Vec : Train using W2V using gensim</span>
- USe prebuilt model like word2vec, fasttext, GloVe, ConceptNet : Built using large corpuses using wikipedia, googleNews etc.
- FOr specialized document - viz Technical document, TRAIN model
- Below TRAIN word embedding using gensim and self corpus

In [146]:
from gensim.models.word2vec import Word2Vec
from multiprocessing import cpu_count
import gensim.downloader as api

In [147]:
#download dataset
dataset=api.load('text8')

In [148]:
api.info('text8')

{'checksum': '68799af40b6bda07dfa47a32612e5364',
 'description': 'First 100,000,000 bytes of plain text from Wikipedia. Used for testing purposes; see wiki-english-* for proper full Wikipedia datasets.',
 'file_name': 'text8.gz',
 'file_size': 33182058,
 'license': 'not found',
 'num_records': 1701,
 'parts': 1,
 'read_more': ['http://mattmahoney.net/dc/textdata.html'],
 'reader_code': 'https://github.com/RaRe-Technologies/gensim-data/releases/download/text8/__init__.py',
 'record_format': 'list of str (tokens)'}

In [149]:
df7=pd.DataFrame(dataset)

In [None]:
#df7.tail()

In [150]:
data =[d for d in dataset]

In [151]:
# Split dataset into two part 
data_part1 = data[:1000]
data_part2 = data[1000:]

In [152]:
np.shape(data_part1)

(1000, 10000)

In [154]:
np.shape(data_part2[:-1])

(700, 10000)

In [155]:
np.shape(data_part2[700])

(5207,)

In [156]:
#Train W2V with default vector size = 100
model=Word2Vec(data_part1,min_count=0,workers=cpu_count())

2020-08-02 12:01:01,292 :INFO :collecting all words and their counts
2020-08-02 12:01:01,553 :INFO :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-08-02 12:01:04,755 :INFO :collected 189074 word types from a corpus of 10000000 raw words and 1000 sentences
2020-08-02 12:01:04,786 :INFO :Loading a fresh vocabulary
2020-08-02 12:01:25,614 :INFO :effective_min_count=0 retains 189074 unique words (100% of original 189074, drops 0)
2020-08-02 12:01:25,615 :INFO :effective_min_count=0 leaves 10000000 word corpus (100% of original 10000000, drops 0)
2020-08-02 12:01:26,395 :INFO :deleting the raw counts dictionary of 189074 items
2020-08-02 12:01:26,401 :INFO :sample=0.001 downsamples 38 most-common words
2020-08-02 12:01:26,403 :INFO :downsampling leaves estimated 7563517 word corpus (75.6% of prior 10000000)
2020-08-02 12:01:27,162 :INFO :estimated required memory for 189074 words and 100 dimensions: 245796200 bytes
2020-08-02 12:01:27,164 :INFO :resetting layer weigh

In [157]:
model['topic']

  """Entry point for launching an IPython kernel.


array([-1.5262312 ,  0.409382  , -1.4726707 ,  0.39713287,  0.24458115,
        0.6964405 , -0.6542572 , -0.28446203, -0.4563818 ,  0.12563562,
       -0.27262893, -0.2628879 , -0.58937395,  1.076456  , -0.14841628,
        0.9316121 , -1.0665156 , -0.7353373 ,  0.30655625, -1.6143473 ,
        0.42842793, -0.15731955,  1.9499857 ,  0.5797155 ,  1.3734457 ,
       -1.1049469 , -1.3376522 ,  0.6532739 , -0.69266707, -0.92454493,
        0.19630088, -0.28022358,  0.7154724 ,  0.9280978 ,  0.19606432,
        0.12112555, -0.05990876,  0.7903815 , -0.45332834,  0.12762949,
       -0.14184141, -0.00800085,  0.2833234 ,  0.2683099 ,  0.45947027,
        0.07656071,  0.48083988, -0.66036373,  0.62744874,  0.8435816 ,
       -0.49784535,  1.6581378 , -0.5192989 , -0.4286661 ,  0.29243273,
       -1.150825  ,  0.42950436,  0.0577182 ,  0.32688102, -0.21003073,
       -0.89452416,  0.23006344, -0.42895955, -0.82256824, -0.50821704,
        0.6943391 , -0.28252715,  0.3059836 ,  0.24493313, -0.08

In [158]:
model.most_similar('subject')

  """Entry point for launching an IPython kernel.
2020-08-02 12:07:30,591 :INFO :precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('topic', 0.6691651344299316),
 ('question', 0.631722629070282),
 ('issue', 0.6240805983543396),
 ('debate', 0.6190211772918701),
 ('matter', 0.5940889120101929),
 ('contrary', 0.5872217416763306),
 ('opinion', 0.5860634446144104),
 ('interpretation', 0.5841245055198669),
 ('moral', 0.5747615098953247),
 ('irrelevant', 0.5735539197921753)]

In [159]:
model.save('newmodel')

2020-08-02 12:07:41,307 :INFO :saving Word2Vec object under newmodel, separately None
2020-08-02 12:07:41,341 :INFO :storing np array 'vectors' to newmodel.wv.vectors.npy
2020-08-02 12:07:42,411 :INFO :not storing attribute vectors_norm
2020-08-02 12:07:42,447 :INFO :storing np array 'syn1neg' to newmodel.trainables.syn1neg.npy
2020-08-02 12:07:43,787 :INFO :not storing attribute cum_table
2020-08-02 12:07:44,508 :INFO :saved newmodel


In [161]:
model=Word2Vec.load('newmodel')

2020-08-02 12:11:59,637 :INFO :loading Word2Vec object from newmodel
2020-08-02 12:12:00,181 :INFO :loading wv recursively from newmodel.wv.* with mmap=None
2020-08-02 12:12:00,182 :INFO :loading vectors from newmodel.wv.vectors.npy with mmap=None
2020-08-02 12:12:00,250 :INFO :setting ignored attribute vectors_norm to None
2020-08-02 12:12:00,251 :INFO :loading vocabulary recursively from newmodel.vocabulary.* with mmap=None
2020-08-02 12:12:00,253 :INFO :loading trainables recursively from newmodel.trainables.* with mmap=None
2020-08-02 12:12:00,255 :INFO :loading syn1neg from newmodel.trainables.syn1neg.npy with mmap=None
2020-08-02 12:12:00,323 :INFO :setting ignored attribute cum_table to None
2020-08-02 12:12:00,325 :INFO :loaded newmodel


In [162]:
corpus1=model.corpus_count

In [163]:
corpus1

1000

In [164]:
len(model.wv.vocab)

189074

# <span style='color:red'>Update existing Word2Vec Model with new data set</span>
__Steps involved :-__
- update dictionary by calling build_vocabulary
- Train the model with sentences 


In [165]:
model.build_vocab(data_part2,update=True)

2020-08-02 12:20:33,233 :INFO :collecting all words and their counts
2020-08-02 12:20:33,236 :INFO :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-08-02 12:20:35,013 :INFO :collected 153347 word types from a corpus of 7005207 raw words and 701 sentences
2020-08-02 12:20:35,014 :INFO :Updating model with new vocabulary
2020-08-02 12:20:36,029 :INFO :New added 153347 unique words (50% of original 306694) and increased the count of 153347 pre-existing words (50% of original 306694)
2020-08-02 12:20:37,270 :INFO :deleting the raw counts dictionary of 153347 items
2020-08-02 12:20:37,275 :INFO :sample=0.001 downsamples 72 most-common words
2020-08-02 12:20:37,277 :INFO :downsampling leaves estimated 10509051 word corpus (150.0% of prior 7005207)
2020-08-02 12:20:37,881 :INFO :estimated required memory for 306694 words and 100 dimensions: 398702200 bytes
2020-08-02 12:20:37,882 :INFO :updating layer weights


In [166]:
len(model.wv.vocab)

253854

In [167]:
model.corpus_count

701

In [168]:
model.train(data_part2,total_examples=model.corpus_count,epochs=model.iter)

  """Entry point for launching an IPython kernel.
2020-08-02 12:35:54,130 :INFO :training model with 4 workers on 253854 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-08-02 12:35:55,138 :INFO :EPOCH 1 - PROGRESS: at 15.26% examples, 805367 words/s, in_qsize 8, out_qsize 0
2020-08-02 12:35:56,142 :INFO :EPOCH 1 - PROGRESS: at 32.24% examples, 850520 words/s, in_qsize 7, out_qsize 0
2020-08-02 12:35:57,156 :INFO :EPOCH 1 - PROGRESS: at 48.36% examples, 840897 words/s, in_qsize 6, out_qsize 1
2020-08-02 12:35:58,162 :INFO :EPOCH 1 - PROGRESS: at 65.34% examples, 853114 words/s, in_qsize 7, out_qsize 0
2020-08-02 12:35:59,167 :INFO :EPOCH 1 - PROGRESS: at 81.74% examples, 854649 words/s, in_qsize 7, out_qsize 0
2020-08-02 12:36:00,169 :INFO :EPOCH 1 - PROGRESS: at 98.15% examples, 855409 words/s, in_qsize 8, out_qsize 0
2020-08-02 12:36:00,265 :INFO :worker thread finished; awaiting finish of 3 more threads
2020-08-02 12:36:00,273 :INFO :worker thread f

(26273330, 35026035)

In [169]:
model.most_similar('subject')

  """Entry point for launching an IPython kernel.
2020-08-02 12:37:02,070 :INFO :precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('topic', 0.6607025265693665),
 ('matter', 0.6202406883239746),
 ('question', 0.5902384519577026),
 ('nature', 0.5512784719467163),
 ('scope', 0.5502288937568665),
 ('discussion', 0.5417620539665222),
 ('debate', 0.5377275347709656),
 ('perception', 0.5320144891738892),
 ('validity', 0.526340126991272),
 ('intent', 0.5252469182014465)]

In [170]:
# Use Sood's WA text msgs
#tokenlized_wa

In [172]:
model.build_vocab(tokenlized_wa,update=True)

2020-08-02 12:37:23,130 :INFO :collecting all words and their counts
2020-08-02 12:37:23,133 :INFO :PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2020-08-02 12:37:23,191 :INFO :collected 340 word types from a corpus of 637 raw words and 11 sentences
2020-08-02 12:37:23,193 :INFO :Updating model with new vocabulary
2020-08-02 12:37:23,196 :INFO :New added 340 unique words (50% of original 680) and increased the count of 340 pre-existing words (50% of original 680)
2020-08-02 12:37:23,227 :INFO :deleting the raw counts dictionary of 340 items
2020-08-02 12:37:23,229 :INFO :sample=0.001 downsamples 206 most-common words
2020-08-02 12:37:23,231 :INFO :downsampling leaves estimated 905 word corpus (142.2% of prior 637)
2020-08-02 12:37:23,787 :INFO :estimated required memory for 680 words and 100 dimensions: 884000 bytes
2020-08-02 12:37:23,788 :INFO :updating layer weights


In [173]:
len(model.wv.vocab)

253885

In [174]:
model.corpus_count

11

In [175]:
model.train(tokenlized_wa,total_examples=model.corpus_count,epochs=model.iter)

  """Entry point for launching an IPython kernel.
2020-08-02 12:37:55,759 :INFO :training model with 4 workers on 253885 vocabulary and 100 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-08-02 12:37:55,766 :INFO :worker thread finished; awaiting finish of 3 more threads
2020-08-02 12:37:55,768 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 12:37:55,771 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 12:37:55,773 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 12:37:55,775 :INFO :EPOCH - 1 : training on 637 raw words (460 effective words) took 0.0s, 43110 effective words/s
2020-08-02 12:37:55,784 :INFO :worker thread finished; awaiting finish of 3 more threads
2020-08-02 12:37:55,786 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 12:37:55,787 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 12:37:55,832 :INFO :worker thread finishe

(2280, 3185)

In [None]:
#model.train(data_part2,total_examples=model.corpus_count,epochs=model.iter)

In [176]:
model.similar_by_word('subject')

  """Entry point for launching an IPython kernel.
2020-08-02 12:38:30,647 :INFO :precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('topic', 0.6607025265693665),
 ('matter', 0.6202406883239746),
 ('question', 0.5902384519577026),
 ('nature', 0.5512784719467163),
 ('scope', 0.5502288937568665),
 ('discussion', 0.5417620539665222),
 ('debate', 0.5377275347709656),
 ('perception', 0.5320144891738892),
 ('validity', 0.526340126991272),
 ('intent', 0.5252469182014465)]

In [177]:
model.similar_by_word('jawaharlal')

  """Entry point for launching an IPython kernel.
  if np.issubdtype(vec.dtype, np.int):


[('motilal', 0.8182438015937805),
 ('rajiv', 0.7659848928451538),
 ('sardar', 0.74310302734375),
 ('priyanka', 0.7360767126083374),
 ('zulfikar', 0.730406641960144),
 ('karmal', 0.7269328236579895),
 ('sanjay', 0.720064640045166),
 ('varun', 0.7084603905677795),
 ('mohandas', 0.7075860500335693),
 ('sonia', 0.7064446806907654)]

# <span style='color:red'>Extract word ector using pre trained using W2V and fast trained model</span>

In [178]:
fasttext_model1300 =api.load('fasttext-wiki-news-subwords-300')

2020-08-02 12:40:03,529 :INFO :loading projection weights from C:\Users\Niraj Kumar\gensim-data\fasttext-wiki-news-subwords-300\fasttext-wiki-news-subwords-300.gz
2020-08-02 12:46:25,536 :INFO :loaded (999999, 300) matrix from C:\Users\Niraj Kumar\gensim-data\fasttext-wiki-news-subwords-300\fasttext-wiki-news-subwords-300.gz


In [179]:
word2vec_model1300=api.load('word2vec-google-news-300')

2020-08-02 12:46:33,952 :INFO :loading projection weights from C:\Users\Niraj Kumar\gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz
2020-08-02 12:52:09,096 :INFO :loaded (3000000, 300) matrix from C:\Users\Niraj Kumar\gensim-data\word2vec-google-news-300\word2vec-google-news-300.gz


In [180]:
word2vec_model1300.most_similar('dog')

2020-08-02 13:43:33,493 :INFO :precomputing L2-norms of word weight vectors
  if np.issubdtype(vec.dtype, np.int):


[('dogs', 0.8680490255355835),
 ('puppy', 0.8106428384780884),
 ('pit_bull', 0.780396044254303),
 ('pooch', 0.7627376317977905),
 ('cat', 0.7609457969665527),
 ('golden_retriever', 0.7500901818275452),
 ('German_shepherd', 0.7465174198150635),
 ('Rottweiler', 0.7437615394592285),
 ('beagle', 0.7418621778488159),
 ('pup', 0.7406911253929138)]

# <span style='color:red'>Document to Vector: Doc2Vec</span>
- Two popular method : a. PV-DM (paragraph vector distributed memory) b. DBOW: Distributed bag of word
- PV-DM : Pragraph id + context words ==> Summarzier/average/Concatenate ==> Classifier ==> Target Word .This way we learn weights for Paragraph id and words
- DBOW : Paragraph id ==> context words, Learn document vecor by sampling words from the document  
- __Document Vector is not simply the average of words vectors in the document__


__Workflow:-__
- Create training data set by calling function that takes corpus and number to tag the document
- This tagged dataset is the tarining set
- Call doc2vec and build the model by running through this training dataset

In [181]:
#Exercise
# Read from Text8 and train Doc2Vec

In [186]:
data = [d for d in dataset]

In [189]:
#Create tagged dataset for training the model
def create_tagged_document(list_of_list_of_words):
    for i , list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words,[i])

In [190]:
train_data=list(create_tagged_document(data))

In [196]:
#print(train_data[:1])

In [198]:
#Now train the doc2vec model
# 1. Intialize Doc2Vec mode;
# 2. Build vocabulary
# 3. Train the model

model=gensim.models.doc2vec.Doc2Vec(vector_size=50,min_count=2,epochs=40)



In [199]:
model.build_vocab(train_data)

2020-08-02 16:31:32,127 :INFO :collecting all words and their counts
2020-08-02 16:31:32,130 :INFO :PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2020-08-02 16:31:35,917 :INFO :collected 253854 word types and 1701 unique tags from a corpus of 1701 examples and 17005207 words
2020-08-02 16:31:35,984 :INFO :Loading a fresh vocabulary
2020-08-02 16:37:42,393 :INFO :effective_min_count=2 retains 135335 unique words (53% of original 253854, drops 118519)
2020-08-02 16:37:42,395 :INFO :effective_min_count=2 leaves 16886688 word corpus (99% of original 17005207, drops 118519)
2020-08-02 16:37:43,323 :INFO :deleting the raw counts dictionary of 253854 items
2020-08-02 16:37:43,330 :INFO :sample=0.001 downsamples 37 most-common words
2020-08-02 16:37:43,332 :INFO :downsampling leaves estimated 12689806 word corpus (75.1% of prior 16886688)
2020-08-02 16:37:44,282 :INFO :estimated required memory for 135335 words and 50 dimensions: 122141700 bytes
2020-08-02 16:37:44,316

In [200]:
model.train(train_data,total_examples=model.corpus_count,epochs=model.epochs)

2020-08-02 16:39:31,856 :INFO :training model with 3 workers on 135335 vocabulary and 50 features, using sg=0 hs=0 sample=0.001 negative=5 window=5
2020-08-02 16:39:35,390 :INFO :EPOCH 1 - PROGRESS: at 4.82% examples, 607826 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:36,396 :INFO :EPOCH 1 - PROGRESS: at 10.46% examples, 655463 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:39:37,400 :INFO :EPOCH 1 - PROGRESS: at 16.75% examples, 701219 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:38,417 :INFO :EPOCH 1 - PROGRESS: at 22.69% examples, 711368 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:39,418 :INFO :EPOCH 1 - PROGRESS: at 28.98% examples, 730092 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:40,423 :INFO :EPOCH 1 - PROGRESS: at 34.86% examples, 733877 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:41,436 :INFO :EPOCH 1 - PROGRESS: at 41.21% examples, 742839 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:39:42,439 :INFO :EPOCH 1 - PROGRESS: at 46.97% examples, 74172

2020-08-02 16:40:40,275 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:40:40,276 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:40:40,290 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:40:40,291 :INFO :EPOCH - 4 : training on 17005207 raw words (12692621 effective words) took 16.8s, 755425 effective words/s
2020-08-02 16:40:41,309 :INFO :EPOCH 5 - PROGRESS: at 6.00% examples, 748610 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:40:42,320 :INFO :EPOCH 5 - PROGRESS: at 12.11% examples, 755259 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:40:43,328 :INFO :EPOCH 5 - PROGRESS: at 18.28% examples, 762098 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:40:44,335 :INFO :EPOCH 5 - PROGRESS: at 24.34% examples, 763090 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:40:45,344 :INFO :EPOCH 5 - PROGRESS: at 30.45% examples, 766094 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:40:46,345 :INFO :EPOCH 5 - 

2020-08-02 16:41:43,575 :INFO :EPOCH 8 - PROGRESS: at 59.20% examples, 680413 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:41:44,575 :INFO :EPOCH 8 - PROGRESS: at 65.20% examples, 687137 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:41:45,577 :INFO :EPOCH 8 - PROGRESS: at 70.96% examples, 690682 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:41:46,582 :INFO :EPOCH 8 - PROGRESS: at 77.01% examples, 695041 words/s, in_qsize 6, out_qsize 1
2020-08-02 16:41:47,590 :INFO :EPOCH 8 - PROGRESS: at 83.01% examples, 698673 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:41:48,591 :INFO :EPOCH 8 - PROGRESS: at 89.01% examples, 702603 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:41:49,601 :INFO :EPOCH 8 - PROGRESS: at 95.24% examples, 707172 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:41:50,369 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:41:50,379 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:41:50,389 :INFO :worker thread 

2020-08-02 16:42:45,668 :INFO :EPOCH 12 - PROGRESS: at 29.86% examples, 752041 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:42:46,680 :INFO :EPOCH 12 - PROGRESS: at 35.04% examples, 735894 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:47,691 :INFO :EPOCH 12 - PROGRESS: at 40.51% examples, 729176 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:48,693 :INFO :EPOCH 12 - PROGRESS: at 45.91% examples, 723966 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:49,697 :INFO :EPOCH 12 - PROGRESS: at 51.50% examples, 722375 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:50,702 :INFO :EPOCH 12 - PROGRESS: at 57.44% examples, 725466 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:51,714 :INFO :EPOCH 12 - PROGRESS: at 63.20% examples, 725495 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:52,721 :INFO :EPOCH 12 - PROGRESS: at 69.37% examples, 729934 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:42:53,735 :INFO :EPOCH 12 - PROGRESS: at 75.66% examples, 733830 words/s, in_qsize 5, out_

2020-08-02 16:43:48,773 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:43:48,774 :INFO :EPOCH - 15 : training on 17005207 raw words (12691700 effective words) took 16.5s, 766915 effective words/s
2020-08-02 16:43:49,786 :INFO :EPOCH 16 - PROGRESS: at 6.17% examples, 773311 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:43:50,792 :INFO :EPOCH 16 - PROGRESS: at 12.23% examples, 765794 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:43:51,798 :INFO :EPOCH 16 - PROGRESS: at 18.40% examples, 769628 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:43:52,803 :INFO :EPOCH 16 - PROGRESS: at 24.10% examples, 758462 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:43:53,809 :INFO :EPOCH 16 - PROGRESS: at 29.57% examples, 746173 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:43:54,812 :INFO :EPOCH 16 - PROGRESS: at 34.27% examples, 722095 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:43:55,815 :INFO :EPOCH 16 - PROGRESS: at 39.09% examples, 706511 words/s, in_qsize 6, o

2020-08-02 16:44:55,907 :INFO :EPOCH 18 - PROGRESS: at 81.07% examples, 349856 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:44:56,913 :INFO :EPOCH 18 - PROGRESS: at 86.42% examples, 360607 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:44:57,914 :INFO :EPOCH 18 - PROGRESS: at 91.42% examples, 369282 words/s, in_qsize 4, out_qsize 1
2020-08-02 16:44:58,916 :INFO :EPOCH 18 - PROGRESS: at 97.06% examples, 379829 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:44:59,391 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:44:59,396 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:44:59,406 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:44:59,407 :INFO :EPOCH - 18 : training on 17005207 raw words (12692486 effective words) took 32.9s, 385379 effective words/s
2020-08-02 16:45:00,415 :INFO :EPOCH 19 - PROGRESS: at 6.11% examples, 772055 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:45:01,417 :INFO :EPOC

2020-08-02 16:45:58,198 :INFO :EPOCH 22 - PROGRESS: at 55.97% examples, 787754 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:45:59,205 :INFO :EPOCH 22 - PROGRESS: at 62.26% examples, 788289 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:46:00,208 :INFO :EPOCH 22 - PROGRESS: at 68.43% examples, 787683 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:46:01,222 :INFO :EPOCH 22 - PROGRESS: at 74.60% examples, 786725 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:46:02,228 :INFO :EPOCH 22 - PROGRESS: at 81.13% examples, 788141 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:46:03,229 :INFO :EPOCH 22 - PROGRESS: at 87.36% examples, 788193 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:46:04,245 :INFO :EPOCH 22 - PROGRESS: at 93.77% examples, 788859 words/s, in_qsize 6, out_qsize 0
2020-08-02 16:46:05,253 :INFO :EPOCH 22 - PROGRESS: at 99.88% examples, 787621 words/s, in_qsize 2, out_qsize 1
2020-08-02 16:46:05,255 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:4

2020-08-02 16:46:59,205 :INFO :EPOCH 26 - PROGRESS: at 29.81% examples, 753441 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:00,208 :INFO :EPOCH 26 - PROGRESS: at 35.86% examples, 756907 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:01,213 :INFO :EPOCH 26 - PROGRESS: at 42.15% examples, 762466 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:02,226 :INFO :EPOCH 26 - PROGRESS: at 48.44% examples, 766281 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:03,226 :INFO :EPOCH 26 - PROGRESS: at 54.67% examples, 769644 words/s, in_qsize 6, out_qsize 1
2020-08-02 16:47:04,234 :INFO :EPOCH 26 - PROGRESS: at 60.91% examples, 771287 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:05,243 :INFO :EPOCH 26 - PROGRESS: at 67.31% examples, 774417 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:06,244 :INFO :EPOCH 26 - PROGRESS: at 73.54% examples, 776100 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:47:07,256 :INFO :EPOCH 26 - PROGRESS: at 80.07% examples, 777958 words/s, in_qsize 5, out_

2020-08-02 16:48:04,473 :INFO :EPOCH 29 - PROGRESS: at 97.59% examples, 768842 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:48:04,842 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:48:04,843 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:48:04,849 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:48:04,851 :INFO :EPOCH - 29 : training on 17005207 raw words (12692084 effective words) took 16.5s, 769684 effective words/s
2020-08-02 16:48:05,866 :INFO :EPOCH 30 - PROGRESS: at 6.29% examples, 785594 words/s, in_qsize 4, out_qsize 1
2020-08-02 16:48:06,868 :INFO :EPOCH 30 - PROGRESS: at 12.52% examples, 785006 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:48:07,874 :INFO :EPOCH 30 - PROGRESS: at 18.87% examples, 789958 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:48:08,892 :INFO :EPOCH 30 - PROGRESS: at 25.22% examples, 790911 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:48:09,896 :INFO :EPOC

2020-08-02 16:49:06,955 :INFO :EPOCH 33 - PROGRESS: at 82.25% examples, 797346 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:49:07,960 :INFO :EPOCH 33 - PROGRESS: at 88.48% examples, 796633 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:49:08,969 :INFO :EPOCH 33 - PROGRESS: at 94.94% examples, 797479 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:49:09,760 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:49:09,765 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:49:09,769 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:49:09,770 :INFO :EPOCH - 33 : training on 17005207 raw words (12691502 effective words) took 15.9s, 797366 effective words/s
2020-08-02 16:49:10,777 :INFO :EPOCH 34 - PROGRESS: at 6.29% examples, 793308 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:49:11,782 :INFO :EPOCH 34 - PROGRESS: at 12.46% examples, 783494 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:49:12,787 :INFO :EPOC

2020-08-02 16:50:09,128 :INFO :EPOCH 37 - PROGRESS: at 68.43% examples, 784923 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:10,132 :INFO :EPOCH 37 - PROGRESS: at 74.60% examples, 784728 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:11,138 :INFO :EPOCH 37 - PROGRESS: at 80.83% examples, 783422 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:12,146 :INFO :EPOCH 37 - PROGRESS: at 86.89% examples, 781903 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:13,150 :INFO :EPOCH 37 - PROGRESS: at 93.24% examples, 783220 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:14,159 :INFO :EPOCH 37 - PROGRESS: at 99.53% examples, 783506 words/s, in_qsize 5, out_qsize 0
2020-08-02 16:50:14,220 :INFO :worker thread finished; awaiting finish of 2 more threads
2020-08-02 16:50:14,226 :INFO :worker thread finished; awaiting finish of 1 more threads
2020-08-02 16:50:14,227 :INFO :worker thread finished; awaiting finish of 0 more threads
2020-08-02 16:50:14,229 :INFO :EPOCH - 37 : training on 17005

In [213]:
# In order to get document vector of a sentence , pass the sentence in line of list of words to the infer_vector method
model.infer_vector(['Corona','virus','pandemic'])

array([-0.20169373, -0.22507694, -0.09382267, -0.00425628,  0.0994833 ,
       -0.02771945,  0.30235487, -0.02431474,  0.32105815,  0.1743709 ,
       -0.09962266,  0.08622926,  0.21271455, -0.10941842,  0.17956604,
        0.2929854 , -0.15737611, -0.20132913, -0.04100657,  0.10605428,
       -0.05084371,  0.11696529, -0.03186974, -0.00066347,  0.15038636,
       -0.23955227,  0.05984666, -0.16224216, -0.03229687,  0.11278835,
        0.2681371 , -0.13419876, -0.08148566, -0.0548914 ,  0.26457947,
       -0.06902724,  0.2113031 ,  0.03951471,  0.25166818, -0.13541122,
       -0.05450862, -0.02158979,  0.08941766,  0.08173756,  0.03355532,
       -0.04006654, -0.43827912,  0.18767165,  0.01921168, -0.10870411],
      dtype=float32)

# <span style='color:brown'>Compute similarity metrics like cosine similarity and soft cosine similarity</span>

In [218]:
from gensim.matutils import softcossim

In [252]:
sent_1 = 'Sachine is a cricket player and a opening batsman'.split()
sent_2 ='Dhoni is a cricket player too He is a batsman and keeper'.split()
sent_3 = 'Anand is a chess player'.split()
sent_4 = 'Who killed Sushant Singh Rajput'.split()

In [253]:
#Prepre the similarity matrix
similarity_matrix =fasttext_model1300.similarity_matrix(dictionary,tfidf=None,threshold=0.0,exponent=2.0,nonzero_limit=100)

2020-08-02 20:12:05,123 :INFO :constructing a term similarity matrix
2020-08-02 20:12:05,126 :INFO :PROGRESS: at 5.26% rows (1 / 19, 0 skipped, 5.263158% density)
  if np.issubdtype(vec.dtype, np.int):
2020-08-02 20:12:05,317 :INFO :constructed a term similarity matrix with 87.257618 % nonzero elements


In [254]:
documents=[sent_1,sent_2,sent_3,sent_4]

In [255]:
dictionary=corpora.Dictionary(documents)

2020-08-02 20:12:10,508 :INFO :adding document #0 to Dictionary(0 unique tokens: [])
2020-08-02 20:12:10,511 :INFO :built Dictionary(19 unique tokens: ['Sachine', 'a', 'and', 'batsman', 'cricket']...) from 4 documents (total 31 corpus positions)


In [256]:
#Convert bag of words
sent_1 = dictionary.doc2bow(sent_1)
sent_2 = dictionary.doc2bow(sent_2)
sent_3 = dictionary.doc2bow(sent_3)
sent_4 = dictionary.doc2bow(sent_4)

In [257]:
#compute soft cosine Similarity
print(softcossim(sent_1,sent_2,similarity_matrix))

0.8477188222533869


In [258]:
print(softcossim(sent_1,sent_3,similarity_matrix))

0.6847319615715306


In [259]:
print(softcossim(sent_2,sent_3,similarity_matrix))

0.7099691700919265


In [260]:
print(softcossim(sent_1,sent_4,similarity_matrix))

0.2127669278652476


# <span style='color:green'>Some useful similarities and distance metrics based on word embedding models fasttext, GloVe</span>

In [262]:
#dissimilar
print(fasttext_model1300.doesnt_match(['India','Australia','china','pakistan','beetroot']))

beetroot


  if np.issubdtype(vec.dtype, np.int):


In [266]:
#cosine distance between words
print(fasttext_model1300.distance('king','queen'))

0.22957539558410645


  if np.issubdtype(vec.dtype, np.int):


In [268]:
#Cosine distance between a word and series of words
print(fasttext_model1300.distances('king',['queen','man','woman']))

[0.22957546 0.465837   0.547001  ]


In [269]:
#Cosine Similarities
fasttext_model1300.cosine_similarities(fasttext_model1300['king'],vectors_all=(fasttext_model1300['queen'],fasttext_model1300['man'],
                                                                              fasttext_model1300['woman'],
                                                                               fasttext_model1300['queen']+fasttext_model1300['man']))

array([0.77042454, 0.534163  , 0.45299897, 0.76572555], dtype=float32)

In [None]:
#Get the words closer to w1 than w2
print(glov)