# GloVe
You may find an implemention of GloVe here:

https://pypi.org/project/glove-python-binary/ - must be used in the Python 3.5 and above

https://github.com/maciejkula/glove-python


Ref:

https://medium.com/analytics-vidhya/word-vectorization-using-glove-76919685ee0b

In [1]:
#pip install glove-python-binary

#There two major tasks in GloVe

*   creating a co-occurrence matrix from the corpus, 
*   using it to produce the embeddings

#These are achieved by two classes:

1.   Corpus :  Given a corpus, it constructs vocabulary and co-occurrence matrix

2.   Glove : trains the embeddings



In [2]:
from glove import Corpus, Glove
import re
import glob
from nltk.tokenize import sent_tokenize
import string

#Pre-processing Functions

In [3]:
#%run /content/gdrive/MyDrive/NLP2022/Text_Representation/preprocess.ipynb

In [4]:
def preprocess(lines):
    lines = lines.lower()
    lines = lines.replace('\n',' ')
    lines = lines.replace("-"," ")
    p = string.punctuation.replace(".","")
    lines = lines.translate(str.maketrans('', '', p))
    lines = sent_tokenize(lines)
    lines = list(filter(None, lines))
    return lines

#Load Corpus

In [5]:
path = r'C:\spark\MCA\Semester1\E3_NLP\Lab_E8\mahabharat_english\\*'
list_file = glob.glob(path)
list_file

['C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha01.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha02.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha03.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha04.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha05.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha06.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha07.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha08.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha09.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha10.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha11.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\\maha12.txt',
 'C:\\spark\\MCA\\Semester1\\E3_NLP\\Lab_E8\\mahabharat_english\

In [6]:
#The Mahabharata corpus is taken form the following website:
#https://www.sacred-texts.com/hin/maha/ 
data=[]
n=19 #18 Texts of Mahabharatha
for file_path in list_file:
    file = open(file_path)
    lines = file.read()
    doc = lines
    doc = preprocess(lines)
    data.append(doc)

In [7]:
len(data)

18

In [8]:
sent= []
text =[]
for doc in data:
    for s in doc:
        sent.append(s.split()) 
        text.append(s)

In [9]:
print(len(sent))
print(sent[10000])

114760
['prativindhya', 'by', 'yudhishthira', 'sutasoma', 'by', 'vrikodara', 'srutakarman', 'by', 'arjuna', 'satanika', 'by', 'nakula', 'and', 'srutasena', 'by', 'sahadeva', 'these', 'were', 'the', 'five', 'heroes', 'and', 'great', 'warriors', 'that', 'panchali', 'brought', 'forth', 'like', 'aditi', 'bringing', 'forth', 'the', 'adityas.']


In [10]:
type(text)

list

In [11]:
import pandas as pd
df = pd.DataFrame(text,columns=["text"])

In [12]:
df

Unnamed: 0,text
0,the mahabharata of krishna dwaipayana vy...
1,additional proofing and formatting at sacred t...
2,translators preface the object of a translato...
3,that being so his chief duty is to represent s...
4,in regard to translations from the sanskrit no...
...,...
114755,after concluding a recitation of the bharata o...
114756,i have thus o chief of men told everything in ...
114757,he that listens with devotion to this bharata ...
114758,destroying all his sins like the maker of day ...


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114760 entries, 0 to 114759
Data columns (total 1 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    114760 non-null  object
dtypes: object(1)
memory usage: 896.7+ KB


In [14]:
df.to_csv("preprocessedMB.csv")

#Create the GloVe Model

The corpus.fit() takes two arguments:

1.   lines — text after pre-processing
2.   window — context window

In [15]:
corpus = Corpus() #Corpus Object

#create the co-occurrence matrix for text data with respect to a context window
corpus.fit(sent, window=10)

##corpus --> co-occ Mat


The Glove() constructor takes 

1.   no_of_components — size of the word vectors that are created
2.   learning_rate - machine learning parameter - learning rate


In [16]:
#Golve object
glove = Glove(no_components=25) #size of vectors

The glove.fit() takes:

1.   cooccurence_matrix: the matrix of word-word co-occurrences
2.   epochs: number of times the dataset is processed
3. no_of_threads: number of threads for parallel processing







In [17]:
#only once
import time
start = time.time()
glove.fit(corpus.matrix, epochs=50, no_threads=4)## co-occ --> word embeddings
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
end = time.time()
end-start

94.24503970146179

In [18]:
len(corpus.dictionary)

46791

In [19]:
glove.word_vectors[glove.dictionary['krishna']]

array([ 0.4778509 ,  0.14773281,  0.39837043, -0.3770924 , -0.28766537,
       -0.17131281, -0.41354778, -0.40933758,  0.47970575, -0.30716545,
        0.49147569, -0.53670212, -0.31391455, -0.35042678, -0.3389083 ,
       -0.28259091,  0.45590131, -0.12468991, -0.317167  ,  0.24706167,
        0.51958547, -0.14584829, -0.50549949, -0.18544229,  0.40084348])

In [20]:
glove.most_similar('king')

[('monarch', 0.9555552304184208),
 ('duryodhana', 0.9433193661788369),
 ('yudhishthira', 0.9222028845148091),
 ('bharata', 0.903510897698654)]

In [21]:
D = {word: glove.word_vectors[glove.dictionary[word]] for word in glove.dictionary.keys()}

In [22]:
#D

In [23]:
D['arjuna']

array([ 0.57699267,  0.36621157,  0.55405507, -0.56335129, -0.0650611 ,
        0.10685475, -0.81364381, -0.46516488,  0.30973729, -0.09798152,
        0.54853518, -0.39354963, -0.20064745, -0.40925738, -0.36333385,
       -0.15747619,  0.34456326, -0.00810636, -0.36407946,  0.4957972 ,
        0.50644452, -0.11737984, -0.25802467,  0.33445719,  0.42122452])

In [24]:
import pandas as pd
df = pd.DataFrame(D)

In [25]:
df = df.transpose()

In [26]:
df.sample(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,23,24
yavana,-0.044939,-0.095258,-0.097646,0.180051,0.155797,-0.285886,0.059299,0.037465,-0.097905,0.029441,...,0.160152,-0.062681,0.11712,0.06404,-0.063518,0.027999,0.21105,-0.068955,0.048399,-0.102791
exquisite,0.01549,-0.183916,0.122078,0.182846,-0.101313,-0.044686,0.161934,0.073581,-0.185812,0.103931,...,0.200986,-0.018348,0.25418,0.063273,-0.041993,0.055287,-0.043401,-0.003465,0.192534,-0.074508
pratisrotas,-0.098649,-0.110395,0.056684,0.071223,0.065958,-0.041973,-0.008777,0.153775,-0.054653,0.132646,...,0.042309,-0.102647,0.056865,0.154739,-0.124225,-0.100677,-0.025158,0.035445,0.066316,-0.118231
dlxviii,-0.121528,-0.103558,0.021556,0.148777,0.088002,-0.046783,-0.11297,0.16106,-0.03453,0.165765,...,-0.043224,-0.087877,0.079167,0.172218,-0.117246,-0.105481,-0.099671,0.021953,0.013451,-0.124036
abandoning,0.044545,0.167809,-0.262169,-0.176397,0.06155,-0.158869,-0.17472,-0.247173,0.126836,-0.296499,...,-0.032308,0.07554,0.095868,-0.246085,0.372745,0.087675,-0.103422,-0.135601,0.175592,0.288789
thirsty,-0.041761,0.065411,0.354773,-0.02413,-0.004028,0.02035,0.389434,-0.065159,0.099365,0.099425,...,0.151063,0.155451,0.116534,-0.08261,0.020003,0.081936,-0.1476,-0.141729,-0.067208,0.133219
1661.,-0.01234,-0.042875,0.020202,0.023086,0.016774,-0.040143,0.01934,0.027904,-0.000577,0.029923,...,-0.004301,-0.00684,0.01776,0.038821,-0.006405,-0.040565,0.040636,-0.00053,0.065142,-0.007408
karambhas,-0.09269,-0.078799,0.077762,0.117036,0.090188,0.012267,-0.082311,0.085237,-0.097816,0.098873,...,0.016286,-0.100723,-0.056833,0.097675,-0.118214,-0.087055,-0.062048,-0.013202,0.006834,-0.074448
yogis,-0.068003,-0.07092,-0.040563,0.052159,0.03882,0.019839,0.074766,0.139045,-0.030791,0.117465,...,-0.029618,-0.031111,0.092946,0.119906,-0.085155,-0.097439,0.101397,0.040428,-0.051095,-0.157071
hereto,-0.106055,-0.070734,0.065577,0.132028,0.065903,-0.005411,-0.16146,0.196275,-0.074539,0.088667,...,-0.024514,-0.092012,-0.032896,0.196904,-0.150236,-0.066463,-0.052547,0.002552,-0.061027,-0.174543


In [27]:
df.to_csv('mb_golve.csv')

Save the dictionary to file and follow the same process as used in pre-trained model