# GloVe
You may find an implementation of GloVe here:

https://pypi.org/project/glove-python-binary/ - requires Python 3.5 or above

https://github.com/maciejkula/glove-python


Ref:

https://medium.com/analytics-vidhya/word-vectorization-using-glove-76919685ee0b

In [None]:
!pip install glove-python-binary



#There are two major tasks in GloVe

*   creating a co-occurrence matrix from the corpus, 
*   using it to produce the embeddings

#These are achieved by two classes:

1.   Corpus :  Given a corpus, it constructs vocabulary and co-occurrence matrix

2.   Glove : trains the embeddings



In [None]:
from glove import Corpus, Glove

#Pre-processing Functions

In [None]:
%run /content/gdrive/MyDrive/NLP2022/Text_Representation/preprocess.ipynb

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


#Load Corpus

In [None]:
path = r'/content/gdrive/MyDrive/NLP2022/Text_Representation'

In [None]:
# The Mahabharata corpus is taken from the following website:
# https://www.sacred-texts.com/hin/maha/
# Each text is read from <path>/mb/maha<i>.txt for i = 1..18.
data = []
n = 19  # exclusive upper bound: the 18 texts of the Mahabharata are numbered 1..18
for i in range(1, n):
    path1 = path + "/mb/maha" + str(i) + '.txt'
    # Use a context manager so every file handle is closed right after it is
    # read, instead of leaving 18 handles open for the life of the kernel.
    with open(path1) as file:
        lines = file.read()
    # preprocess() is provided by preprocess.ipynb; its result is iterated
    # later as a sequence of sentence strings.
    doc = preprocess(lines)
    data.append(doc)

In [None]:
len(data)

18

In [None]:
# Flatten the per-text sentence collections into one corpus-wide list, kept in
# two parallel forms: the sentence strings and their whitespace-split tokens.
text = [sentence for book in data for sentence in book]
sent = [sentence.split() for sentence in text]

# Quick sanity check: total number of sentences and one tokenized sample.
print(len(sent))
print(sent[10000])

124015
['section', 'cciii', 'viduragamana', 'parva', 'continued', 'vaisampayana', 'said', 'dhritarashtra', 'replied', 'saying', 'i', 'desire', 'to', 'do', 'exactly', 'what', 'you', 'would', 'recommend']


In [None]:
type(text)

list

In [None]:
import pandas as pd
# One row per pre-processed sentence, held in a single "text" column.
df = pd.DataFrame(text,columns=["text"])

In [None]:
df

Unnamed: 0,text
0,the mahabharataofkrishna dwaipayana vyasabooka...
1,proofed at distributed proofing juliet suther...
2,additional proofing and formatting at sacred t...
3,that being so his chief duty is to represent ...
4,in regard to translations from the sanskrit n...
...,...
124010,after concluding a recitation of the bharata ...
124011,i have thus o chief of men told everything ...
124012,he that listens with devotion to this bharata ...
124013,destroying all his sins like the maker of day ...


In [None]:
# Persist the pre-processed sentences; by default pandas also writes the
# DataFrame index as the first CSV column.
df.to_csv("preprocessedMB.csv")

#Create the GloVe Model

The corpus.fit() takes two arguments:

1.   lines — the pre-processed text, given as a list of tokenized sentences
2.   window — the size of the context window

In [None]:
corpus = Corpus()  # Corpus object: constructs the vocabulary and the co-occurrence matrix

# Create the co-occurrence matrix from the tokenized sentences, using a
# context window of size 10.
corpus.fit(sent, window=10)

# corpus --> co-occurrence matrix (read afterwards as corpus.matrix)


The Glove() constructor takes:

1.   no_components — size of the word vectors that are created
2.   learning_rate — learning rate used when training the embeddings


In [None]:
# GloVe model object
# NOTE(review): the saved outputs further down show 30-dimensional vectors,
# which does not match no_components=25 — presumably they come from an earlier
# run with a different setting; confirm and re-run the notebook.
glove = Glove(no_components=25) #size of vectors

The glove.fit() takes:

1.   the co-occurrence matrix (corpus.matrix): the matrix of word-word co-occurrences
2.   epochs: number of times the dataset is processed
3.   no_threads: number of threads for parallel processing







In [None]:
# Run only once: this cell trains the embeddings and saves the model to disk.
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
glove.fit(corpus.matrix, epochs=50, no_threads=4)  # co-occurrence matrix --> word embeddings
# Attach the corpus dictionary to the model; later cells index
# glove.dictionary by word.
glove.add_dictionary(corpus.dictionary)
glove.save('glove.model')
end = time.perf_counter()
end - start  # elapsed seconds (training + saving), shown as the cell output

81.33157062530518

In [None]:
len(corpus.dictionary)

32138

In [None]:
glove.word_vectors[glove.dictionary['krishna']]

array([-2.91807084e-01, -3.32005124e-01,  6.31997769e-01,  3.15874387e-01,
       -4.37353848e-01,  4.29821910e-01,  2.06571078e-01, -4.75993005e-01,
        1.26553917e-01,  3.15851696e-01, -1.14847048e-01, -3.36544164e-01,
        3.74548087e-01, -4.00699518e-01,  3.25098067e-01,  1.77077153e-01,
        3.27180602e-01, -1.06984117e-01,  8.65871184e-02, -3.92025700e-01,
       -3.34928644e-01,  3.06306652e-01,  2.10976822e-01,  3.01185689e-01,
       -3.56836091e-01,  3.19020071e-01,  2.49654421e-01,  7.08883638e-05,
        4.45357491e-01,  2.19307910e-01])

In [None]:
glove.most_similar('king')

[('monarch', 0.919001331817927),
 ('yudhishthira', 0.9077564337658728),
 ('duryodhana', 0.9036964223178245),
 ('then', 0.8693812108159769)]

In [None]:
# Map every vocabulary word to its embedding vector. Iterating items() yields
# each word together with its row index, avoiding a second dictionary lookup.
D = {word: glove.word_vectors[index] for word, index in glove.dictionary.items()}

In [None]:
#D

In [None]:
D['arjuna']

array([-0.20000414, -0.15423234,  0.37347789,  0.33541933, -0.39752285,
        0.38975892,  0.43145379, -0.62951984,  0.0075359 ,  0.54347898,
       -0.32503328, -0.14352344,  0.25301757, -0.27988916,  0.35260083,
        0.34029142,  0.32079188, -0.11034939, -0.17756282, -0.19393753,
       -0.50417504,  0.13132983,  0.39275059,  0.6866579 , -0.2537895 ,
        0.73260225,  0.29672534,  0.27533914,  0.44794031,  0.1618586 ])

In [None]:
import pandas as pd
# Build a DataFrame from the word -> vector dict: each word becomes a column.
# This rebinds df, replacing the sentence DataFrame created earlier.
df = pd.DataFrame(D)

In [None]:
# Transpose so that each row is a word and each column an embedding dimension.
df = df.transpose()

In [None]:
df.sample(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29
yuvaraja,-0.090175,0.173788,-0.06062,-0.087723,0.042508,-0.110741,-0.148258,0.072733,-0.112113,-0.027217,0.168409,0.060113,0.035432,0.003615,-0.099589,-0.098474,-0.125295,0.099536,-0.11708,0.092391,0.178871,0.025299,-0.091153,-0.027803,0.041369,-0.206632,-0.075993,-0.092921,-0.112651,-0.085806
devah,-0.148272,0.094555,-0.034239,-0.088128,0.059334,-0.093166,-0.070045,0.051504,-0.142437,-0.002115,0.099959,0.087284,0.053985,0.01658,-0.123098,-0.086289,-0.119443,0.106606,-0.160906,0.077136,0.107124,0.04942,0.06918,-0.027654,0.024024,-0.083068,-0.033574,-0.113328,-0.123192,-0.072658
inclinations,0.132576,0.091669,-0.088153,0.130952,-0.041467,-0.001004,-0.168067,0.108301,-0.075147,-0.123997,0.119734,-0.101322,-0.243224,-0.199557,-0.128331,0.069724,-0.053246,-0.001616,0.043575,-0.069553,0.006491,0.166651,0.068408,-0.050124,-0.204322,-0.209814,0.086402,0.216513,0.038111,-0.035231
yajus,0.074357,0.207149,0.054827,-0.288221,0.05929,0.008097,-0.057913,0.036143,-0.097,-0.091508,0.120186,-0.013024,0.375026,0.216188,0.076051,0.029874,-0.054954,-0.046785,-0.153614,0.001973,0.050956,0.253311,0.042961,-0.244153,0.081546,-0.147467,-0.227786,0.172118,-0.001505,-0.003467
carefulness,0.082717,0.017598,-0.059997,0.076102,0.081508,-0.128355,-0.15802,0.131366,-0.218814,-0.08127,0.158615,0.113827,0.071927,0.170103,-0.160403,-0.204639,-0.180102,0.186638,-0.147061,0.16314,0.205324,-0.093402,0.017638,0.058605,0.091549,-0.084055,-0.10302,-0.002785,-0.225888,-0.086627
dharmarayana,-0.05954,0.093488,-0.08327,-0.046643,0.096761,-0.128482,-0.132529,0.139134,-0.05903,-0.038529,0.105993,0.084021,-0.000304,0.011583,-0.115119,-0.119034,-0.12998,0.13983,-0.081319,0.103443,0.136812,0.018951,-0.015566,0.039744,0.044977,-0.099056,-0.033423,-0.044085,-0.132494,-0.09302
dhatu,-0.101304,0.040685,-0.082835,0.046168,0.101298,-0.13223,-0.096962,0.114962,-0.059808,0.02963,0.051836,0.108415,0.045431,-0.03628,-0.108607,-0.122019,-0.118801,0.117372,-0.092043,0.13614,0.132405,-0.010769,0.006021,0.067204,0.08936,-0.068971,-0.017847,-0.102586,-0.146375,-0.113471
corroborated,-0.08039,0.08507,-0.126881,-0.072137,0.11181,-0.132046,-0.130534,0.132503,-0.130037,-0.019299,0.089463,0.122312,0.056974,0.048796,-0.113326,-0.141167,-0.131897,0.108509,-0.181657,0.163493,0.15838,-0.028679,-0.046763,-0.004251,0.059727,-0.137927,-0.05915,-0.126832,-0.164884,-0.100149
confident,0.204229,-0.020032,0.029914,-0.048278,0.05211,-0.089547,-0.056434,0.143117,-0.008825,-0.104845,0.096173,0.090288,-0.045591,0.022414,-0.003666,-0.120635,-0.142906,0.093067,-0.114709,0.125198,0.085968,0.127019,-0.022304,-0.237611,0.208408,-0.111387,0.016275,0.042068,-0.107131,-0.161583
mrityu,-0.033972,-0.198969,0.236953,-0.396861,-0.162922,0.09563,-0.264257,-0.043606,-0.23858,-0.192751,0.250104,-0.07127,0.363494,-0.198736,-0.163597,0.012363,-0.039439,0.089712,-0.213322,-0.171284,0.015213,0.390443,0.180253,-0.198672,0.109253,0.245038,-0.036233,-0.137464,0.070058,0.091131


In [None]:
# Write the word-embedding table to CSV (words are the row index).
# NOTE(review): the file name is spelled 'mb_golve.csv' — presumably a typo for
# "glove"; left unchanged because other code may read the file under this name.
df.to_csv('mb_golve.csv')

Save the dictionary to a file, then follow the same process as is used for a pre-trained model.