# <font color='red'>compute_embeddings.ipynb</font>

<br><b>Filename: compute_embeddings.ipynb</b> ---> <font color='purple'>defines the implementation pipeline for computing word embeddings for any given input text. The approach uses pre-trained 50-dimensional GloVe vectors (<code>glove.6B.50d.txt</code>), which were trained on a Wikipedia-based corpus comprising 6B tokens.</font>
<hr/>
This notebook defines the following functions (they are described here in the same order in which they are defined in the notebook cells below):
<ol>
    <li><b>glove_embedding( cat ): </b> Given the list of strings ( can be keywords, phrases or complete sentences ), this function returns the corresponding feature representation.</li>
    <ul>
        <li>For single-word inputs, the function simply looks the word up in the vector list.</li>
        <li>For any input with >= 2 words, it computes their separate embeddings and aggregates them (average) to obtain a single n-dimensional (n=50 in this case) vector for the entire input.</li>
    </ul>
    <br>
    <li><b>embedding_model( cat ):</b> The driver function for computing feature representation for given input.</li>
</ol>

<img src='images/glove.png'>


### CELL #1: importing required modules

In [2]:
import numpy as np
import pandas as pd
import scipy as sp
import nltk
from scipy import spatial
#from sklearn.manifold import TSNE
import re

### CELL #2: defining glove_embedding( cat ):
<br>Function description in the top cell
<br>This function does the following sequence of operations:
<ol>
    <li>Import the pre-trained word vectors text file and convert it into a dictionary.</li>
    <li>Create a dataframe to store the string and its corresponding embedding.</li>
    <li>For each input string:</li>
    <ol>
        <li>Remove unnecessary spaces from the string.</li>
        <li>Word tokenize the string</li>
        <li>If the string contains a single word, simply look the word up in the vector file.</li>
        <li>If this single word is not in the vector file, replace the word with the token <code>none</code> and use the embedding of <code>none</code>.</li>
        <li>If more than one word in the string, obtain embedding for each individual word and average them.</li>
        <li>If any of these words is not present in the vector file, ignore the word.</li>
        <li>Update the dataframe with the embedding for the string.</li>
    </ol>
    <li>Return these computed results</li>
</ol>

In [None]:
def _load_glove_vectors(vectors_path):
    """Parse a GloVe-style text file into a ``{word: float32 vector}`` dict.

    Each non-blank line is expected to hold a token followed by its
    whitespace-separated vector components.
    """
    embeddings_dict = {}
    with open(vectors_path, 'r', encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:  # tolerate blank lines (e.g. a trailing newline)
                continue
            embeddings_dict[values[0]] = np.asarray(values[1:], "float32")
    return embeddings_dict


def _lookup_vector(embeddings_dict, token):
    """Return the vector for ``token`` or ``None`` if it is unknown.

    An exact match is tried first so previously working lookups are
    unchanged; the lowercase form is tried next because the glove.6B
    vocabulary is uncased.
    """
    vector = embeddings_dict.get(token)
    if vector is None:
        vector = embeddings_dict.get(token.lower())
    return vector


def glove_embedding(cat,
                    vectors_path="pre_trained_vectors/glove.6B.50d.txt",
                    output_path="output_files/vectors_for_svm.csv"):
    """Compute one embedding vector per input string.

    Parameters
    ----------
    cat : sequence of str
        Keywords, phrases or sentences. The sequence is NOT modified.
    vectors_path : str, optional
        Path of the pre-trained word-vector text file.
    output_path : str or None, optional
        CSV file the resulting table is written to. Pass ``None`` to skip
        writing.

    Returns
    -------
    pandas.DataFrame
        Columns ``'word'`` (the input with runs of spaces collapsed) and
        ``'embedding'`` (a float32 numpy vector), one row per input, in
        input order.

    Notes
    -----
    * A single known word gets its own vector.
    * For several words, the vectors of the known words are averaged and
      unknown words are ignored.
    * If no word of the input is known (including an empty string), the
      row's word is replaced with ``'none'`` and the vector of the token
      ``'none'`` is used, so no row ever carries a NaN embedding.

    Raises
    ------
    KeyError
        If the fallback token ``'none'`` is needed but missing from the
        vector file.
    """
    #---------------------------------------------------------------------- STEP-1
    embeddings_dict = _load_glove_vectors(vectors_path)

    #---------------------------------------------------------------------- STEP-3 STARTS HERE
    # Rows are collected in plain lists and the frame is built once at the
    # end; growing a DataFrame cell by cell is slow and fragile when the
    # cell value is itself an array.
    words = []
    embeddings = []
    for text in cat:
        text = re.sub(' +', ' ', text)  #---------------------------------- 3 A)
        tokens = text.split()  #------------------------------------------- 3 B)

        # 3 C), E), F): keep only the tokens that have a vector.
        found = []
        for token in tokens:
            vector = _lookup_vector(embeddings_dict, token)
            if vector is not None:
                found.append(vector)

        if not found:  #--------------------------------------------------- 3 D)
            # Nothing usable: same fallback for words and phrases alike.
            text = 'none'
            embedding = embeddings_dict['none']
        elif len(found) == 1:
            embedding = found[0]
        else:
            embedding = np.mean(found, axis=0)

        words.append(text)  #---------------------------------------------- 3 G)
        embeddings.append(embedding)
    #---------------------------------------------------------------------- STEP-3 ENDS HERE

    #---------------------------------------------------------------------- STEP-2
    # dtype=object keeps every vector as a single cell instead of letting
    # pandas try to expand the arrays.
    cat_embeddings = pd.DataFrame(
        {
            'word': pd.Series(words, dtype=object),
            'embedding': pd.Series(embeddings, dtype=object),
        },
        columns=['word', 'embedding'],
    )

    if output_path is not None:
        cat_embeddings.to_csv(output_path)
    return cat_embeddings #---------------------------- STEP-4

### CELL #3: defining embedding_model( cat ):
<br>Driver function for computing feature representations for given list of strings.
<br>Function description in the top cell


In [None]:
def embedding_model(cat):
    """Driver: compute feature representations for a list of strings.

    Prints a progress banner, hands ``cat`` to ``glove_embedding`` and
    returns whatever that call returns, after printing a completion banner.
    """
    start_banner = "------------------ COMPUTING FEATURE REPRESENTATIONS........"
    done_banner = "------------------ REPRESENTATIONS SUCCESSFULLY COMPUTED AND STORED!"

    print(start_banner)
    representations = glove_embedding(cat)
    print(done_banner)

    return representations