In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK data packages this notebook requests. "stopwords" backs the
# stop-word set and "wordnet" backs WordNetLemmatizer; "punkt" and
# "averaged_perceptron_tagger" are presumably for word_tokenize / pos_tag.
for _nltk_resource in (
    "punkt",
    "averaged_perceptron_tagger",
    "stopwords",
    "wordnet",
):
    nltk.download(_nltk_resource)

In [None]:
# English stop words held in a set, so the per-token `not in stop_words`
# check in remove_stop_words is a hash lookup rather than a list scan.
stop_words = set(stopwords.words("english"))

In [None]:
def lemmatization(text):
    """Lemmatize every whitespace-separated token of *text*.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with that
    method's default arguments, and the results are re-joined with single
    spaces (so runs of whitespace in the input collapse).
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, text.split())
    return " ".join(lemmas)

def remove_stop_words(text):
    """Drop every token of *text* that appears in the module-level ``stop_words``.

    Tokens are whitespace-separated and compared exactly as written (no case
    folding here); the survivors are joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)
                
def remove_numbers(text):
    """Remove tokens that consist solely of digit characters.

    Only whole tokens for which ``str.isdigit()`` is true are dropped;
    mixed tokens such as ``"abc123"`` or ``"3.5"`` are kept. The remaining
    tokens are joined with single spaces.
    """
    tokens = text.split()
    non_numeric = filter(lambda token: not token.isdigit(), tokens)
    return " ".join(non_numeric)

# Raw strings are used so that "\]" and "\s" are not treated as (invalid)
# string escape sequences, which newer Python versions warn about.
# The character set is ASCII punctuation (including the backslash) plus the
# Arabic comma and Arabic question mark.
_PUNCTUATION_RE = re.compile(
    "[%s]" % re.escape(r"""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~""")
)
_WHITESPACE_RUN_RE = re.compile(r"\s+")


def remove_punctuations(text):
    """Replace punctuation in *text* with spaces and collapse whitespace runs.

    Every punctuation character is first turned into a single space, then
    each run of whitespace is squeezed to one space. Leading and trailing
    whitespace is not stripped, so a sentence ending in punctuation comes
    back with one trailing space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = _PUNCTUATION_RE.sub(" ", text)
    return _WHITESPACE_RUN_RE.sub(" ", text)
    
                    
def lower_case(text):
    """Case-fold every whitespace-separated token of *text*.

    Uses ``str.casefold`` (more aggressive than ``lower``) and joins the
    tokens with single spaces, so surrounding and repeated whitespace is
    normalised as a side effect.
    """
    folded = map(str.casefold, text.split())
    return " ".join(folded)
                    

def normalize_text(data):
    """Run the text-cleaning pipeline over ``data.text`` and return *data*.

    The ``text`` column is overwritten on the object passed in (no copy is
    made). Steps run in this fixed order: case folding, stop-word removal,
    number removal, punctuation removal, lemmatization.
    """
    pipeline = (
        lower_case,
        remove_stop_words,
        remove_numbers,
        remove_punctuations,
        lemmatization,
    )
    for step in pipeline:
        data.text = data.text.apply(step)

    return data

In [None]:
def read_gloves(path="glove.6b.300d.txt"):
    """Load word vectors from a whitespace-separated text file.

    Every non-blank line must hold a token followed by its vector
    components. Words are numbered from 1 in file order; index 0 is never
    assigned.

    Args:
        path: File to read. Defaults to ``"glove.6b.300d.txt"`` in the
            current working directory, the location that used to be
            hard-coded.

    Returns:
        A tuple ``(word_to_vec, word_to_index, index_to_word)`` where
        ``word_to_vec`` maps each word to a ``float64`` NumPy array and the
        other two dicts map word -> index and index -> word.

    Raises:
        OSError: If *path* cannot be opened.
        ValueError: If a vector component is not a valid float.
    """
    word_to_vec = {}
    word_to_index = {}
    index_to_word = {}

    with open(path, "r", encoding="utf-8") as f:
        index = 0
        for line in f:
            fields = line.split()
            if not fields:
                # A blank line (e.g. a trailing newline at end of file) used
                # to raise IndexError on fields[0]; skip it instead.
                continue

            index += 1
            word = fields[0]
            word_to_vec[word] = np.array(fields[1:], dtype=np.float64)
            word_to_index[word] = index
            index_to_word[index] = word

    return word_to_vec, word_to_index, index_to_word

def get_word_beg(X_train, X_test=None):
    """Build a 1-based vocabulary index from the given sentences.

    Every whitespace-separated token is lower-cased and added to the
    vocabulary, together with the placeholder token ``"unk"``.

    Args:
        X_train: Iterable of sentence strings.
        X_test: Optional second iterable of sentence strings. When omitted,
            a module-level ``X_test`` is used if one exists (this is what
            the function used to read implicitly); otherwise only
            *X_train* contributes.

    Returns:
        A tuple ``(word_to_index, index_to_word)``. Indices start at 1 and
        follow the sorted order of the vocabulary.
    """
    if X_test is None:
        # Legacy behaviour: fall back to the notebook global rather than
        # raising NameError when it has not been defined.
        X_test = globals().get("X_test")
    corpora = [X_train] if X_test is None else [X_train, X_test]

    vocabulary = {"unk"}
    for corpus in corpora:
        for sentence in corpus:
            vocabulary.update(word.lower() for word in sentence.split())

    # Set iteration order for strings varies between interpreter runs, so
    # sort to give every word the same index each time.
    word_to_index = {
        word: index for index, word in enumerate(sorted(vocabulary), start=1)
    }
    index_to_word = {index: word for word, index in word_to_index.items()}

    return word_to_index, index_to_word
    
    
def x_to_indices(X, maxLen, word_to_indices):
    """Convert sentences to a zero-padded matrix of word indices.

    Row ``i`` holds the indices of the first ``maxLen`` tokens of ``X[i]``
    (tokens are lower-cased before lookup). Tokens missing from
    *word_to_indices* use the index stored under ``"unk"``; positions past
    the end of a sentence stay 0.

    Returns an integer array of shape ``(X.shape[0], maxLen)``.
    """
    n_sentences = X.shape[0]
    indices = np.zeros((n_sentences, maxLen))

    for row, sentence in enumerate(X):
        tokens = sentence.split()[:maxLen]
        for col, token in enumerate(tokens):
            key = token.lower()
            if key not in word_to_indices:
                key = "unk"
            indices[row, col] = word_to_indices[key]

    return indices.astype(int)


def one_hot_x(X_indices, word_to_index):
    """One-hot encode an array of word indices.

    The encoding width is ``len(word_to_index) + 1``. Index 0 maps to an
    all-zero vector; every other index ``k`` maps to the unit vector with a
    1 at position ``k``. The result has the shape of *X_indices* plus one
    trailing axis of that width.
    """
    width = len(word_to_index) + 1
    lookup = np.eye(width)
    # Blank out row 0 so that index 0 encodes as all zeros.
    lookup[0, :] = 0
    return lookup[X_indices]
        
        
def one_hot_y(Y, cls = 5):
    """One-hot encode integer class labels.

    Indexes a ``cls`` x ``cls`` identity matrix with *Y*, so each label
    becomes a length-``cls`` unit vector.

    NOTE(review): the default ``cls`` is 5, while ``get_label`` in this file
    defines six ids (0-5); a label of 5 is out of range with the default.
    Confirm whether callers are expected to pass ``cls=6``.
    """
    identity = np.eye(cls)
    return identity[Y]


def get_label():
    """Return the mapping from integer class id to emotion name."""
    emotions = ("sadness", "joy", "love", "anger", "fear", "surprise")
    return dict(enumerate(emotions))
    