# Test word embeddings

In [7]:
import os
import string

import nltk
import pandas as pd
import numpy as np
import random

from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk import pos_tag, RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from collections import defaultdict
from nltk.corpus import wordnet

from sklearn.model_selection import KFold


nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to /home/mxdelmas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mxdelmas/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/mxdelmas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mxdelmas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Import & process data

Pour chaque phrase des paires de questions (question-pairs) :

1) Tokenization en utilisant *word_tokenize* de nltk. C'est le tokenizer par défaut de la librairie nltk.tokenize

2) Passage en minuscules (`lower()`)

3) POS tagging : on souhaite faire de la lemmatisation plutôt que du simple stemming. Cependant, pour pouvoir appliquer les algorithmes de lemmatisation, on a besoin de savoir si le mot est employé comme verbe, nom, adjectif, etc. dans la phrase. Il faut donc faire un pré-traitement de POS tagging.

4) Avant d'appliquer l'algorithme de lemmatisation, on retire les stop words. On a utilisé une liste personnalisée de stop words. J'ai notamment rajouté "need", "should", "would" et "n't". L'absence de "need", "should" et "would" est étrange, mais j'avais lu dans un article que beaucoup de limites avaient justement été identifiées dans les listes de stop words et que certaines sont plus ou moins complètes. J'ai rajouté "n't" à cause du processus de tokenization utilisé avec *word_tokenize*, qui découpe par exemple "wouldn't" en "would" + "n't".

5) On mappe les tags Treebank sur les tags WordNet. Le POS tagging réalisé avec nltk étiquette les tokens avec les tags du Treebank ; or on utilise ensuite le lemmatizer de WordNet, et il nous faut donc les tags WordNet, qui sont restreints à adjectif, nom, verbe et adverbe. On réalise donc un mapping.

6) Enfin on applique le lemmatizer

# Seed

In [None]:
def set_seed(seed: int):
    """
    Seed the pseudo-random generators used in this notebook so runs are repeatable.

    Only Python's ``random`` module and NumPy's global generator are seeded.

    Args:
        seed (:obj:`int`): The seed to set.
    """
    # Same seed for both generators, applied in the order random -> numpy
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

set_seed(1024)

In [None]:
def tokenize_sentense(sentense, stop_words, lemmatizer):
    """Tokenize, filter and lemmatize an input sentence.

    Steps, in order: ``word_tokenize``, lowercasing, POS tagging (done on the
    lowercased tokens), removal of stop words and single punctuation
    characters, mapping of the Treebank tags to WordNet tags, lemmatization.

    Args:
        sentense (str): the input sentence
        stop_words (iterable of str): the stop words to drop (a list, a set, ...)
        lemmatizer (WordNetLemmatizer): the WordNet lemmatizer

    Returns:
        tokens (list): the remaining tokens, lemmatized when their POS tag maps
            to a WordNet tag and left unchanged otherwise
    """

    # First letter of the Treebank tag -> WordNet POS. Any other tag has no
    # WordNet counterpart and the token is then kept without lemmatization.
    treebank_to_wordnet = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }

    # A set gives constant-time membership tests; the punctuation string is
    # added character by character, exactly like list(string.punctuation).
    excluded = frozenset(stop_words).union(string.punctuation)

    # tokenize, lowercase, then POS-tag the whole sentence (tagging happens
    # before filtering so the tagger still sees the stop words as context)
    tagged_tokens = pos_tag([token.lower() for token in word_tokenize(sentense)])

    tokens = []
    for token, treebank_tag in tagged_tokens:
        # remove stop words and punctuation
        if token in excluded:
            continue

        # lemmatization with pos-tagging
        wordnet_tag = treebank_to_wordnet.get(treebank_tag[:1])
        tokens.append(lemmatizer.lemmatize(token, wordnet_tag) if wordnet_tag else token)

    return tokens
    

def prepare_data_and_vocabulary(data, path_stop_words):
    """Tokenize every question pair (stop-word removal + lemmatization) and count tokens.

    Args:
        data (pd.Dataframe): the input panda dataframe, with the columns
            "question1" and "question2"
        path_stop_words (str): path to the stop word file (one stop word per line)

    Returns:
        dataset (list): one (token_list_q1, token_list_q2) tuple per row
        voc (dict): the vocabulary dict, token -> number of occurrences
    """

    # Read stop words; blank lines carry no stop word and are skipped.
    # Kept as a list because tokenize_sentense receives it as such.
    with open(path_stop_words, "r", encoding="utf-8") as stop_words_f:
        stop_words = [w.rstrip() for w in stop_words_f if w.strip()]

    # Init nltk lemmatizer
    lem = WordNetLemmatizer()

    # Initialize vocabulary
    voc = defaultdict(int)

    # Initialize dataset
    dataset = []

    # A missing question (NaN) becomes an empty string, hence an empty token
    # list, instead of crashing the tokenizer.
    questions_1 = data["question1"].fillna("").astype(str)
    questions_2 = data["question2"].fillna("").astype(str)

    # Iterate over sentences
    for question_1, question_2 in zip(questions_1, questions_2):
        tokens_s1 = tokenize_sentense(question_1, stop_words, lem)
        tokens_s2 = tokenize_sentense(question_2, stop_words, lem)

        dataset.append((tokens_s1, tokens_s2))

        # Increment voc
        for t in tokens_s1 + tokens_s2:
            voc[t] += 1

    return dataset, voc

def prepare_data_and_vocabulary_2(data):
    """Prepare the dataset but without stop word filtering or lemmatization.

    Each question is lowercased and split into runs of word characters.

    Args:
        data (pd.Dataframe): the input panda dataframe, with the columns
            "question1" and "question2"

    Returns:
        dataset (list): one (token_list_q1, token_list_q2) tuple per row
        voc (dict): the vocabulary dict, token -> number of occurrences
    """
    # Initialize vocabulary
    voc = defaultdict(int)

    # Initialize dataset
    dataset = []

    # Raw string: '\w' in a plain string literal is an invalid escape sequence
    tokenizer = RegexpTokenizer(r'\w+')

    # A missing question (NaN) becomes an empty string, hence an empty token
    # list, instead of raising on .lower().
    questions_1 = data["question1"].fillna("").astype(str)
    questions_2 = data["question2"].fillna("").astype(str)

    # Iterate over sentences
    for question_1, question_2 in zip(questions_1, questions_2):
        tokens_s1 = tokenizer.tokenize(question_1.lower())
        tokens_s2 = tokenizer.tokenize(question_2.lower())

        dataset.append((tokens_s1, tokens_s2))

        # Increment voc
        for t in tokens_s1 + tokens_s2:
            voc[t] += 1

    return dataset, voc


def create_sentense_vectors(tokens, embedding, d):
    """Stack the embedding vectors of a token list into one matrix.

    Args:
        tokens (list): the tokens of one sentence
        embedding: word -> vector lookup supporting ``in`` and ``[]`` (a dict
            for instance); it must hold a "[UNK]" entry as soon as one token
            is unknown
        d (int): dimension of the embedding vectors

    Returns:
        np.ndarray: array of shape (len(tokens), d); row i is the vector of
            tokens[i], or the "[UNK]" vector when the token is unknown
    """
    # np.empty is safe here: every row is overwritten in the loop below
    vectors = np.empty((len(tokens), d))

    for row, token in enumerate(tokens):
        # "[UNK]" is only looked up when actually needed
        vectors[row, :] = embedding[token] if token in embedding else embedding["[UNK]"]

    return vectors

def create_dataset_vectors(dataset, embedding, d):
    """Vectorize both token lists of every question pair.

    Args:
        dataset (list): question pairs; items 0 and 1 of each pair are token lists
        embedding: word -> vector lookup handed to create_sentense_vectors
        d (int): dimension of the embedding vectors

    Returns:
        list: one (matrix_q1, matrix_q2) tuple per question pair
    """
    return [
        tuple(create_sentense_vectors(pair[side], embedding, d) for side in (0, 1))
        for pair in dataset
    ]

# For test Fold

In [None]:
# Load the test fold and its duplicate labels
data = pd.read_csv("data/test/test_set.csv", index_col = False)
y = data["is_duplicate"].tolist()

# Build the tokenized pairs and the vocabulary.
# Variant WITH stop words removal and lemmatization:
# dataset, voc = prepare_data_and_vocabulary(data, "data/utils/stop_words.txt")
# Variant WITHOUT stop words removal and lemmatization (the one in use):
dataset, voc = prepare_data_and_vocabulary_2(data)

# Flag the pairs where at least one question has no token left
empty_q = [not (tokens_q1 and tokens_q2) for tokens_q1, tokens_q2 in dataset]
print("There are " + str(sum(empty_q)) + " pair of sentences when at least one has an empty tokenized representation. They will be removed.")

# Show each flagged pair next to its original dataframe row
for i, (pair, is_empty) in enumerate(zip(dataset, empty_q)):
    if is_empty:
        print(pair)
        print(data.loc[i])

# Drop the flagged pairs and their labels, keeping both lists aligned
dataset = [pair for pair, is_empty in zip(dataset, empty_q) if not is_empty]
y = [label for label, is_empty in zip(y, empty_q) if not is_empty]

# GloVe

In [None]:
def load_glove_model(file, d):
    """Load GloVe vectors from a text file and add a "[UNK]" fallback vector.

    Every non-blank line must hold a word followed by ``d`` whitespace-separated
    float components. The "[UNK]" entry is the mean of all the loaded vectors
    (a zero vector when the file holds none).

    Args:
        file (str): path to the GloVe text file
        d (int): dimension of the embedding vectors

    Returns:
        glove_model (dict): word -> np.ndarray of shape (d,), plus "[UNK]"

    Raises:
        ValueError: if a line does not hold exactly ``d`` components
    """
    print("Loading Glove Model")
    glove_model = {}

    # For UNK token references: https://github.com/keras-team/keras/issues/12124
    vector_sum = np.zeros(d)
    n_vectors = 0

    # Explicit encoding: do not depend on the platform default
    with open(file, 'r', encoding='utf-8') as f:

        for line_number, line in enumerate(f, start=1):

            split_line = line.split()
            if not split_line:
                # blank line (e.g. trailing newline at the end of the file)
                continue

            word, values = split_line[0], split_line[1:]
            if len(values) != d:
                # Fail loudly rather than let numpy broadcast a wrong-sized vector
                raise ValueError(
                    f"line {line_number} of {file}: expected {d} components, got {len(values)}"
                )

            embedding = np.array(values, dtype=np.float64)
            glove_model[word] = embedding
            vector_sum += embedding
            n_vectors += 1

    # Add the [UNK] token as the mean vector; divide by the number of vectors
    # actually summed, and avoid 0/0 on an empty file
    n = len(glove_model)
    glove_model["[UNK]"] = vector_sum / n_vectors if n_vectors else vector_sum

    print(f"{n + 1} words loaded!")

    return glove_model

# Get Glove Embedding
# d is the embedding dimension; it is also reused below when vectorizing the dataset
d = 300
glove = load_glove_model("data/utils/glove.6B/glove.6B.300d.txt", d=d)

# Word2vec

In [None]:
from gensim.models import KeyedVectors

def create_w2v_subset_embedding(word2vec, voc):
    """Extract from a word2vec model the vectors of the vocabulary words.

    Args:
        word2vec: loaded word2vec vectors; the code reads its ``vectors`` and
            ``key_to_index`` attributes and indexes it by word
        voc (iterable of str): the vocabulary words (e.g. the keys of the voc dict)

    Returns:
        dict: word -> vector for every vocabulary word known to ``word2vec``,
            plus a "[UNK]" entry holding the mean of all the model vectors
    """
    known_words = word2vec.key_to_index

    # "[UNK]" goes in first, then the known vocabulary words in iteration order
    subset = {"[UNK]": word2vec.vectors.mean(axis=0)}
    subset.update((word, word2vec[word]) for word in voc if word in known_words)

    return subset

# Load the pretrained word2vec vectors from their binary file
w2v = KeyedVectors.load_word2vec_format('data/utils/Word2Vec/GoogleNews-vectors-negative300.bin', binary = True)

# Keep only the vectors of the words found in the dataset vocabulary, plus a "[UNK]" entry
subset_word2vec = create_w2v_subset_embedding(w2v, voc)

# Load embeddings

In [None]:
# Turn every tokenized question pair into a pair of GloVe vector matrices
dataset_glove = create_dataset_vectors(dataset, glove, d=d)

# Same with the word2vec subset.
# NOTE(review): "vord2vec" is a typo for "word2vec"; the name is kept because it is reused further down.
dataset_vord2vec = create_dataset_vectors(dataset, subset_word2vec, d=d)

# Pooling and Pairwise distance

In [None]:
def avg_pooling(tokens_vectors):
    """Pool a matrix of token vectors into one vector by averaging over axis 0."""
    pooled = np.mean(tokens_vectors, axis=0)
    return pooled

def pairwise_distance(dataset, pooling_fn):
    """Compute, for every question pair, the distance between its two pooled vectors.

    Args:
        dataset (list): question pairs; items 0 and 1 of each pair are the
            token-vector arrays of the two questions
        pooling_fn (callable): reduces one token-vector array to a single vector

    Returns:
        list: one ``np.linalg.norm`` value (norm of the difference of the two
            pooled vectors) per question pair
    """
    return [
        np.linalg.norm(pooling_fn(pair[0]) - pooling_fn(pair[1]))
        for pair in dataset
    ]

# Distance between the average-pooled vectors of the two questions of each pair, per embedding
glove_pairwise_dist = pairwise_distance(dataset_glove, avg_pooling)
word2vec_pairwise_dist = pairwise_distance(dataset_vord2vec, avg_pooling)

# One row per kept question pair: the distance and the duplicate label
df_glove = pd.DataFrame({"pwdist": glove_pairwise_dist, "y": y})
df_w2c = pd.DataFrame({"pwdist": word2vec_pairwise_dist, "y": y})

# Write both tables to CSV without the dataframe index
df_glove.to_csv("data/test/Glove/pwdist_no_stopwordsLemming_CV.csv", index=False)
df_w2c.to_csv("data/test/Word2Vec/pwdist_no_stopwordsLemming_CV.csv", index=False)