# Evaluating Roman-Urdu Models

This notebook performs both qualitative (PCA and t-SNE plots) and quantitative (Spearman's correlation) analysis of the various word embedding models we trained.

# References: 
1. https://web.stanford.edu/class/cs224n/materials/Gensim%20word%20vector%20visualization.html
2. https://raw.githubusercontent.com/devmount/GermanWordEmbeddings/master/visualize.py

#### Hiding warnings

In [0]:
import logging
import warnings

# Fetch matplotlib's axes logger through the public logging registry rather
# than importing the private `_log` attribute from `matplotlib.axes._axes`.
# Loggers are singletons per name, so this is the same object as long as
# matplotlib names that logger after its module (the usual
# `logging.getLogger(__name__)` convention).
matplotlib_axes_logger = logging.getLogger('matplotlib.axes._axes')
matplotlib_axes_logger.setLevel('ERROR')

# NOTE(review): this silences every Python warning for the rest of the
# session, including deprecation warnings from gensim / scikit-learn.
warnings.filterwarnings("ignore")

#### Colab-specific statements

In [0]:
# Pick the project root (`base`) depending on where the notebook runs.
try:
    from google.colab import drive
except ImportError:
    # google.colab is not importable, so we are not on Colab: use the local
    # Google Drive folder instead.
    base = 'C:/Users/Ali/Google Drive/FYP/'
else:
    # On Colab: mount Google Drive. A mount failure is now raised instead of
    # being swallowed and silently replaced by the local Windows path.
    drive.mount('/content/drive/')

    base = '/content/drive/My Drive/FYP/'

## Loading Models

#### Defining paths

In [0]:
import os

def _under_base(relative_path):
    """Return `relative_path` joined onto the project root `base`."""
    return os.path.join(base, relative_path)


# Trained embedding models / exported vectors, all located under `base`.
word2vec_cbow_path = _under_base('Models/Word2Vec/cbow-roman_urdu/word2vec_cbow-roman_urdu')
word2vec_sg_path = _under_base('Models/Word2Vec/sg-roman_urdu/word2vec_sg-roman_urdu')

glove_path = _under_base('Models/GloVe/glove-roman_urdu.txt')

fasttext_cbow_path = _under_base('Models/fastText/cbow-roman_urdu/fasttext_cbow-roman_urdu')
fasttext_sg_path = _under_base('Models/fastText/sg-roman_urdu/fasttext_sg-roman_urdu')

elmo_path = _under_base('Models/ELMo/roman_urdu/embeddings.txt')

bert_path = _under_base('Models/BERT/roman_urdu/embeddings.txt')

# Evaluation datasets: kept relative here and joined with `base` at the
# point of use in the quantitative-analysis cells.
wordsim_path = 'Evaluation/wordsim353-roman_urdu.tsv'
simlex_path = 'Evaluation/simlex999-roman_urdu.tsv'

#### Loading Word2Vec

In [0]:
from gensim.models import Word2Vec

# Only the word vectors (`.wv`) of each saved Word2Vec model are kept.
word2vec_cbow = Word2Vec.load(word2vec_cbow_path).wv
word2vec_sg = Word2Vec.load(word2vec_sg_path).wv

#### Loading GloVe

In [0]:
from gensim.models.keyedvectors import KeyedVectors

# The GloVe vectors are read as a text (non-binary) word2vec-format file.
glove = KeyedVectors.load_word2vec_format(fname=glove_path, binary=False)

#### Loading fastText

In [0]:
from gensim.models import FastText

# Only the word vectors (`.wv`) of each saved fastText model are kept.
fasttext_cbow = FastText.load(fasttext_cbow_path).wv
fasttext_sg = FastText.load(fasttext_sg_path).wv

#### Loading ELMo

In [None]:
# The ELMo embeddings are read as a text (non-binary) word2vec-format file.
elmo = KeyedVectors.load_word2vec_format(fname=elmo_path, binary=False)

#### Loading BERT

In [None]:
# The BERT embeddings are read as a text (non-binary) word2vec-format file.
bert = KeyedVectors.load_word2vec_format(fname=bert_path, binary=False)

## Displaying PCA Plots

### Functions

In [0]:
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

%matplotlib inline

def display_word_pairs_plot(model, words, title='', filename=''):
    """Scatter-plot a list of word pairs after projecting them to 2-D with PCA.

    Parameters
    ----------
    model : mapping from word to vector (indexed as ``model[word]``).
    words : list of str
        Flat list in which consecutive entries (0-1, 2-3, ...) form a pair.
        A trailing unpaired word is plotted but gets no arrow.
    title : str
        Figure title.
    filename : str
        When non-empty, the figure is also saved there as a 300 dpi PNG.
    """
    projected = PCA(n_components=2).fit_transform([model[token] for token in words])

    figure = plt.figure(figsize=(10, 10))
    figure.suptitle(title, fontsize=20)
    plt.scatter(projected[:, 0], projected[:, 1], c='g')

    # Label every point, nudged slightly up and to the right of its marker.
    for label, coords in zip(words, projected):
        plt.text(coords[0] + 0.05, coords[1] + 0.05, s=label)

    # One grey arrow per pair, from the first word towards the second.
    arrow_style = {
        'shape': 'full',
        'lw': 0.1,
        'edgecolor': '#bbbbbb',
        'facecolor': '#bbbbbb',
        'length_includes_head': True,
        'head_width': 0.08,
        'width': 0.01,
    }
    for source, target in zip(projected[0::2], projected[1::2]):
        # Pull both ends in by 0.04 along x so the arrow does not start or
        # end exactly on the point markers.
        tail_x = source[0] + 0.04
        tail_y = source[1]
        head_x = target[0] - 0.04
        head_y = target[1]
        plt.arrow(tail_x, tail_y, head_x - tail_x, head_y - tail_y, **arrow_style)

    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')

### Word Lists

In [0]:
# Each list is flat: consecutive entries (0-1, 2-3, ...) form one pair, which
# display_word_pairs_plot connects with an arrow.

# country, capital
# NOTE(review): 'bghdad' looks like a misspelling of 'baghdad' — confirm
# against the models' vocabulary before changing it.
countries = [
    'pakistan', 'islamabad',
    'england', 'london',
    'afghanistan', 'kabul',
    'japan', 'tokyo',
    'iraq', 'bghdad',
]

synonyms = [
    'hans', 'muskura',
    'dilkash', 'khoobsurat',
    'khuda', 'parvardigaar',
]

antonyms = [
    'hansna', 'rona',
    'baithna', 'chalna',
    'shaam', 'subha',
]

# singular, plural
sing_plu = [
    'beta', 'betay',
    'beti', 'betiyan',
]

### Word2Vec Plots

#### Countries

In [0]:
# Country/capital pairs for both Word2Vec variants (skip-gram, then CBOW).
display_word_pairs_plot(
    model=word2vec_sg,
    words=countries,
    title='Word2Vec-SG Roman-Urdu - Countries',
    filename='word2vec_sg-roman_urdu-countries.png',
)
display_word_pairs_plot(
    model=word2vec_cbow,
    words=countries,
    title='Word2Vec-CBOW Roman-Urdu - Countries',
    filename='word2vec_cbow-roman_urdu-countries.png',
)

#### Synonyms

In [0]:
# Synonym pairs for both Word2Vec variants (skip-gram, then CBOW).
display_word_pairs_plot(
    model=word2vec_sg,
    words=synonyms,
    title='Word2Vec-SG Roman-Urdu - Synonyms',
    filename='word2vec_sg-roman_urdu-synonyms.png',
)
display_word_pairs_plot(
    model=word2vec_cbow,
    words=synonyms,
    title='Word2Vec-CBOW Roman-Urdu - Synonyms',
    filename='word2vec_cbow-roman_urdu-synonyms.png',
)

### GloVe Plots

#### Countries

In [0]:
# Country/capital pairs for GloVe.
display_word_pairs_plot(
    model=glove,
    words=countries,
    title='GloVe Roman-Urdu - Countries',
    filename='glove-roman_urdu-countries.png',
)

#### Synonyms

In [0]:
# Synonym pairs for GloVe.
display_word_pairs_plot(
    model=glove,
    words=synonyms,
    title='GloVe Roman-Urdu - Synonyms',
    filename='glove-roman_urdu-synonyms.png',
)

### fastText Plots

#### Countries

In [0]:
# Country/capital pairs for both fastText variants (skip-gram, then CBOW).
display_word_pairs_plot(
    model=fasttext_sg,
    words=countries,
    title='FastText-SG Roman-Urdu - Countries',
    filename='fasttext_sg-roman_urdu-countries.png',
)
display_word_pairs_plot(
    model=fasttext_cbow,
    words=countries,
    title='FastText-CBOW Roman-Urdu - Countries',
    filename='fasttext_cbow-roman_urdu-countries.png',
)

#### Synonyms

In [0]:
# Synonym pairs for both fastText variants (skip-gram, then CBOW).
display_word_pairs_plot(fasttext_sg, synonyms, 'FastText-SG Roman-Urdu - Synonyms', 'fasttext_sg-roman_urdu-synonyms.png')
# The output name now uses `roman_urdu` (underscore) like every other saved
# figure in this notebook; it previously read `roman-urdu`.
display_word_pairs_plot(fasttext_cbow, synonyms, 'FastText-CBOW Roman-Urdu - Synonyms', 'fasttext_cbow-roman_urdu-synonyms.png')

### ELMo Plots

#### Countries

In [None]:
# Country/capital pairs for ELMo.
display_word_pairs_plot(
    model=elmo,
    words=countries,
    title='ELMo Roman-Urdu - Countries',
    filename='elmo-roman_urdu-countries.png',
)

### BERT Plots

#### Countries

In [None]:
# Country/capital pairs for BERT.
display_word_pairs_plot(
    model=bert,
    words=countries,
    title='BERT Roman-Urdu - Countries',
    filename='bert-roman_urdu-countries.png',
)

## Displaying TSNE Scatter Plots

### Functions

In [0]:
#https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d

import matplotlib.cm as cm
import numpy as np
from sklearn.manifold import TSNE

def build_clusters(model, words, perp, topn=10, random_state=None):
    """Return 2-D t-SNE coordinates and word clusters for the given seed words.

    For every seed word the `topn` most similar words are fetched from
    `model`; their vectors are stacked, reduced to two dimensions with t-SNE
    and reshaped back into one group per seed word.

    Parameters
    ----------
    model : object exposing ``most_similar(word, topn=...)`` and ``model[word]``.
    words : list of str
        Seed words; one cluster is built per entry. The list is not modified.
    perp : float
        Perplexity passed to t-SNE.
    topn : int, optional
        Number of neighbours per seed word (default 10, the previous
        hard-coded value).
    random_state : int or None, optional
        Seed forwarded to t-SNE. The default ``None`` keeps the previous
        unseeded behaviour; pass an integer for reproducible plots.

    Returns
    -------
    (embeddings_2d, word_clusters)
        ``embeddings_2d`` has shape ``(len(words), topn, 2)``;
        ``word_clusters`` is a list of ``len(words)`` lists of neighbour words.
    """
    embedding_clusters = []
    word_clusters = []
    for word in words:
        # Use a distinct local name here: the original rebound `words` inside
        # this loop, shadowing the parameter that is being iterated.
        neighbours = [similar_word for similar_word, _ in model.most_similar(word, topn=topn)]
        word_clusters.append(neighbours)
        embedding_clusters.append([model[neighbour] for neighbour in neighbours])

    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape
    # NOTE(review): recent scikit-learn releases rename `n_iter` to
    # `max_iter` — confirm against the installed version.
    tsne_model_en_2d = TSNE(perplexity=perp, n_components=2, init='pca', n_iter=5000,
                            random_state=random_state)
    # t-SNE works on a flat (n * m, k) matrix; restore the per-seed grouping afterwards.
    embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)

    return embeddings_en_2d, word_clusters

def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, filename):
    """Display a scatter plot with one coloured cluster of similar words per label.

    Parameters
    ----------
    title : str
        Figure title.
    labels : list of str
        One legend entry per cluster.
    embedding_clusters : array indexed as ``cluster[:, 0]`` / ``cluster[:, 1]``
        2-D coordinates of the points, one group per label.
    word_clusters : list of list of str
        Words annotating the points of each cluster, in the same order.
    filename : str
        When non-empty, the figure is also saved there as a 300 dpi PNG. An
        empty string (the default used by display_cluster_plot) skips saving
        instead of passing an empty path to ``savefig``.
    """
    fig = plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` (not `c=`): a single RGBA row given via `c` is ambiguous
        # with per-point scalar values and makes matplotlib log a warning.
        plt.scatter(x, y, color=color, alpha=0.7, label=label)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.5, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=8)
    plt.legend(loc=4)
    fig.suptitle(title, fontsize=20)
    plt.grid(True)
    # Same guard as display_word_pairs_plot: only save when a name is given.
    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')
    plt.show()

def display_cluster_plot(model, words, title='', filename='', perp=35):
    """Build the similar-word clusters for `words` and plot them.

    `model`, `words` and `perp` are handed to build_clusters; the 2-D
    embeddings and word clusters it returns are then drawn by
    tsne_plot_similar_words, with `words` as the legend labels and `title` /
    `filename` passed through.
    """
    projected_clusters, neighbour_words = build_clusters(model, words, perp)
    tsne_plot_similar_words(
        title,
        words,
        projected_clusters,
        neighbour_words,
        filename,
    )

The seed words below are used to generate the clusters: for each seed word, its most similar words in the model form one cluster in the plot.

In [0]:
# Seed words: one cluster of similar words is plotted per entry.
words = [
    'mazhab',
    'khana',
    'mausam',
    'mohabbat',
    'pakistan',
    'allah',
    'maalik',
    'hukoomat',
    'shehar',
    'mohammad',
    'cricket',
    'musalman',
    'america',
    'baap',
    'subha',
]

#### Word2Vec Plots

In [0]:
# Cluster plots for both Word2Vec variants; CBOW uses perplexity 39 instead
# of the default.
display_cluster_plot(
    model=word2vec_sg,
    words=words,
    title='Word2Vec-SG Roman-Urdu - Clusters',
    filename='word2vec_sg-roman_urdu-clusters.png',
)
display_cluster_plot(
    model=word2vec_cbow,
    words=words,
    title='Word2Vec-CBOW Roman-Urdu - Clusters',
    filename='word2vec_cbow-roman_urdu-clusters.png',
    perp=39,
)

#### GloVe Plots

In [0]:
# Cluster plot for GloVe.
display_cluster_plot(
    model=glove,
    words=words,
    title='GloVe Roman-Urdu - Clusters',
    filename='glove-roman_urdu-clusters.png',
)

#### fastText Plots

In [0]:
# Cluster plots for both fastText variants (skip-gram, then CBOW).
display_cluster_plot(
    model=fasttext_sg,
    words=words,
    title='FastText-SG Roman-Urdu - Clusters',
    filename='fasttext_sg-roman_urdu-clusters.png',
)
display_cluster_plot(
    model=fasttext_cbow,
    words=words,
    title='FastText-CBOW Roman-Urdu - Clusters',
    filename='fasttext_cbow-roman_urdu-clusters.png',
)

#### ELMo Plots

In [None]:
# Cluster plot for ELMo.
display_cluster_plot(
    model=elmo,
    words=words,
    title='ELMo Roman-Urdu - Clusters',
    filename='elmo-roman_urdu-clusters.png',
)

#### BERT Plots

In [None]:
# Cluster plot for BERT.
display_cluster_plot(
    model=bert,
    words=words,
    title='BERT Roman-Urdu - Clusters',
    filename='bert-roman_urdu-clusters.png',
)

## Performing Quantitative Analysis using Spearman's Correlation

### Functions

In [0]:
def get_spearman_scores(evaluation_dataset, models=None):
    """Return Spearman's correlation coefficients for the given dataset.

    Parameters
    ----------
    evaluation_dataset : str
        Path of the word-pair file handed to each model's
        ``evaluate_word_pairs``.
    models : dict, optional
        Mapping of display name to an object exposing
        ``evaluate_word_pairs(path)``. When omitted, all seven models loaded
        earlier in this notebook are scored, in the original order and under
        the original names; they must already be defined in the notebook
        namespace.

    Returns
    -------
    dict
        Display name mapped to element 0 of the second value returned by
        ``evaluate_word_pairs`` (the Spearman result).
    """
    if models is None:
        models = {
            'Word2Vec CBOW': word2vec_cbow,
            'Word2Vec SG': word2vec_sg,
            'GloVe': glove,
            'fastText CBOW': fasttext_cbow,
            'fastText SG': fasttext_sg,
            'elmo': elmo,
            'bert': bert,
        }

    scores_dict = {}
    for name, vectors in models.items():
        # Only the middle element of the returned 3-tuple is used.
        _, spearman, _ = vectors.evaluate_word_pairs(evaluation_dataset)
        scores_dict[name] = spearman[0]

    return scores_dict

def display_scores(scores_dict):
    """Print one ``name: score`` line per entry, scores rounded to 3 decimals."""
    for model_name, coefficient in scores_dict.items():
        print("{}: {:.3f}".format(model_name, coefficient))

### WordSim-353

In [0]:
# Resolve the WordSim-353 file under `base`, score the models on it and print
# the resulting coefficients.
wordsim_file = os.path.join(base, wordsim_path)
wordsim_scores = get_spearman_scores(evaluation_dataset=wordsim_file)
display_scores(scores_dict=wordsim_scores)

### SimLex-999


In [0]:
# Resolve the SimLex-999 file under `base`, score the models on it and print
# the resulting coefficients.
simlex_file = os.path.join(base, simlex_path)
simlex_scores = get_spearman_scores(evaluation_dataset=simlex_file)
display_scores(scores_dict=simlex_scores)