# Evaluating Urdu Word Embeddings

In [None]:
# To filter out warnings
from matplotlib.axes._axes import _log as matplotlib_axes_logger
matplotlib_axes_logger.setLevel('ERROR')

import warnings
warnings.filterwarnings("ignore")

In [None]:
IN_COLAB = False

try:
    from google.colab import drive
    IN_COLAB = True

    !pip install arabic_reshaper python-bidi
    
    drive.mount('/drive/')
except:
    pass

## Loading Models

#### Defining paths

In [None]:
import os

# Root of the shared project folder: the mounted drive when IN_COLAB is set,
# otherwise the local Google Drive directory.
base = '/drive/My Drive/Shared/FYP/' if IN_COLAB else 'C:/Users/Ali/Google Drive/Shared/FYP/'

# Word2Vec models (CBOW and skip-gram).
word2vec_cbow_path = os.path.join(
    base, 'Models/Word2Vec/Ours/word2vec_urdu_cbow_500'
)
word2vec_sg_path = os.path.join(
    base, 'Models/Word2Vec/Ours/word2vec_urdu_sg_500'
)

# GloVe vectors stored as a text file.
glove_path = os.path.join(
    base, 'Models/GloVe/glove_urdu_500.txt'
)

# fastText models (CBOW and skip-gram).
fasttext_cbow_path = os.path.join(
    base, 'Models/fastText/urdu_cbow/fasttext_urdu_cbow_500'
)
fasttext_sg_path = os.path.join(
    base, 'Models/fastText/urdu_sg/fasttext_urdu_sg_500'
)

#### Loading Word2Vec

In [None]:
from gensim.models import Word2Vec

# Load each saved Word2Vec model and keep only its keyed vectors (`.wv`),
# which is all the rest of the notebook uses.
word2vec_cbow = Word2Vec.load(word2vec_cbow_path).wv
word2vec_sg = Word2Vec.load(word2vec_sg_path).wv

#### Loading GloVe

In [None]:
from gensim.models.keyedvectors import KeyedVectors

# The GloVe file is read as text (binary=False) in word2vec format.
glove = KeyedVectors.load_word2vec_format(
    glove_path,
    binary=False,
)

#### Loading fastText

In [None]:
from gensim.models import FastText

# Load each saved fastText model and keep only its keyed vectors (`.wv`).
fasttext_cbow = FastText.load(fasttext_cbow_path).wv
fasttext_sg = FastText.load(fasttext_sg_path).wv

## Displaying PCA Plots

In [None]:
#https://web.stanford.edu/class/cs224n/materials/Gensim%20word%20vector%20visualization.html
#https://raw.githubusercontent.com/devmount/GermanWordEmbeddings/master/visualize.py

import matplotlib.pyplot as plt
from arabic_reshaper import reshape
from bidi.algorithm import get_display
from sklearn.decomposition import PCA

%matplotlib inline

def display_pca_scatterplot(model, words, title='', filename=''):
    """Plot the vectors of `words` on their first two PCA components.

    Parameters
    ----------
    model : object indexable by word (``model[word]`` returns its vector).
    words : sequence of str
        Words to plot. Consecutive items (0-1, 2-3, ...) are treated as a
        pair and joined by an arrow; a trailing unpaired word gets no arrow.
    title : str
        Figure title.
    filename : str
        When non-empty, the figure is also saved there as a 300 dpi PNG.
    """
    projected = PCA(n_components=2).fit_transform([model[token] for token in words])

    fig = plt.figure(figsize=(10, 10))
    fig.suptitle(title, fontsize=20)
    plt.scatter(projected[:, 0], projected[:, 1], c='g')

    # Label every point. Each word goes through arabic_reshaper and the bidi
    # algorithm before drawing (presumably so the right-to-left script is
    # rendered correctly by matplotlib).
    for token, (px, py) in zip(words, projected):
        plt.text(px + 0.05, py + 0.05, s=get_display(reshape(token)))

    # Draw an arrow from the first to the second word of each pair, starting
    # and ending slightly off the points along the x axis.
    for first in range(0, len(words) - 1, 2):
        tail_x = projected[first][0] + 0.04
        tail_y = projected[first][1]
        head_x = projected[first + 1][0] - 0.04
        head_y = projected[first + 1][1]
        plt.arrow(
            tail_x,
            tail_y,
            head_x - tail_x,
            head_y - tail_y,
            shape='full',
            lw=0.1,
            edgecolor='#bbbbbb',
            facecolor='#bbbbbb',
            length_includes_head=True,
            head_width=0.08,
            width=0.01
        )

    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')

### Word Lists

In [None]:
# Word lists for the PCA plots. Each list is ordered so that consecutive items
# (index 0-1, 2-3, ...) form a pair: display_pca_scatterplot draws an arrow
# from the first word of each pair to the second.

# Country followed by its capital (England/London, Afghanistan/Kabul,
# Japan/Tokyo, Iraq/Baghdad).
countries = ['انگلینڈ', 'لنڈن', 'افغانستان', 'کابل', 'جاپان', 'ٹوکیو', 'عراق', 'بغداد']
# Pairs of words with similar meaning.
synonyms = ['ہنس', 'مسکرا', 'دلکش', 'خوبصورت', 'خدا', 'پروردگار']
# Pairs of words with opposite meaning.
# NOTE(review): not referenced by any plotting cell in this part of the notebook.
antonyms = ['ہنسنا', 'رونا', 'بیٹھنا' ,'چلنا', 'شام', 'صبح']
# Singular form followed by its plural (son/sons, daughter/daughters).
# NOTE(review): not referenced by any plotting cell in this part of the notebook.
sing_plu = ['بیٹا', 'بیٹے', 'بیٹی', 'بیٹیاں']

### Word2Vec Plots

#### Countries

In [None]:
# Country/capital pairs in both Word2Vec models.
display_pca_scatterplot(
    model=word2vec_cbow,
    words=countries,
    title='Word2Vec Urdu CBOW 500 - Countries',
    filename='word2vec_urdu_cbow_500_countries.png',
)
display_pca_scatterplot(
    model=word2vec_sg,
    words=countries,
    title='Word2Vec Urdu SG 500 - Countries',
    filename='word2vec_urdu_sg_500_countries.png',
)

#### Synonyms

In [None]:
# Synonym pairs in both Word2Vec models.
display_pca_scatterplot(
    model=word2vec_cbow,
    words=synonyms,
    title='Word2Vec Urdu CBOW 500 - Synonyms',
    filename='word2vec_urdu_cbow_500_synonyms.png',
)
display_pca_scatterplot(
    model=word2vec_sg,
    words=synonyms,
    title='Word2Vec Urdu SG 500 - Synonyms',
    filename='word2vec_urdu_sg_500_synonyms.png',
)

### GloVe Plots

#### Countries

In [None]:
# Country/capital pairs in the GloVe model. The output name follows the other
# GloVe artefacts ('glove_urdu_500...'); the previous name carried a stray
# 'sg' copied from the Word2Vec skip-gram call.
display_pca_scatterplot(glove, countries, 'GloVe Urdu 500 - Countries', 'glove_urdu_500_countries.png')

#### Synonyms

In [None]:
# Synonym pairs in the GloVe model. The output name follows the other GloVe
# artefacts ('glove_urdu_500...'); the previous name carried a stray 'sg'
# copied from the Word2Vec skip-gram call.
display_pca_scatterplot(glove, synonyms, 'GloVe Urdu 500 - Synonyms', 'glove_urdu_500_synonyms.png')

### fastText Plots

#### Countries

In [None]:
# Country/capital pairs in both fastText models.
display_pca_scatterplot(
    model=fasttext_cbow,
    words=countries,
    title='FastText Urdu CBOW 500 - Countries',
    filename='fasttext_urdu_cbow_500_countries.png',
)
display_pca_scatterplot(
    model=fasttext_sg,
    words=countries,
    title='FastText Urdu SG 500 - Countries',
    filename='fasttext_urdu_sg_500_countries.png',
)

#### Synonyms

In [None]:
# Synonym pairs in both fastText models.
display_pca_scatterplot(
    model=fasttext_cbow,
    words=synonyms,
    title='FastText Urdu CBOW 500 - Synonyms',
    filename='fasttext_urdu_cbow_500_synonyms.png',
)
display_pca_scatterplot(
    model=fasttext_sg,
    words=synonyms,
    title='FastText Urdu SG 500 - Synonyms',
    filename='fasttext_urdu_sg_500_synonyms.png',
)

## Displaying t-SNE Scatter Plots

In [None]:
#https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d

import matplotlib.cm as cm
import numpy as np
from sklearn.manifold import TSNE

def build_clusters(model, words, perp, topn=10, random_state=None):
    """Collect the nearest neighbours of each word and project them to 2-D.

    Parameters
    ----------
    model : object providing ``most_similar(word, topn=...)`` and ``model[word]``.
    words : iterable of str
        Seed words; each one yields one cluster made of its neighbours
        (the seed word itself is not added to the cluster).
    perp : float
        Perplexity passed to t-SNE.
    topn : int, optional
        Number of neighbours requested per seed word (default 10).
    random_state : int or None, optional
        Forwarded to ``TSNE`` so a layout can be reproduced. ``None`` keeps
        the previous unseeded behaviour.

    Returns
    -------
    (embeddings_2d, word_clusters)
        ``embeddings_2d`` has shape ``(len(words), topn, 2)``;
        ``word_clusters`` is a list with one list of neighbour words per seed.
    """
    embedding_clusters = []
    word_clusters = []

    for word in words:
        # A distinct local name is used here; the earlier version reassigned
        # `words` inside the loop, shadowing the parameter being iterated.
        neighbours = [similar_word for similar_word, _ in model.most_similar(word, topn=topn)]
        word_clusters.append(neighbours)
        embedding_clusters.append([model[neighbour] for neighbour in neighbours])

    # Shape (n seeds, m neighbours, k dimensions). The unpacking below relies
    # on every seed returning the same number of neighbours.
    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape

    # NOTE(review): recent scikit-learn releases rename `n_iter` to `max_iter`;
    # confirm against the installed version before upgrading.
    tsne_model_en_2d = TSNE(
        perplexity=perp,
        n_components=2,
        init='pca',
        n_iter=5000,
        random_state=random_state,
    )

    # t-SNE runs once over all neighbours together, then the result is folded
    # back into one 2-D cluster per seed word.
    flat_2d = tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))
    embeddings_en_2d = np.array(flat_2d).reshape(n, m, 2)

    return embeddings_en_2d, word_clusters


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, filename=''):
    """Scatter-plot 2-D word clusters, one colour and legend entry per label.

    Parameters
    ----------
    title : str
        Figure title.
    labels : sequence of str
        One label per cluster, shown in the legend.
    embedding_clusters : array-like
        One 2-D array per cluster; column 0 is used as x and column 1 as y.
    word_clusters : sequence of sequences of str
        The word drawn next to each point, cluster by cluster.
    filename : str
        When non-empty, the figure is also saved there as a 300 dpi PNG.
    """
    fig = plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))

    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # Pass the colour as a single-row 2-D sequence: a bare RGBA row given
        # to `c` is ambiguous (it can be read as four values to colour-map)
        # and makes matplotlib log a warning.
        # The legend label gets the same reshape/bidi treatment as the point
        # labels below so all Urdu text is drawn consistently.
        plt.scatter(x, y, c=[color], alpha=0.7, label=get_display(reshape(label)))

        for i, word in enumerate(words):
            plt.text(x[i] + 0.05, y[i] + 0.05, s=get_display(reshape(word)))

    plt.legend(loc=4)
    fig.suptitle(title, fontsize=20)
    plt.grid(True)

    if filename:
        plt.savefig(filename, format='png', dpi=300, bbox_inches='tight')

    plt.show()
    
def gen_plot(model, words, title='', filename='', perp=35):
    """Build neighbour clusters for `words` with `model` and plot them.

    `perp` is forwarded to build_clusters; `title` and `filename` are
    forwarded to tsne_plot_similar_words, with `words` used as the labels.
    """
    clusters_2d, neighbour_words = build_clusters(model, words, perp)
    tsne_plot_similar_words(
        title=title,
        labels=words,
        embedding_clusters=clusters_2d,
        word_clusters=neighbour_words,
        filename=filename,
    )

The seed words used to generate the clusters: each seed word contributes one cluster made of its 10 most similar words.

In [None]:
# Seed words for the cluster plots. gen_plot receives this list both as the
# words whose neighbours are looked up and as the legend labels.
words = ['مذہب', 'کھانا', 'موسم', 'محبت', 'پاکستان', 'اللہ', 'مالک', 'حکومت', 'شہر', 'محمد', 'کرکٹ', 'مسلمان', 'امریکہ', 'باپ', 'صبح']

#### Word2Vec Plots

In [None]:
# Cluster plots for both Word2Vec models. The CBOW plot overrides the
# perplexity (39); the skip-gram plot uses gen_plot's default.
gen_plot(
    model=word2vec_cbow,
    words=words,
    title='Word2Vec CBOW 500 Urdu - Clusters',
    filename='word2vec_urdu_cbow_500_clusters.png',
    perp=39,
)
gen_plot(
    model=word2vec_sg,
    words=words,
    title='Word2Vec SG 500 Urdu - Clusters',
    filename='word2vec_urdu_sg_500_clusters.png',
)

#### GloVe Plots

In [None]:
# Cluster plot for the GloVe model, using gen_plot's default perplexity.
gen_plot(
    model=glove,
    words=words,
    title='GloVe 500 Urdu - Clusters',
    filename='glove_urdu_500_clusters.png',
)

#### fastText Plots

In [None]:
# Cluster plots for both fastText models, using gen_plot's default perplexity.
gen_plot(
    model=fasttext_cbow,
    words=words,
    title='FastText CBOW 500 Urdu - Clusters',
    filename='fasttext_urdu_cbow_500_clusters.png',
)
gen_plot(
    model=fasttext_sg,
    words=words,
    title='FastText SG 500 Urdu - Clusters',
    filename='fasttext_urdu_sg_500_clusters.png',
)

## Performing Quantitative Analysis using Spearman's Correlation

Loading Benchmarks

In [None]:
import pandas as pd

# Both benchmark files are read as tab-separated text without a header row.

# WordSim-353 benchmark.
wordsim_file = os.path.join(base, 'Data/Evaluation-Datasets/wordsim353_agreed_urdu.txt')
ws_df = pd.read_csv(
    wordsim_file,
    sep='\t',
    header=None,
)

# SimLex-999 benchmark.
simlex_file = os.path.join(base, 'Data/Evaluation-Datasets/SimLex-999_urdu.txt')
sl_df = pd.read_csv(
    simlex_file,
    sep='\t',
    header=None,
)

## Playground