# Evaluating Roman-Urdu Models through Sentiment Analysis

This notebook evaluates Roman-Urdu word-embedding models on a downstream sentiment-analysis task, using a dataset of labelled tweets.

References
1. https://github.com/tthustla/twitter_sentiment_analysis_part11/blob/master/Capstone_part11.ipynb
2. https://towardsdatascience.com/another-twitter-sentiment-analysis-with-python-part-11-cnn-word2vec-41f5e28eda74

#### Hiding warnings

In [0]:
import warnings
warnings.filterwarnings("ignore")

#### Colab-specific statements


In [0]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem. Every data/model path used in
# this notebook lives under this mount point. Mounting prompts for an
# interactive authorization code.
drive.mount('/content/drive/')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive/


## Loading Models

#### Defining paths

In [0]:
import os
import tensorflow

# Root of the project folder on the mounted Google Drive. The other paths are
# derived from it so that relocating the project only requires one change.
base = '/content/drive/My Drive/FYP/'

# Roman-Urdu ELMo embeddings file.
elmo_path = base + 'Models/ELMo/roman-urdu/embeddings.txt'

# Labelled Roman-Urdu tweets used for the sentiment-analysis evaluation.
tweets_path = base + 'Evaluation/roman-urdu_tweets.csv'

In [0]:
from gensim.models.keyedvectors import KeyedVectors

# Load the embeddings as gensim KeyedVectors. binary=False because the file is
# read in the plain-text word2vec format rather than the binary one.
elmo = KeyedVectors.load_word2vec_format(elmo_path, binary=False)

## Loading and processing the dataset

In [0]:
import pandas as pd
import re

def clean_dataset(data_df):
    """Normalise the raw tweets and binarise their sentiment labels.

    Each row of ``data_df`` is read positionally: column 0 is the tweet text
    and column 1 is the sentiment label. Rows whose text is not a ``str``
    (for example NaN from an empty CSV cell) are dropped.

    Cleaning replaces @mentions, URLs and every character that is not an
    ASCII letter, digit, space or tab (so punctuation and emojis go away)
    with a space, collapses runs of whitespace and lower-cases the result.
    Digits are kept.

    Returns:
        A list of ``[tweet, label]`` pairs, where ``label`` is 1 for the exact
        label ``'Positive'`` and 0 for every other value.
    """
    # Raw string so that \w, \/ and \S reach the regex engine untouched instead
    # of being parsed as (invalid) string escape sequences.
    noise = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")

    cleaned = []
    for row in data_df.values:
        text = row[0]
        if not isinstance(text, str):
            continue

        tweet = ' '.join(noise.sub(" ", text).split()).lower()
        label = 1 if row[1] == 'Positive' else 0
        cleaned.append([tweet, label])

    return cleaned

In [0]:
# Read the raw CSV (it has no header row), clean it and rebuild a two-column
# frame: column 0 holds the cleaned tweet, column 1 the binary label.
data_df = pd.read_csv(tweets_path, header=None)
cleaned_dataset = clean_dataset(data_df)
dataset = pd.DataFrame(cleaned_dataset)
# Shuffle once with a fixed seed so that the positional train/test split is
# identical on every "Restart & Run All" and the reported scores are
# reproducible.
dataset = dataset.sample(frac=1, random_state=42)

## Preparing train and test sets

In [0]:
# Split the shuffled frame by position: the first 16000 rows train the
# classifier, the remainder is held out for testing. Column 0 is the tweet
# text and column 1 the 0/1 label.
train_x, train_y = (dataset[:16000][col].tolist() for col in (0, 1))

test_x, test_y = (dataset[16000:][col].tolist() for col in (0, 1))

## Building the Neural Network

#### Functions

In [0]:
import numpy as np

from keras.layers import Dense, Flatten
from keras.layers.embeddings import Embedding
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

def get_embedding_indexes(emb_model):
    """Build a plain ``{word: vector}`` lookup from a trained embedding model.

    Every key of ``emb_model.vocab`` is looked up through ``emb_model.wv``.
    The number of collected vectors is printed before the dictionary is
    returned.
    """
    word_to_vector = {
        token: emb_model.wv[token] for token in emb_model.vocab.keys()
    }
    print('Found %s word vectors.' % len(word_to_vector))

    return word_to_vector

def get_word_counts(texts=None):
    """Return the number of distinct whitespace-separated words in ``texts``.

    Args:
        texts: Iterable of strings to scan. When omitted, the notebook-level
            ``train_x`` list is used, so zero-argument calls keep working.

    Returns:
        The count of unique tokens produced by ``str.split()`` over all lines.
    """
    if texts is None:
        # Backward-compatible default: the training tweets defined at notebook level.
        texts = train_x

    # Only the vocabulary size is returned, so a set of the seen words is
    # sufficient; no per-word frequency is needed.
    unique_words = set()
    for line in texts:
        unique_words.update(line.split())

    return len(unique_words)

def generate_embedding_matrix(tokenizer, embeddings_index, word_counts, embedding_dim=500):
    """Build the initial weight matrix for the embedding layer.

    Args:
        tokenizer: Object exposing ``word_index``, a ``{word: integer index}``
            mapping.
        embeddings_index: ``{word: vector}`` lookup of pre-trained vectors.
        word_counts: Number of rows of the matrix; words whose index is not
            below this value are skipped.
        embedding_dim: Number of columns, i.e. the length of each word vector.
            Defaults to 500.

    Returns:
        A ``(word_counts, embedding_dim)`` array in which row ``i`` holds the
        pre-trained vector of the word with index ``i``. Rows for words that
        have no pre-trained vector stay all-zero.
    """
    embedding_matrix = np.zeros((word_counts, embedding_dim))
    for word, i in tokenizer.word_index.items():
        if i >= word_counts:
            # Index falls outside the matrix; there is no row to fill.
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

def train_model(embedding_matrix, word_counts, x_train_seq, train_y):
    """Build and fit the sentiment classifier on top of pre-trained embeddings.

    Architecture: an Embedding layer (``word_counts`` x 500, input length 320)
    initialised from ``embedding_matrix`` and left trainable, flattened into a
    256-unit ReLU layer followed by a single sigmoid output unit. The network
    is compiled with binary cross-entropy and Adam, trained for 5 epochs with
    a batch size of 32, and the fitted model is returned.
    """
    classifier = Sequential()

    # Layers are added in the order listed: embeddings -> flatten -> dense -> output.
    for layer in (
        Embedding(
            word_counts,
            500,
            weights=[embedding_matrix],
            input_length=320,
            trainable=True,
        ),
        Flatten(),
        Dense(256, activation='relu'),
        Dense(1, activation='sigmoid'),
    ):
        classifier.add(layer)

    classifier.compile(
        loss='binary_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    classifier.fit(x_train_seq, train_y, epochs=5, batch_size=32, verbose=2)

    return classifier

def train_embedding_model(emb_model, name, train_x, train_y, test_x, test_y):
    """Train and evaluate the sentiment classifier for one embedding model.

    Steps: fit a Keras ``Tokenizer`` on ``train_x``, pad the sequences to 320
    tokens, build the embedding matrix from ``emb_model``, train the network,
    save it as ``<name>.h5`` and print the result of ``model.evaluate`` on the
    test set (a ``[loss, accuracy]`` list).

    Args:
        emb_model: Trained embedding model supplying the word vectors.
        name: Label used in the printed output and as the saved file's stem.
        train_x, train_y: Training tweets and their 0/1 labels.
        test_x, test_y: Held-out tweets and their 0/1 labels.
    """
    num_words = len(emb_model.wv.vocab)
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(train_x)
    sequences = tokenizer.texts_to_sequences(train_x)

    x_train_seq = pad_sequences(sequences, maxlen=320)

    embeddings_index = get_embedding_indexes(emb_model)

    # Tokenizer indices start at 1 (0 is the padding id) and
    # texts_to_sequences only emits indices below num_words. The embedding
    # table therefore needs max_index + 1 rows; sizing it by the plain
    # unique-word count would leave the highest index out of range.
    word_counts = min(num_words, len(tokenizer.word_index) + 1)

    embedding_matrix = generate_embedding_matrix(tokenizer, embeddings_index, word_counts)

    model = train_model(embedding_matrix, word_counts, x_train_seq, train_y)
    model.save(name + '.h5')

    # Reuse the tokenizer fitted on the training data so test indices match.
    sequences_test = tokenizer.texts_to_sequences(test_x)
    x_test_seq = pad_sequences(sequences_test, maxlen=320)

    print("{} accuracy: ".format(name))
    print(model.evaluate(x=x_test_seq, y=test_y))
    print()

Using TensorFlow backend.


## Training the model

In [0]:
# Embedding models to evaluate and, at the same position, the label used in
# the printed results and as the stem of the saved ".h5" file.
models, names = [elmo], ['Elmo']

In [0]:
# Train and evaluate one classifier per embedding model, pairing each model
# with the name stored at the same position.
for position, emb_model in enumerate(models):
    train_embedding_model(emb_model, names[position], train_x, train_y, test_x, test_y)

Found 37172 word vectors.











Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/5
 - 24s - loss: 0.5740 - acc: 0.7222
Epoch 2/5
 - 21s - loss: 0.4214 - acc: 0.8105
Epoch 3/5
 - 21s - loss: 0.2212 - acc: 0.9109
Epoch 4/5
 - 21s - loss: 0.0883 - acc: 0.9717
Epoch 5/5
 - 21s - loss: 0.0361 - acc: 0.9902
Elmo accuracy: 
[1.1395248333688266, 0.7594607379375591]

