In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import Word2Vec
from gensim.models import FastText
import numpy as np

In [32]:
dt = pd.read_csv('urdu-sentiment-corpus-v1.tsv', delimiter='\t')
dt.columns = ['Tweet', 'Class']
dt['Class'] = dt['Class'].map({'P': 1, 'N': 0})
dt = dt.dropna()
dt

Unnamed: 0,Tweet,Class
0,میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...,1.0
1,چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...,0.0
3,"سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...",1.0
4,ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ,1.0
5,گندی زبان اور گٹر جیسے دماغ والے جاهل جیالے ه...,0.0
...,...,...
995,اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔,1.0
996,چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ ...,1.0
997,واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجو...,1.0
998,اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کا...,1.0


In [33]:
X = dt['Tweet'].values
y = dt['Class'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [34]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
max_len = max([len(seq) for seq in X_train + X_test])
vocab_size = len(tokenizer.word_index) + 1
X_train = pad_sequences(X_train, maxlen=max_len, padding='post')
X_test = pad_sequences(X_test, maxlen=max_len, padding='post')
word2vec_model = Word2Vec(sentences=dt['Tweet'], vector_size=100, window=5, min_count=1, workers=4)
word2vec_model.save("word2vec_urdu.model")



In [35]:
def bilstm_model(embedding_matrix):
    model = Sequential()
    model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1], weights=[embedding_matrix], input_length=max_len, trainable=False))
    model.add(Bidirectional(LSTM(100, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(100, return_sequences=True)))
    model.add(Dropout(0.3))
    model.add(Bidirectional(LSTM(100)))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [38]:
def evaluation(model, X_test, y_test):
    y_pred = (model.predict_class(X_test) > 0.5).astype("int32")
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    return acc, precision, recall, f1
results = pd.DataFrame(columns=['Accuracy', 'Precision', 'Recall', 'F1-Score'])

In [None]:
word2vec_model = Word2Vec.load("word2vec_urdu.model")
word_vectors = word2vec_model.wv
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
word_index_items = list(tokenizer.word_index.items())
i = 0
while i < len(word_index_items):
    word, index = word_index_items[i]
    if word in word_vectors:
        embedding_matrix[index] = word_vectors[word]
    i += 1
bilstm_model = bilstm_model(embedding_matrix)
bilstm_model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=1)
acc, precision, recall, f1 = evaluation(bilstm_model, X_test, y_test)
results.loc['BiLSTM_Word2Vec'] = [acc, precision, recall, f1]
print(results)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
                 Accuracy  Precision  Recall  F1-Score
BiLSTM_Word2Vec  0.489796        0.0     0.0       0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [40]:
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, 100))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
bilstm = bilstm_model(embedding_matrix)
bilstm.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=1)
acc, precision, recall, f1 = evaluation(bilstm, X_test, y_test)
results.loc['BiLSTM_Glove'] = [acc, precision, recall, f1]
print(results)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


AttributeError: 'Sequential' object has no attribute 'predict_class'

In [None]:
X_sentences = [sentence.split() for sentence in X]
fasttext_model = FastText(X_sentences, vector_size=32, min_count=1)
fasttext_dict = {word: fasttext_model.wv[word] for word in fasttext_model.wv.index_to_key}
embedding_matrix = np.zeros((5000, 32))
for word, i in tokenizer.word_index.items():
    embedding_vector = fasttext_dict.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
bilstm = bilstm_model(embedding_matrix)
bilstm.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, verbose=1)
acc, precision, recall, f1 = evaluation(bilstm, X_test, y_test)
results.loc['BiLSTM_FastText'] = [acc, precision, recall, f1]
print(results)