In [2]:
# Question 1
import pandas as pd

In [3]:
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')
print(df)

                                                 Tweet Class
0    میں نے ایٹم بم بنایا ھے ۔۔۔۔او بھائی ایٹم بمب ...     P
1    چندے سے انقلاب اور عمران خان وزیر اعظم نہیں بن...     N
2                             ٹویٹر کا خیال کیسے آیا ؟     O
3    سرچ انجن گوگل کے نائب صدر نے فضا میں ، 130,000...     P
4      ابھی تک اسکی لہریں کبھی کبھی آ جاتی ہیں یار :أْ     P
..                                                 ...   ...
995     اُس آدمی نے اِس سالار کو کافی معقول ٹپ دی ہے ۔     P
996  چچا غالب کی روح سے معذرت کے ساتھہم نے مانا کہ ...     P
997  واہ جناب واہ! اچھی رہی۔ جناب خود کو فرشتہ سمجو...     P
998  اسلام آباد :پی اے ٹی کا دھرنا ختم، صفائی کے کا...     P
999  دنیا نے کس کا راہ وفا میں دیا ہے ساتھتم بھی چل...     P

[1000 rows x 2 columns]


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, GRU, LSTM, Bidirectional, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


def preprocess_data(df):
    X = df['Tweet']
    y = df['Class']

    y_binary = np.where(y == 'P', 1, 0)

    if np.any((y_binary != 0) & (y_binary != 1)):
        raise ValueError("Labels contain mixed or unknown targets.")

    missing_labels = np.isnan(y_binary)
    if np.any(missing_labels):
        raise ValueError("Missing values found in labels.")

    y_binary = np.expand_dims(y_binary, axis=-1)

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    X = tokenizer.texts_to_sequences(X)
    max_sequence_length = 100
    X = pad_sequences(X, maxlen=max_sequence_length)

    return X, y_binary, tokenizer




def create_model(model_type, num_layers, dropout_rate, max_sequence_length, tokenizer):
    model = Sequential()
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=100, input_length=max_sequence_length))

    layers_dict = {'RNN': SimpleRNN, 'GRU': GRU, 'LSTM': LSTM}

    for _ in range(num_layers):
        if model_type == 'BiLSTM':
            layer = Bidirectional(LSTM(64, return_sequences=True))
        else:
            layer = layers_dict[model_type](64, return_sequences=True)
        model.add(layer)

    model.add(Dropout(dropout_rate))
    model.add(Dense(1, activation='sigmoid'))

    return model





def evaluate_model(model, X_test, y_test):
    y_pred_prob = model.predict(X_test)
    y_pred_mean = np.mean(y_pred_prob, axis=1)
    y_pred = (y_pred_mean > 0.5).astype(int)

    if y_test.shape != y_pred.shape:
        raise ValueError("Shapes of y_test and y_pred do not match.")

    if np.isnan(y_test).any():
        raise ValueError("Missing values found in y_test.")

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    return accuracy, precision, recall, f1



X, y_binary, tokenizer = preprocess_data(df)

X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.25, random_state=42)

results = []

for model_type in ['RNN', 'GRU', 'LSTM', 'BiLSTM']:
    for num_layers in [2, 3]:
        for dropout_rate in [0.3, 0.7]:
            model = create_model(model_type, num_layers, dropout_rate, 100, tokenizer)
            model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

            model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=0)

            accuracy, precision, recall, f1 = evaluate_model(model, X_test, y_test)

            results.append({
                'Model': model_type,
                'Num Layers': num_layers,
                'Dropout Rate': dropout_rate,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1
            })

results_df = pd.DataFrame(results)
print(results_df)


     Model  Num Layers  Dropout Rate  Accuracy  Precision    Recall  F1-score
0      RNN           2           0.3     0.504   1.000000  0.023622  0.046154
1      RNN           2           0.7     0.568   0.542986  0.944882  0.689655
2      RNN           3           0.3     0.556   0.735294  0.196850  0.310559
3      RNN           3           0.7     0.496   1.000000  0.007874  0.015625
4      GRU           2           0.3     0.572   0.661290  0.322835  0.433862
5      GRU           2           0.7     0.580   0.677419  0.330709  0.444444
6      GRU           3           0.3     0.520   0.733333  0.086614  0.154930
7      GRU           3           0.7     0.628   0.601190  0.795276  0.684746
8     LSTM           2           0.3     0.572   0.656250  0.330709  0.439791
9     LSTM           2           0.7     0.576   0.623529  0.417323  0.500000
10    LSTM           3           0.3     0.548   0.769231  0.157480  0.261438
11    LSTM           3           0.7     0.568   0.545455  0.897

In [14]:
#GLOVE EMBEDDING 

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to load GloVe embeddings
def load_glove_embeddings(file_path):
    with open(file_path, encoding='utf-8') as glove_file:
        embeddings_dict = {line.split()[0]: np.array(line.split()[1:], dtype='float32') 
                           for line in glove_file}
    return embeddings_dict

# Load GloVe embeddings
glove_embeddings_path = r'C:\Users\AATIGERS\Downloads\Compressed\glove.6B\glove.6B.300d.txt'
glove_embeddings = load_glove_embeddings(glove_embeddings_path)

# Load dataset
dataset_path = 'urdu-sentiment-corpus-v1.tsv'
dataframe = pd.read_csv(dataset_path, delimiter='\t')

# Tokenize text data and convert to sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(dataframe['Tweet'])
sequences = tokenizer.texts_to_sequences(dataframe['Tweet'])

# Pad sequences
max_length_sequence = 100
padded_sequences = pad_sequences(sequences, maxlen=max_length_sequence)

# Split data into features and labels
features = padded_sequences
labels = np.where(dataframe['Class'] == 'P', 1, 0)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.25, random_state=42)

# Create embedding matrix using GloVe embeddings
embedding_dimension = 300
word_index = tokenizer.word_index
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words, embedding_dimension))
for word, index in word_index.items():
    embedding_vector = glove_embeddings.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

# Define RNN model
model = Sequential()
model.add(Embedding(num_words, embedding_dimension, weights=[embedding_matrix], input_length=max_length_sequence, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Evaluate the model
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy:", accuracy)

# Predict probabilities for the test set
y_pred_probabilities = model.predict(X_test)

# Thresholds for classification
thresholds = np.arange(0.1, 0.6, 0.1)
results = []

# Calculate precision, recall, and F1-score for different thresholds
for threshold in thresholds:
    y_pred = (y_pred_probabilities > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

# Convert results to DataFrame
results_dataframe = pd.DataFrame(results)

# Print results
print(results_dataframe)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.4959999918937683
   Threshold  Precision    Recall  F1-score
0        0.1   0.508000  1.000000  0.673740
1        0.2   0.502092  0.944882  0.655738
2        0.3   0.497890  0.929134  0.648352
3        0.4   0.500000  0.897638  0.642254
4        0.5   0.517241  0.118110  0.192308


In [18]:
#LOAD FASTTEXT

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to load FastText embeddings
def load_fasttext_embeddings(file_path, limit=None):
    fasttext_model = KeyedVectors.load_word2vec_format(file_path, binary=False, limit=limit)
    return fasttext_model

# Function to preprocess data
def preprocess_data(X, y, tokenizer, max_sequence_length):
    tokenizer.fit_on_texts(X)
    X_sequences = tokenizer.texts_to_sequences(X)
    X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
    y_binary = np.where(y == 'P', 1, 0)
    return X_padded, y_binary

# Function to create embedding matrix
def create_embedding_matrix(tokenizer, embedding_model):
    embedding_dimension = embedding_model.vector_size
    num_words = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dimension))
    for word, index in tokenizer.word_index.items():
        if word in embedding_model:
            embedding_matrix[index] = embedding_model[word]
    return embedding_matrix

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')
X = df['Tweet']
y = df['Class']

# Load FastText embeddings
fasttext_embeddings_path = 'wiki-news-300d-1M.vec'
fasttext_embeddings = load_fasttext_embeddings(fasttext_embeddings_path, limit=20000)

# Tokenization and preprocessing
max_sequence_length = 100
tokenizer = Tokenizer()
X_processed, y_processed = preprocess_data(X, y, tokenizer, max_sequence_length)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.25, random_state=42)

# Create embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer, fasttext_embeddings)

# Define RNN model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=fasttext_embeddings.vector_size,
                    weights=[embedding_matrix], input_length=max_sequence_length, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Evaluate the model
_, accuracy = model.evaluate(X_test, y_test, verbose=0)
print("Accuracy:", accuracy)

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate precision, recall, and F1-score
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Calculate precision, recall, and F1-score for different thresholds
results = []
for threshold in np.arange(0.1, 0.6, 0.1):
    y_pred = (y_pred_prob > threshold).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': threshold, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
print(results_df)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 0.5040000081062317
Evaluation metrics:
Accuracy: 0.5040000081062317
Precision: 0.5882352941176471
Recall: 0.07874015748031496
F1-score: 0.1388888888888889
   Threshold  Precision    Recall  F1-score
0        0.1   0.508065  0.992126  0.672000
1        0.2   0.512295  0.984252  0.673854
2        0.3   0.518987  0.968504  0.675824
3        0.4   0.506608  0.905512  0.649718
4        0.5   0.588235  0.078740  0.138889


In [18]:
pip install tensorflow-text


Collecting tensorflow-text
  Downloading tensorflow_text-2.10.0-cp310-cp310-win_amd64.whl.metadata (2.1 kB)
Collecting tensorflow-hub>=0.8.0 (from tensorflow-text)
  Downloading tensorflow_hub-0.16.1-py2.py3-none-any.whl.metadata (1.3 kB)
Collecting tf-keras>=2.14.1 (from tensorflow-hub>=0.8.0->tensorflow-text)
  Downloading tf_keras-2.16.0-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of tf-keras to determine which version is compatible with other requirements. This could take a while.
  Downloading tf_keras-2.15.1-py3-none-any.whl.metadata (1.7 kB)
  Downloading tf_keras-2.15.0-py3-none-any.whl.metadata (1.6 kB)
Downloading tensorflow_text-2.10.0-cp310-cp310-win_amd64.whl (5.0 MB)
   ---------------------------------------- 0.0/5.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.0 MB ? eta -:--:--
   ---------------------------------------- 0.0/5.0 MB ? eta -:--:--
 

In [20]:
#ELMO



import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

def preprocess_data(X, y):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    X_sequences = tokenizer.texts_to_sequences(X)
    max_sequence_length = max(map(len, X_sequences))
    X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
    y_binary = np.where(y == 'P', 1, 0)
    return X_padded, y_binary, tokenizer, max_sequence_length

def build_model(input_dim, output_dim, max_sequence_length):
    model = Sequential()
    model.add(Embedding(input_dim=input_dim + 1, output_dim=output_dim, input_length=max_sequence_length))
    model.add(SimpleRNN(64, return_sequences=True))  
    model.add(Dropout(0.3))
    model.add(SimpleRNN(64, return_sequences=True))  
    model.add(Dropout(0.3))
    model.add(SimpleRNN(64, return_sequences=False))  
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='sigmoid'))  
    return model

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Split data into features and labels
X = df['Tweet']
y = df['Class']

# Preprocess the data
X_processed, y_processed, tokenizer, max_sequence_length = preprocess_data(X, y)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.25, random_state=42)

# Define the RNN model
model = build_model(len(tokenizer.word_index), 300, max_sequence_length)

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test)

# Threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation metrics:
Accuracy: 0.468
Precision: 0.4785714285714286
Recall: 0.5275590551181102
F1-score: 0.5018726591760299


In [35]:
#WORD2VEC

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score
from gensim.models import KeyedVectors
from keras.models import Sequential
from keras.layers import Dense, Embedding, SimpleRNN, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences



def load_word_vectors(file_path, limit=None):
    word_vectors = KeyedVectors.load_word2vec_format(file_path, binary=True, limit=limit)
    return word_vectors

def preprocess_data(X, y):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    X_sequences = tokenizer.texts_to_sequences(X)
    max_sequence_length = max(map(len, X_sequences))
    X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
    y_binary = np.where(y == 'P', 1, 0)
    return X_padded, y_binary, tokenizer, max_sequence_length

def create_embedding_matrix(tokenizer, word_vectors):
    embedding_dim = word_vectors.vector_size
    num_words = len(tokenizer.word_index) + 1
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, index in tokenizer.word_index.items():
        if word in word_vectors:
            embedding_matrix[index] = word_vectors[word]
    return embedding_matrix

# Load the dataset
file_path = 'urdu-sentiment-corpus-v1.tsv'
df = pd.read_csv(file_path, delimiter='\t')

# Extract features and labels
X = df['Tweet']
y = df['Class']

# Preprocess the data
X_processed, y_processed, tokenizer, max_seq_length = preprocess_data(X, y)

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_processed, y_processed, test_size=0.25, random_state=42)

# Load pre-trained word vectors
word_vectors_path = r'C:\Users\AATIGERS\Desktop\Urdsent\GoogleNews-vectors-negative300.bin'
word_vectors = load_word_vectors(word_vectors_path, limit=20000)

# Create embedding matrix
embedding_matrix = create_embedding_matrix(tokenizer, word_vectors)

# Define the RNN model architecture
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=300, weights=[embedding_matrix], input_length=max_seq_length, trainable=False))
model.add(SimpleRNN(64, return_sequences=False))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=64, verbose=1)

# Predict probabilities for the test set
y_pred_prob = model.predict(X_test)

# Apply threshold for classification
threshold = 0.5
y_pred = (y_pred_prob > threshold).astype(int)

# Calculate evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print evaluation metrics
print("Evaluation metrics:")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

# Calculate precision, recall, and F1-score for different thresholds
thresholds = np.arange(0.1, 0.6, 0.1)
results = []
for thr in thresholds:
    y_pred = (y_pred_prob > thr).astype(int)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    results.append({'Threshold': thr, 'Precision': precision, 'Recall': recall, 'F1-score': f1})

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Print results
print(results_df)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Evaluation metrics:
Accuracy: 0.492
Precision: 0.5
Recall: 0.007874015748031496
F1-score: 0.015503875968992248
   Threshold  Precision    Recall  F1-score
0        0.1   0.508000  1.000000  0.673740
1        0.2   0.510204  0.984252  0.672043
2        0.3   0.512295  0.984252  0.673854
3        0.4   0.512295  0.984252  0.673854
4        0.5   0.500000  0.007874  0.015504
