# Step-1 Data Acquisition

In [None]:
# !pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[

In [1]:
from datasets import load_dataset

dataset = load_dataset("carblacac/twitter-sentiment-analysis",trust_remote_code=True)

PermissionError: [Errno 13] Permission denied: 'C:/twitter-sentiment-analysis-train.jsonl'

In [None]:
dataset

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
df_train = pd.DataFrame(dataset["train"])
df_val = pd.DataFrame(dataset["validation"])
df_test = pd.DataFrame(dataset["test"])

The original Twitter Sentiment Analysis Dataset contains 1,578,627 classified tweets; each row is labelled 1 for positive sentiment and 0 for negative sentiment.

In [None]:
df_train.sample(5)

In [None]:
df_train['feeling'].value_counts()

In [None]:
dataset1 = load_dataset("Sp1786/multiclass-sentiment-analysis-dataset")

In [None]:
dataset1

In [None]:
df_train1 = pd.DataFrame(dataset1["train"])
df_val1 = pd.DataFrame(dataset1["validation"])
df_test1 = pd.DataFrame(dataset1["test"])

In [None]:
df_train1.sample(5)

# Step-2 Dataset Concatenation and Visualization

In [None]:
def convert_df_pipeline(df1,df2):
  """Merge a binary-sentiment frame with a three-class frame.

  The binary ``feeling`` column of ``df1`` is remapped onto the three-class
  scale (1 -> 2, everything else -> 0) and renamed to ``label``; the result is
  stacked on top of the ``text`` and ``label`` columns of ``df2``.

  Neither input frame is modified, so the cell that calls this function can be
  re-run safely. (Writing the remapped values back into ``df1`` would turn
  every 2 into 0 on a second run.)

  Args:
    df1: DataFrame with a ``feeling`` column holding 0/1 labels.
    df2: DataFrame with at least ``text`` and ``label`` columns.

  Returns:
    A new DataFrame: all rows of the converted ``df1`` followed by the
    ``text``/``label`` columns of ``df2``. Original index labels are kept.
  """
  remapped = df1['feeling'].apply(lambda x: 2 if x == 1 else 0)
  # assign() returns a new frame, leaving the caller's df1 untouched.
  converted = df1.assign(feeling=remapped).rename(columns={'feeling': 'label'})

  df_concat = pd.concat([converted, df2.loc[:,['text','label']]])

  return df_concat


train=convert_df_pipeline(df_train,df_train1)
val=convert_df_pipeline(df_val,df_val1)
test=convert_df_pipeline(df_test,df_test1)

In [None]:
ls1=train['label'].value_counts()

In [None]:
ls1

In [None]:
val['label'].value_counts()

In [None]:
# Pie chart of the class balance in the merged training set.
# Slice order follows the label tuple: negative (0), positive (2), neutral (1).
labels = ('Negative (0)', 'Positive (2)', 'Neutral (1)')
sizes = [ls1[class_id] for class_id in (0, 2, 1)]
fig1, ax1 = plt.subplots()
ax1.set_title('LABEL DISTRIBUTION IN TRAINING DATASET')
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
# Equal aspect ratio so the pie is drawn as a circle.
ax1.axis('equal')
plt.show()

In [None]:
# Bar chart of the class balance in the merged training set.
labels = ('Negative (0)', 'Positive (2)', 'Neutral (1)')
sizes = [ls1[0], ls1[2], ls1[1]]

fig, ax = plt.subplots()
ax.set_title('LABEL DISTRIBUTION IN TRAINING DATASET')
ax.bar(labels, sizes)
ax.set_ylabel('Count')
# bar() already labels the categorical ticks; only the rotation needs setting.
# Calling set_xticklabels() without fixing the ticks first is redundant here
# and triggers a warning on recent matplotlib versions.
ax.tick_params(axis='x', labelrotation=0)
plt.show()

# Step-3 Text Preprocessing

In [None]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

lemmatizer = WordNetLemmatizer()
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
print(stop_words)

In [None]:
nltk.download('punkt')
nltk.download('wordnet')

In [None]:
# negative words to exclude from stop_words
negative_words = {"don't", "doesn't", "didn't", "won't", "wouldn't", "can't", "mightn", 'didn', 'ain', "cannot", "isn't", "aren't", 'needn', "wasn't", "weren't", "haven't", "hasn't", "hadn't", "mustn't", "needn't", "shan't", "shouldn't",'not', "mightn't", "couldn't",'no', "hasn't"}

# Remove negative words from stop_words
stop_words -= negative_words

In [None]:
print(stop_words)

In [None]:
def preprocess_text_pipeline(text):
    """Normalise one raw message into a space-joined string of lemmas.

    Steps: lowercase, strip HTML tags, drop every character that is not an
    ASCII letter or whitespace, tokenize, remove tokens found in the
    module-level ``stop_words`` set, lemmatize, and re-join with single spaces.

    Args:
        text: Raw message. Anything that is not a ``str`` (for example ``None``
            or a float ``NaN`` from a missing cell) is treated as empty.

    Returns:
        The cleaned text; an empty string when nothing survives the cleaning
        or when ``text`` is not a string.
    """
    # A missing cell would otherwise crash on .lower(). Digits are stripped by
    # step 2 anyway, so mapping non-strings to '' loses no usable signal.
    if not isinstance(text, str):
        return ''

    #--> 1. Lowercasing
    text = text.lower()

    #--> 2. Remove HTML tags and special characters
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    #--> 3. Tokenize the text
    tokens = word_tokenize(text)

    #--> 4. Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    #--> 5. Lemmatize the tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    #--> 6. Join the tokens back into a string
    text = ' '.join(tokens)

    return text

#--> Example usage:
text = "I am not a @good kid. I am joking."
preprocessed_text = preprocess_text_pipeline(text)
print(preprocessed_text)

# Step-4 Word Embedding and padding

In [None]:
!wget http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip

In [None]:
!unzip glove.6B.zip

In [None]:
import numpy as np

words = dict()

def add_to_dict(d, filename):
  """Load word vectors from a whitespace-separated text file into ``d``.

  Each usable line has the form ``<word> <v1> <v2> ...``. The word becomes the
  key and the remaining fields become a 1-D float array.

  Args:
    d: Dictionary updated in place (word -> ``np.ndarray`` of floats).
    filename: Path to the embedding file; read as UTF-8.

  Returns:
    None. Blank lines, lines without vector components and lines whose
    components are not numeric are skipped.
  """
  # Explicit encoding: the platform default (e.g. cp1252 on Windows) can fail
  # on, or mis-decode, non-ASCII tokens.
  with open(filename, 'r', encoding='utf-8') as f:
    # Iterate lazily instead of readlines() to avoid holding the whole file in memory.
    for line in f:
      fields = line.rstrip('\n').split(' ')

      # Skip blank lines and entries that carry a word but no vector.
      if len(fields) < 2:
        continue

      try:
        d[fields[0]] = np.array(fields[1:], dtype=float)
      except ValueError:
        # Non-numeric component: skip only this malformed line.
        continue

add_to_dict(words, 'glove.6B.50d.txt')
# Show a small preview (first 5 components of 3 entries) rather than echoing
# the entire embedding dictionary as cell output.
{word: words[word][:5] for word in list(words)[:3]}

In [None]:
len(words)

In [None]:
def text_to_word_vectors(message, word_dict=words):
  """Convert one raw message into a sequence of word vectors.

  The message is cleaned with ``preprocess_text_pipeline``, tokenized, and each
  token that has an entry in ``word_dict`` is replaced by its vector. Tokens
  without an entry are dropped.

  Args:
    message: Raw text of one sample.
    word_dict: Mapping from token to vector; defaults to the module-level
      ``words`` dictionary.

  Returns:
    A float array with one row per known token, in token order. When no token
    is known the array is empty (``shape[0] == 0``).
  """
  text=preprocess_text_pipeline(message)
  tokens = word_tokenize(text)

  # Look tokens up in the dictionary that was passed in, not in the global
  # `words`, so a caller-supplied vocabulary is honoured.
  vectors = [word_dict[token] for token in tokens if token in word_dict]

  return np.array(vectors, dtype=float)


vector=text_to_word_vectors('i hate this film, it is worst')
print(vector,vector.shape)

In [None]:
def df_to_X_y(dff):
  """Turn a labelled text frame into model inputs and targets.

  Args:
    dff: DataFrame with a ``text`` column and a ``label`` column.

  Returns:
    A tuple ``(sequences, y)``: ``sequences`` is a list with one array of word
    vectors per row (in row order) and ``y`` is the ``label`` column as an
    integer NumPy array.
  """
  def _vectorize(message):
    # A message with no known tokens is represented by a single 50-d zero vector.
    seq = text_to_word_vectors(message)
    return seq if seq.shape[0] != 0 else np.zeros(shape=(1, 50))

  targets = dff['label'].to_numpy().astype(int)
  sequences = [_vectorize(message) for message in dff['text']]

  return sequences, targets

In [None]:
train.head()

In [None]:
X_train, y_train = df_to_X_y(train)

In [None]:
print(len(X_train), len(X_train[2]), type(X_train))

In [None]:
# Number of word vectors kept for each training message.
sequence_lengths = [len(vector_seq) for vector_seq in X_train]

plt.hist(sequence_lengths)

In [None]:
pd.Series(sequence_lengths).describe()

In [None]:
# from copy import deepcopy

# def pad_X(X, desired_sequence_length=198):
#   X_copy = deepcopy(X)

#   for i, x in enumerate(X):
#     x_seq_len = x.shape[0]
#     sequence_length_difference = desired_sequence_length - x_seq_len

#     pad = np.zeros(shape=(sequence_length_difference, 50))

#     X_copy[i] = np.concatenate([x, pad])

#   return np.array(X_copy).astype(float)

#--> The issue is that this pad_X function is creating a huge amount of temporary arrays and copying data around, which is causing the CPU to collapse.

In [None]:
import tqdm
def pad_X(X, batch_size=20, desired_sequence_length=198):
    """Zero-pad (or truncate) word-vector sequences to one common length.

    Args:
        X: List of 2-D arrays, each of shape ``(n_tokens, embedding_dim)``.
        batch_size: Number of sequences handled per progress-bar step.
        desired_sequence_length: Number of rows every output sequence has.

    Returns:
        Float array of shape ``(len(X), desired_sequence_length, embedding_dim)``.
        Shorter sequences are padded with zero rows at the end; longer ones are
        cut to their first ``desired_sequence_length`` rows (they previously
        made ``np.pad`` raise because of a negative pad width). An empty ``X``
        yields an empty 1-D float array.
    """
    if len(X) == 0:
        return np.array([]).astype(float)

    embedding_dim = X[0].shape[1]
    # Allocate the result once and fill it in place; this avoids building a
    # list of padded copies and then copying them again into the final array.
    padded_X = np.zeros((len(X), desired_sequence_length, embedding_dim), dtype=float)

    for batch in tqdm.tqdm(range(0, len(X), batch_size), desc="Padding batches", unit="batch"):
        for offset, x in enumerate(X[batch:batch+batch_size]):
            kept = min(x.shape[0], desired_sequence_length)
            padded_X[batch + offset, :kept] = x[:kept]
    return padded_X

In [None]:
X_train = pad_X(X_train)

X_train.shape

Padding batches:  92%|█████████▏| 6990/7561 [00:24<00:05, 103.94batch/s]

In [None]:
y_train.shape

In [None]:
X_val, y_val = df_to_X_y(val)
X_val = pad_X(X_val)

X_val.shape, y_val.shape

In [None]:
X_test, y_test = df_to_X_y(test)
X_test = pad_X(X_test)

X_test.shape, y_test.shape

# Step-5 Model Training

In [None]:
# Per-class sample counts in the merged training set.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
frequencies = train['label'].value_counts()

frequencies

In [None]:
# Inverse-frequency class weights: total sample count divided by the count of each class.
weights = {class_id: frequencies.sum() / frequencies[class_id] for class_id in (0, 1, 2)}
weights

In [None]:
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

# Three stacked 64-unit LSTM layers, each followed by 20% dropout. Every LSTM
# returns its full output sequence, which is flattened and fed to a 3-way
# softmax classifier. Input: 198 time steps of 50-dimensional vectors.
model = Sequential([
    layers.Input(shape=(198, 50)),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.LSTM(64, return_sequences=True),
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(3, activation='softmax'),
])

In [None]:
from tensorflow.keras.losses import CategoricalCrossentropy, SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam
# `f1_score` is not exported by tensorflow.keras.metrics, so importing it
# raised ImportError. Per-class F1 is reported after training by the
# classification_report in the evaluation step.
from tensorflow.keras.metrics import AUC
from tensorflow.keras.callbacks import ModelCheckpoint

# Keeps only the checkpoint with the best monitored value seen so far.
# NOTE(review): newer Keras releases may require a '.keras' or '.h5' file name
# here; if you change it, change the path passed to load_model as well.
cp = ModelCheckpoint('model/', save_best_only=True)

# The targets are integer class ids (0/1/2), not one-hot vectors, so the loss
# must be the sparse variant. CategoricalCrossentropy would reject the
# (batch,) label shape against the (batch, 3) softmax output.
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss=SparseCategoricalCrossentropy(),
              metrics=['accuracy'])

In [None]:
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=20, callbacks=[cp], class_weight=weights)

# Step-6 Model Evaluation

In [None]:
from tensorflow.keras.models import load_model

best_model = load_model('model/')

In [None]:
test_predictions = best_model.predict(X_test)
test_predictions_class = np.argmax(test_predictions, axis=1)

from sklearn.metrics import classification_report

print(classification_report(y_test, test_predictions_class))

# Step-7 X-AI Using LIME

In [None]:
!pip install lime

In [None]:
import lime
from lime import lime_text
from lime.lime_text import LimeTextExplainer

In [None]:
idx=150
# Slice instead of index so the sample keeps its leading batch axis:
# X_test[idx:idx + 1] has shape (1, seq_len, emb_dim). Wrapping the 2-D sample
# in a Python list does not add a batch axis for the model.
output = best_model.predict(X_test[idx:idx + 1])
print(output)

In [None]:
class_names = [0,1,2]
explainer = LimeTextExplainer(class_names=class_names)


def predict_proba(texts):
  """Classifier function for LIME: raw strings in, class probabilities out.

  LIME perturbs the raw text and passes a list of strings, so each string has
  to go through the same preprocessing, embedding and padding steps as the
  training data before it reaches the network.

  Args:
    texts: Iterable of raw text strings.

  Returns:
    Array of shape (len(texts), 3) with the model's softmax outputs.
  """
  # The label column is a placeholder; df_to_X_y needs it but it is not used here.
  frame = pd.DataFrame({'text': list(texts), 'label': 0})
  sequences, _ = df_to_X_y(frame)
  return best_model.predict(pad_X(sequences))


# The text explainer needs the original string, not the embedded tensor.
# X_test was built from `test` in row order, so position idx matches.
raw_text = test['text'].iloc[idx]
exp = explainer.explain_instance(raw_text, predict_proba, num_features = 100,top_labels=3)
probabilities = predict_proba([raw_text])
print('New document id: %d' % idx)
print('Predicted Label =', np.argmax(probabilities, axis=1))
print('Predicted probabilites =', probabilities)
print('Actual Label: %s' % y_test[idx])
print(exp.available_labels())

In [None]:
exp.show_in_notebook(text=True)

In [None]:
exp.as_list(label=0)

In [None]:
exp.as_list(label=1)

In [None]:
exp.as_list(label=2)

In [None]:
exp.as_pyplot_figure(label=0)