<a href="https://www.kaggle.com/code/mohsinmshabbir/twitter-sentiment-analysis-traditional-dl?scriptVersionId=187135419" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Setting Up

## Setting Up Conda Environment

In [None]:
# conda create -n nlp2 python=3.10.*
# conda activate nlp2 
# conda install tensorflow
# conda install pandas
# conda install spacy
# conda install scikit-learn
# conda install imbalanced-learn
# conda install wordcloud
# conda install gensim

## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import re
import spacy
import string
import random

from wordcloud import WordCloud # type: ignore
import matplotlib.pyplot as plt # type: ignore
#! python -m spacy download en_core_web_sm
# Load the small English spaCy pipeline with the 'parser' and 'ner' components
# switched off; the cleaning code in this notebook only reads token lemmas and
# the is_digit / is_space token flags.
nlp = spacy.load("en_core_web_sm", disable=['parser', 'ner'])

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.dummy import DummyClassifier

In [None]:
# ! pip install keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences # type: ignore
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.callbacks import LearningRateScheduler


import gensim # type: ignore
from gensim.models import Word2Vec # type: ignore

## Importing Data

In [None]:
# Read the raw tweets; the path is relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='Twitter_Data.csv')
df.head(n=5)

## Data Preprocessing

In [None]:
# Column dtypes and non-null counts of the raw frame.
df.info()

In [None]:
# Number of rows per sentiment label.
df['category'].value_counts()

In [None]:
# Two sample tweets (looked up by index label) to eyeball the raw text.
for row_label in (19, 91):
    print(df['clean_text'][row_label])

In [None]:
# True when at least one tweet text is missing.
any_nan_in_A = df['clean_text'].isnull().any()
print(any_nan_in_A)

In [None]:
# Missing tweet texts become empty strings so the string helpers can run on every row.
df = df.assign(clean_text=df['clean_text'].fillna(''))

In [None]:
# Text NaNs were filled above, so this removes rows that still have a missing
# value in any other column.
df = df.dropna(axis=0, how='any')

In [None]:
# True when at least one label is still missing after the drop.
any_nan_in_A = df['category'].isnull().any()
print(any_nan_in_A)

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` link deleted.

    A link extends up to the next whitespace character.
    """
    return re.sub(r'https?://\S+|www\.\S+', r'', text)


def remove_emoji(text):
    """Return ``text`` with every character inside the listed code-point ranges removed.

    NOTE(review): the last range runs from U+24C2 to U+1F251, so besides emoji
    it also removes everything in between (for example CJK characters).
    """
    code_point_ranges = (
        u'\U0001F600-\U0001F64F',  # emoticons
        u'\U0001F300-\U0001F5FF',  # symbols & pictographs
        u'\U0001F680-\U0001F6FF',  # transport & map symbols
        u'\U0001F1E0-\U0001F1FF',  # flags (iOS)
        u'\U00002702-\U000027B0',
        u'\U000024C2-\U0001F251',
    )
    matcher = re.compile('[' + ''.join(code_point_ranges) + ']+', flags=re.UNICODE)
    return matcher.sub(r'', text)


def remove_html(text):
    """Return ``text`` with HTML tags and character entities removed.

    Matches
      * opening/closing tags such as ``<b>`` or ``</div class="x">`` (the ``<``
        must be followed directly by a letter, so comparisons like ``1 < 2``
        are left alone), and
      * named, decimal and hexadecimal entities such as ``&amp;``, ``&#39;``
        and ``&#x27;``.

    The previous pattern only matched a tag at the very start of the string
    (after exactly one non-space character) and only removed an entity when
    it was followed by non-alphanumeric characters.
    """
    html = re.compile(
        r'</?[A-Za-z][^<>]*>|&(?:[a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});',
        flags=re.IGNORECASE,
    )
    return html.sub('', text)


def remove_punct(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    unwanted = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in unwanted)

def remove_quotes(text):
    """Return ``text`` keeping only ASCII letters, digits and whitespace.

    Despite the name this removes every other character, not just quotes.
    """
    return re.compile(r'[^A-Za-z0-9\s]+').sub('', text)


# Run the text through each cleaning helper in turn on a copy of the frame,
# then lower-case the result.
df1 = df.copy()
cleaned_text = df1['clean_text']
for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct, remove_quotes):
    cleaned_text = cleaned_text.apply(cleaner)
df1['clean_text'] = cleaned_text.str.lower()


# Same two sample tweets as before, now cleaned.
for row_label in (19, 91):
    print(df1['clean_text'][row_label])

### Reducing the Dataset Size

As the Dataset size is too large, we'll reduce the dataset size by dropping the neutral (0.0) sentiment score in our Dataset

In [None]:
# Keep only the polar tweets (category != 0.0). The filtered rows are copied
# once so df2 is independent of df1; the earlier full copy of df1 was thrown
# away immediately by the reassignment and has been removed.
df2 = df1[df1.category != 0.0].copy()
df2.info()

In [None]:
# Rows whose cleaned text is identical are collapsed to their first occurrence.
df2 = df2.drop_duplicates(subset='clean_text', keep='first')
df2.info()

Reduced the dataset Size by 34% approx.

In [None]:
df3 = df2.copy()


def _lemmas_from_doc(doc):
    """Join the lemmas of a parsed ``doc``, skipping digit and whitespace tokens."""
    return " ".join(token.lemma_ for token in doc
                    if not token.is_digit and not token.is_space)


def clean_tweet_text(text):
    """Lemmatise one text with the notebook-level ``nlp`` pipeline.

    Returns the lemmas joined by single spaces; tokens flagged ``is_digit`` or
    ``is_space`` are dropped.
    """
    return _lemmas_from_doc(nlp(text))


# nlp.pipe streams the texts through the pipeline in batches, which spaCy
# documents as the efficient way to process many texts; the result per text is
# the same as calling clean_tweet_text row by row.
df3['clean_text'] = [_lemmas_from_doc(doc) for doc in nlp.pipe(df3['clean_text'])]

# Same two sample tweets, now lemmatised.
for row_label in (19, 91):
    print(df3['clean_text'][row_label])

# Exploratory Data Analysis

In [None]:
df_test = df3.copy()
# Binary target: -1 becomes 0, every other label becomes 1.
df_test['category'] = df_test['category'].ne(-1).astype('int64')
df_test.head()


In [None]:
display(df_test.describe())

# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() added a stray "None" line to the output.
df_test.info()

In [None]:
# Rows per class, most frequent first
sentiment_counts = df_test['category'].value_counts()

# Bar positions are the class labels rendered as strings; heights are the counts
bar_labels = [str(label) for label in sentiment_counts.index]
bar_heights = sentiment_counts.values.tolist()

fig, ax = plt.subplots()
ax.bar(bar_labels, bar_heights)

ax.set_title("Sentiment Distribution")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Count")
plt.xticks(rotation=45, ha='right')  # tilt the tick labels so they stay readable

fig.tight_layout()
plt.show()

**Inference**

There are more positive tweets than negative ones.

This means our data is not balanced. We'll keep that in mind when we train and test our models.

In [None]:
# Per-tweet length features: whitespace-separated words, characters excluding
# spaces, and their ratio (the +1 avoids dividing by zero for empty texts).
tweet_text = df_test['clean_text']
df4 = df_test.assign(
    word_count=tweet_text.str.split().str.len(),
    char_count=tweet_text.str.replace(" ", "", regex=False).str.len(),
)
df4['word_density'] = df4['word_count'] / (df4['char_count'] + 1)

df4[['word_count', 'char_count', 'word_density']].head()

In [None]:
def plot_distribution_by_category(df, column, start, end, size, category_type):
    """Draw side-by-side histograms of ``column`` for negative and positive tweets.

    Parameters
    ----------
    df : DataFrame with a ``category`` column (0 = negative, 1 = positive)
        and the column named by ``column``.
    column : name of the numeric column to plot.
    start, end, size : histogram bin edges are ``np.arange(start, end + size, size)``.
    category_type : text appended to the x-axis label ("Tweet Length ...").

    The figure is shown with ``plt.show()``; nothing is returned.
    """
    bins = np.arange(start, end + size, size)

    # (label value, panel title, bar colour) for the left and right panel
    panels = (
        (0, 'Negative Tweets', 'orange'),
        (1, 'Positive Tweets', 'blue'),
    )

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    for ax, (label, title, color) in zip(axes, panels):
        ax.hist(df.loc[df['category'] == label, column],
                bins=bins, color=color, alpha=0.75)
        ax.set_title(title)
        ax.set_xlabel(f'Tweet Length {category_type}')
        # Both panels now use the same y-axis label (the right one used to
        # read 'Number of Tweet').
        ax.set_ylabel('Number of Tweets')

    # Prevent the axis labels of the two panels from overlapping
    plt.tight_layout()
    plt.show()

In [None]:
# Words per tweet, bins of width 3 from 0 to 60.
plot_distribution_by_category(df4, column='word_count', start=0, end=60,
                              size=3, category_type='Words')

**Inference**

> Most tweets use fewer than 40 words.
>
> Generally, tweeters write about 10 - 37 words













In [None]:
# Characters per tweet (spaces excluded), bins of width 30 from 0 to 300.
plot_distribution_by_category(df4, column='char_count', start=0, end=300,
                              size=30, category_type='Characters')

**Inference**


> Twitter limits a tweet to 280 characters, yet most tweeters use between 50 and 200 characters.
>





In [None]:
# Words per character, bins of width 0.01 from 0.09 to 0.3.
plot_distribution_by_category(df4, column='word_density', start=0.09, end=0.3,
                              size=.01, category_type='Word density')

In [None]:
# Per-class views of the feature frame, used for the word clouds below.
is_positive = df4['category'].eq(1.0)
is_negative = df4['category'].eq(0.0)
positive_tweets = df4.loc[is_positive]
negative_tweets = df4.loc[is_negative]

In [None]:
# First rows of the frame including the length features added above.
df4.head()

In [None]:
# Very common words that would otherwise dominate both clouds.
noise_words = {'will', 'make', 'people', 'say', 'vote', 'now', 'give',
               's', 'one', 'govt', 'thi', 'hi', 'ju'}


def _join_without_noise(texts, noise):
    """Concatenate ``texts`` in lower case, dropping tokens that are in ``noise``.

    Filtering is done on whole whitespace-separated tokens. The earlier
    ``str.replace`` version removed the noise words as substrings, so for
    example 's' deleted every letter "s" and 'hi' cut into "this" and "which".
    """
    tokens = " ".join(texts).lower().split()
    return " ".join(token for token in tokens if token not in noise)


pos_text_cln = _join_without_noise(positive_tweets.clean_text, noise_words)
neg_text_cln = _join_without_noise(negative_tweets.clean_text, noise_words)


def _random_hsl(rng, hue_low, hue_high):
    """Return an 'hsl(...)' colour with a random hue in [hue_low, hue_high].

    ``rng`` is the random generator handed over by WordCloud.recolor; when it
    is None the global ``random`` module is used instead.
    """
    source = rng if rng is not None else random
    return 'hsl({:d}, 80%, {:d}%)'.format(source.randint(hue_low, hue_high),
                                          source.randint(60, 80))


def green_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour function for the positive cloud (green hues)."""
    # Drawing from random_state makes recolor(random_state=...) reproducible.
    return _random_hsl(random_state, 85, 140)


def red_color(word, font_size, position, orientation, random_state=None, **kwargs):
    """Colour function for the negative cloud (red hues)."""
    return _random_hsl(random_state, 0, 35)


fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[10, 6])

wordcloud1 = WordCloud(background_color='white', height=400).generate(pos_text_cln)
ax1.imshow(wordcloud1.recolor(color_func=green_color, random_state=3),interpolation="bilinear")
ax1.axis('off');
ax1.set_title('Positive Tweets');

wordcloud2 = WordCloud(background_color='white', height=400).generate(neg_text_cln)
ax2.imshow(wordcloud2.recolor(color_func=red_color, random_state=3),interpolation="bilinear")
ax2.axis('off');
ax2.set_title('Negative Tweets');

**Inference**

> One thing is clear that our data comes largely from India as evident from WordCloud
>
> The topic also seems to be around politics primarily talking about 'bjp', 'congress' and 'modi'
>
> The word 'modi', prime minister of India, is used in every sentiment indicating both support and opposition for "Modi Government".

# Saving the dataset

In [None]:
# Persist the cleaned frame (index included) as UTF-8 with a BOM.
df4.to_csv(path_or_buf='refined_tweet_data.csv', encoding='utf-8-sig')

# Baseline Model

In [None]:
# Model input is the cleaned text; the target is the 0/1 sentiment label.
X, Y = df4['clean_text'], df4['category']

In [None]:
# 80/20 split that keeps the class ratio of Y in both parts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y,
    test_size=0.2,
    random_state=34,
    stratify=Y,
)

In [None]:
# TF-IDF vectorizer; accented characters are mapped to ASCII.
vectorizer = TfidfVectorizer(strip_accents='ascii')

# The vocabulary and IDF weights are learned from the training texts only;
# both splits are then projected with that fitted vectorizer.
vectorizer.fit(X_train)
tfidf_train = vectorizer.transform(X_train)
tfidf_test = vectorizer.transform(X_test)

In [None]:
# Baseline that always answers with the most frequent training label.
majority_classifier = DummyClassifier(strategy="most_frequent").fit(tfidf_train, Y_train)

# Its predictions on the test split
y_pred_majority = majority_classifier.predict(tfidf_test)

In [None]:
# Per-class precision / recall / F1 of the majority baseline
majority_report = classification_report(Y_test, y_pred_majority)
print("\nClassification Report:\n")
print(majority_report)

# Traditional Machine Learning Model

## Naive Bayes

In [None]:
# df4 = pd.read_csv('refined_tweet_data.csv')

In [None]:
# Grid-search the smoothing parameter alpha of Multinomial Naive Bayes over
# 0.10, 0.15, ..., 1.00.
#
# The grid is generated from integers so every alpha is exact to two decimals
# and 1.0 is always included; repeatedly adding 0.05 to a float accumulates
# rounding error.
#
# NOTE: alpha is selected on the test split, so the reported accuracy is an
# optimistic estimate.
best_accuracy = 0
best_nb = None
for a in (round(0.1 + 0.05 * step, 2) for step in range(19)):
    candidate = MultinomialNB(alpha=a)
    candidate.fit(tfidf_train, Y_train)

    curr_accuracy = candidate.score(tfidf_test, Y_test)
    print(f"Accuracy with alpha {a}: {curr_accuracy}")

    if best_nb is None or curr_accuracy > best_accuracy:
        best_accuracy = curr_accuracy
        best_nb = candidate

# Keep the best model (not the last one tried) under the name later cells use,
# together with its test predictions.
nb = best_nb
y_pred = nb.predict(tfidf_test)

### Evaluate Performance

In [None]:
# Per-class metrics of the best Naive Bayes run above
nb_report = classification_report(Y_test, y_pred)
print("\nClassification Report:\n")
print(nb_report)

In [None]:
# Oversample the minority class in TF-IDF space so both classes are equally
# represented in the training data.
#
# The labels are looked up from Y via X_train's index (the training rows in
# their original order) instead of from Y_train, because this cell rebinds
# Y_train to the resampled labels: reading Y_train here would make a second run
# of the cell fail with mismatched lengths.
smote = SMOTE(random_state=42)
tfidf_train_oversample, Y_train = smote.fit_resample(tfidf_train, Y.loc[X_train.index])

In [None]:
# Repeat the alpha grid search (0.10, 0.15, ..., 1.00) on the oversampled
# training data.
#
# The grid is generated from integers so every alpha is exact to two decimals
# and 1.0 is always included; repeatedly adding 0.05 to a float accumulates
# rounding error.
#
# NOTE: alpha is selected on the test split, so the reported accuracy is an
# optimistic estimate.
best_accuracy = 0
best_nb = None
for a in (round(0.1 + 0.05 * step, 2) for step in range(19)):
    candidate = MultinomialNB(alpha=a)
    candidate.fit(tfidf_train_oversample, Y_train)

    curr_accuracy = candidate.score(tfidf_test, Y_test)
    print(f"Accuracy with alpha {a}: {curr_accuracy}")

    if best_nb is None or curr_accuracy > best_accuracy:
        best_accuracy = curr_accuracy
        best_nb = candidate

# `nb` is used for single-tweet prediction further down, so it must be the
# best model found here rather than the last one tried.
nb = best_nb
y_pred = nb.predict(tfidf_test)

In [None]:
# Per-class metrics of the best Naive Bayes run on the oversampled data
nb_oversampled_report = classification_report(Y_test, y_pred)
print("\nClassification Report:\n")
print(nb_oversampled_report)

**Inference:**

Based on the evaluation metrics, the text classification model demonstrates good overall performance. It achieves an accuracy of almost 81%, although precision and recall are somewhat imbalanced between the two classes. The F1-scores further support this conclusion.

### Prediction using Naive Bayes

In [None]:
def predict_tweet(tweet, model, vectorizer):
    """Classify a single raw tweet.

    The tweet goes through the same steps as the training text (URL, emoji,
    HTML, punctuation and non-alphanumeric removal, lower-casing,
    lemmatisation), is vectorised with the fitted ``vectorizer`` and passed to
    ``model.predict``. Returns the predicted label for that one tweet.
    """
    for cleaner in (remove_URL, remove_emoji, remove_html, remove_punct, remove_quotes):
        tweet = cleaner(tweet)
    tweet = clean_tweet_text(tweet.lower())

    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([tweet])
    return model.predict(features)[0]

new_text = "Great match tonight! Our team played exceptionally well and secured a decisive victory. #football #winning"
predicted_sentiment = predict_tweet(new_text, model=nb, vectorizer=vectorizer)

# Translate the 0/1 label into a readable name before printing
if predicted_sentiment == 1:
    sentiment_label = "Positive"
else:
    sentiment_label = "Negative"
print(f"Predicted Sentiment: {sentiment_label}, Score: {predicted_sentiment}")

## Logistic Regression

In [None]:
# Logistic regression on the oversampled TF-IDF features; max_iter is raised
# from the library default to 1000.
logistic_regression = LogisticRegression(max_iter=1000).fit(tfidf_train_oversample, Y_train)

logistic_prediction = logistic_regression.predict(tfidf_test)
logistic_accuracy = accuracy_score(y_true=Y_test, y_pred=logistic_prediction)
print(f"Accuracy: {logistic_accuracy}")

### Evaluate Performance

In [None]:
# Per-class metrics of the logistic regression model
print("\n\nClassification Report\n")
cr = classification_report(Y_test, logistic_prediction)
print(cr)

**Inference:**

These metrics suggest that the text classification model demonstrates great overall performance with an accuracy of 91% in classifying text data. It can effectively identify positive and negative tweets with a balanced accuracy and has a good ability to differentiate between the classes.

### Prediction using Logistic Regression

In [None]:
new_text = "Great match tonight! Our team played exceptionally well and secured a decisive victory. #football #winning"
predicted_sentiment = predict_tweet(new_text, model=logistic_regression, vectorizer=vectorizer)

# Translate the 0/1 label into a readable name before printing
if predicted_sentiment == 1:
    sentiment_label = "Positive"
else:
    sentiment_label = "Negative"
print(f"Predicted Sentiment: {sentiment_label}, Score: {predicted_sentiment}")

# Word2Vec Analysis

In [None]:
# Fraction of df4 used for training the Word2Vec / Keras models
TRAIN_SIZE = 0.8

# --- Word2Vec ---
W2V_SIZE = 300        # passed as vector_size (embedding dimension)
W2V_WINDOW = 7        # passed as window
W2V_MIN_COUNT = 10    # passed as min_count
W2V_EPOCH = 32        # training epochs

# --- Keras ---
SEQUENCE_LENGTH = 300  # every tokenised tweet is padded to this length
BATCH_SIZE = 1024
EPOCHS = 8

In [None]:
# Separate random split of df4 for the Word2Vec and Keras models; the test
# share is 1 - TRAIN_SIZE and, unlike the TF-IDF split, it is not stratified.
# NOTE: this rebinds df_test, which earlier in the notebook held the full
# label-mapped frame used for the EDA.
df_train, df_test = train_test_split(df4, test_size=1-TRAIN_SIZE, random_state=42)

### Corpus Creation

In [None]:
# One token list per training tweet (split on whitespace) - the corpus for Word2Vec.
documents = list(map(str.split, df_train.clean_text))

### Word2Vec Model Creation

In [None]:
# Untrained Word2Vec model; vocabulary building and training happen in the
# following cells.
w2v_settings = dict(
    vector_size=W2V_SIZE,
    window=W2V_WINDOW,
    min_count=W2V_MIN_COUNT,
    workers=8,
)
w2v_model = gensim.models.word2vec.Word2Vec(**w2v_settings)

### Vocabulary Creation

In [None]:
# Build the Word2Vec vocabulary from the tokenised training tweets.
w2v_model.build_vocab(documents)

In [None]:
# Size of the Word2Vec vocabulary built above.
key_to_index = w2v_model.wv.key_to_index
words = key_to_index.keys()
vocab_size = len(key_to_index)
print("Vocab size", vocab_size)

### Training Word2Vec Model

In [None]:
# Train the embeddings on the training corpus for W2V_EPOCH passes.
w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

### Word2Vec Model Testing

In [None]:
# Sanity check: nearest neighbours of "india" in the learned embedding space.
w2v_model.wv.most_similar("india")

In [None]:
# Sanity check: nearest neighbours of "bjp" in the learned embedding space.
w2v_model.wv.most_similar("bjp")

In [None]:
# Sanity check: nearest neighbours of "narendra" in the learned embedding space.
w2v_model.wv.most_similar("narendra")

# Deep Learning Models

## LSTM

In [None]:
def create_tokenizer_and_vocab(df_train, text_column):
    """Fit a Keras ``Tokenizer`` on ``df_train[text_column]``.

    Returns ``(tokenizer, vocab_size)`` where ``vocab_size`` is the number of
    entries in ``tokenizer.word_index`` plus one; the size is also printed.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(df_train[text_column])

    total_words = 1 + len(fitted_tokenizer.word_index)
    print("Total words", total_words)

    return fitted_tokenizer, total_words

def preprocess_texts(texts, tokenizer, sequence_length):
    """Turn ``texts`` into integer sequences padded to ``sequence_length``.

    ``texts`` is handed to ``tokenizer.texts_to_sequences`` and the result to
    ``pad_sequences(..., maxlen=sequence_length)``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=sequence_length)

def decode_sentiment(score):
    """Map a model score to a class label: 0 when ``score <= 0.5``, otherwise 1."""
    if score <= 0.5:
        return 0
    return 1

def predict_sentiment(text, model, tokenizer, sequence_length):
    """Score ``text`` with ``model`` and return ``(sentiment, score)``.

    Parameters
    ----------
    text : a single string, or an iterable of strings (only the first one is
        scored).
    model : object with a Keras-style ``predict`` method.
    tokenizer : fitted tokenizer passed on to ``preprocess_texts``.
    sequence_length : padding length passed on to ``preprocess_texts``.

    Returns
    -------
    (sentiment, score) : ``score`` is the first model output for the first
        text and ``sentiment`` is ``decode_sentiment(score)``.
    """
    # texts_to_sequences iterates over its argument. A bare string would be
    # walked character by character, so the score returned below would belong
    # to the first character only; wrap a single text in a list instead.
    texts = [text] if isinstance(text, str) else text
    text_padded = preprocess_texts(texts, tokenizer, sequence_length)
    score = model.predict(text_padded, verbose=0)[0][0]
    sentiment = decode_sentiment(score)
    return sentiment, score


# Fit the tokenizer on the training tweets only; vocab_size now refers to the
# tokenizer vocabulary.
tokenizer, vocab_size = create_tokenizer_and_vocab(df_train, text_column='clean_text')

### Embedding Matrix

In [None]:
# Row i holds the Word2Vec vector of the tokenizer word with index i; words
# without a Word2Vec vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
word_vectors = w2v_model.wv
for token, row in tokenizer.word_index.items():
    if token in word_vectors:
        embedding_matrix[row] = word_vectors[token]
print(embedding_matrix.shape)

In [None]:
# Embedding layer initialised with the Word2Vec matrix and frozen during training.
embedding_layer = Embedding(
    vocab_size,
    W2V_SIZE,
    weights=[embedding_matrix],
    input_length=SEQUENCE_LENGTH,
    trainable=False,
)

### LSTM Model Creation

In [None]:
# Frozen embeddings -> dropout -> single LSTM layer -> sigmoid output
model = Sequential([
    embedding_layer,
    Dropout(0.5),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# Binary classification set-up: cross-entropy loss, Adam optimiser, accuracy metric.
model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Preparing training and test data

In [None]:
# Prepare training and test data
x_train = preprocess_texts(df_train.clean_text, tokenizer, SEQUENCE_LENGTH)
x_test = preprocess_texts(df_test.clean_text, tokenizer, SEQUENCE_LENGTH)
y_train = df_train['category'].values.astype('int32').reshape(-1, 1)
y_test = df_test['category'].values.astype('int32').reshape(-1, 1)

### LSTM Model Training

In [None]:
# Both callbacks watch the validation loss with a patience of 5 epochs.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', patience=5, cooldown=0)
early_stop = EarlyStopping(monitor='val_loss', min_delta=1e-4, patience=5)
callbacks = [reduce_lr, early_stop]

In [None]:
# Train the LSTM; the last 10% of the training data is held out for validation.
lstm_fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
    callbacks=callbacks,
)
lstm_history = model.fit(x_train, y_train, **lstm_fit_options)

### Evaluating the Model (LSTM)

In [None]:
# evaluate() returns [loss, accuracy] for the compiled model
score = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE)
lstm_loss, lstm_accuracy = score[0], score[1]
print()
print("ACCURACY:", lstm_accuracy)
print("LOSS:", lstm_loss)

### Making Predictions

In [None]:
# Making predictions on test set
y_pred_1d = [decode_sentiment(score) for score in model.predict(x_test, verbose=1, batch_size=8000)]
y_test_1d = list(df_test['category'])
print(classification_report(y_test_1d, y_pred_1d))

In [None]:
# Predicting sentiment of a new text
new_text = "The movie had stunning visuals and a great soundtrack, but the plot was incredibly boring and the acting was subpar."
predicted_sentiment, score = predict_sentiment(new_text, model, tokenizer, SEQUENCE_LENGTH)


# Print the result with the sentiment label
sentiment_label = "Positive" if predicted_sentiment == 1 else "Negative"
print(f"Predicted Sentiment: {sentiment_label}, Score: {score}")

## CNNs

### Hyperparameters

In [None]:
# Hyperparameters (you can adjust these)
FILTER_SIZES = [3, 4, 5]  # Experiment with different filter sizes
NUM_FILTERS = 128  # Number of filters per convolutional layer
SEQUENCE_LENGTH = 300  # Assuming tweets are preprocessed to this length
EPOCHS = 8
BATCH_SIZE = 1024

In [None]:
# Rows = tokenizer vocabulary size, columns = embedding dimension
vocab_size = embedding_matrix.shape[0]
embedding_dim = embedding_matrix.shape[1]

### Define CNN Model

In [None]:
# Frozen Word2Vec embeddings come first
cnn_layers = [
    Embedding(vocab_size, embedding_dim, weights=[embedding_matrix],
              input_length=SEQUENCE_LENGTH, trainable=False),
]

# One Conv1D + max-pooling pair per filter size, stacked one after another
for filter_size in FILTER_SIZES:
    cnn_layers.append(Conv1D(NUM_FILTERS, kernel_size=filter_size, activation='relu'))
    cnn_layers.append(MaxPooling1D(pool_size=2))

# Classification head with a single sigmoid unit (positive vs. negative)
cnn_layers.extend([
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

cnn_model = Sequential(cnn_layers)

cnn_model.summary()

In [None]:
def lr_schedule(epoch):
    """Return the learning rate for ``epoch``: 0.001 reduced by 10% every 2 epochs.

    Epochs 0-1 use 0.001, epochs 2-3 use 0.0009, epochs 4-5 use 0.00081, etc.
    The rate is computed from the epoch number alone, so the decay accumulates;
    starting from 0.001 on every call only ever produced 0.001 or 0.0009.
    """
    initial_lr = 0.001
    return initial_lr * 0.9 ** (epoch // 2)

# Keras calls lr_schedule with the epoch number at the start of every epoch.
learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule)

### Compile and train CNN Model

In [None]:
# Same loss / optimiser / metric as the LSTM
cnn_model.compile(
    optimizer="adam",
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Train with the epoch-based learning-rate schedule; 10% of the training data
# is held out for validation.
cnn_fit_options = dict(
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_split=0.1,
    verbose=1,
    callbacks=[learning_rate_scheduler],
)
cnn_history = cnn_model.fit(x_train, y_train, **cnn_fit_options)

### Evaluate Performance(CNN)

In [None]:
# evaluate() returns [loss, accuracy] for the compiled model
cnn_eval = cnn_model.evaluate(x_test, y_test, verbose=1)
test_loss, test_accuracy = cnn_eval
print("Test Accuracy:", test_accuracy)

### Making Predictions

In [None]:
# Threshold the first (only) output column and report per-class metrics
cnn_scores = cnn_model.predict(x_test, verbose=1, batch_size=8000)
cnn_y_pred_1d = [decode_sentiment(value) for value in cnn_scores[:, 0]]
print(classification_report(y_test_1d, cnn_y_pred_1d))

In [None]:
new_text = "The movie had stunning visuals and a great soundtrack, but the plot was incredibly boring and the acting was subpar."
predicted_sentiment, score = predict_sentiment(
    new_text, model=cnn_model, tokenizer=tokenizer, sequence_length=SEQUENCE_LENGTH)

# Translate the 0/1 label into a readable name before printing
if predicted_sentiment == 1:
    sentiment_label = "Positive"
else:
    sentiment_label = "Negative"
print(f"Predicted Sentiment: {sentiment_label}, Score: {score}")