NAIVE BAYES ALGORITHM FOR SENTIMENT ANALYSIS


In [1]:
pip install datasets

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.2 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from datasets import load_dataset

In [3]:
dataset = load_dataset("carblacac/twitter-sentiment-analysis")

Downloading builder script:   0%|          | 0.00/4.38k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.44k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/5.38M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/149985 [00:00<?, ? examples/s]

Map:   0%|          | 0/61998 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/120 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/30 [00:00<?, ?ba/s]

Creating json from Arrow format:   0%|          | 0/62 [00:00<?, ?ba/s]

Generating train split:   0%|          | 0/119988 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/29997 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/61998 [00:00<?, ? examples/s]

In [4]:
df = pd.DataFrame(dataset['train'])

# Preprocessing steps (basic example)
df['text'] = df['text'].str.replace(r'http\S+', '', regex=True).str.replace(r'@\S+', '', regex=True).str.replace(r'#', '').str.replace(r'\s+', ' ', regex=True).str.strip()

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['feeling'], test_size=0.2, random_state=42)


In [5]:
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [6]:
nb_model = MultinomialNB()
nb_model.fit(X_train_vec, y_train)



In [7]:
# Predictions and Evaluation
y_pred = nb_model.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.75      0.80      0.77     11954
           1       0.79      0.73      0.76     12044

    accuracy                           0.76     23998
   macro avg       0.77      0.76      0.76     23998
weighted avg       0.77      0.76      0.76     23998



In [15]:
from sklearn.metrics import accuracy_score


In [17]:
# Assuming y_pred contains probabilities and not discrete class labels
# Convert probabilities to binary labels
y_pred_classes = (y_pred > 0.5).astype(int)

# Then calculate accuracy
accuracy = accuracy_score(y_test, y_pred_classes)
print(f'Accuracy: {accuracy}')


Accuracy: 0.7906908909075756


BI-LSTM MODEL FOR SENTIMENT ANALYSIS

In [8]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense

In [9]:
# Tokenization
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Padding
max_length = max([len(x) for x in X_train_seq])
X_train_pad = pad_sequences(X_train_seq, maxlen=max_length)
X_test_pad = pad_sequences(X_test_seq, maxlen=max_length)


In [10]:
# Model building
model = Sequential()
model.add(Embedding(5000, 128, input_length=max_length))
model.add(Bidirectional(LSTM(64)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model summary
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 38, 128)           640000    
                                                                 
 bidirectional (Bidirection  (None, 128)               98816     
 al)                                                             
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 738945 (2.82 MB)
Trainable params: 738945 (2.82 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
y_train_np = y_train.to_numpy().astype('float32')
y_test_np = y_test.to_numpy().astype('float32')

# Train the model
model.fit(X_train_pad, y_train_np, batch_size=32, epochs=5, validation_data=(X_test_pad, y_test_np))

# Evaluate the model
loss, accuracy = model.evaluate(X_test_pad, y_test_np)
print(f'Loss: {loss}, Accuracy: {accuracy}')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Loss: 0.48831096291542053, Accuracy: 0.7906908988952637


In [14]:
y_pred = model.predict(X_test_pad)
# Assuming binary classification (0 or 1)
y_pred_classes = (y_pred > 0.5).astype("int32")

from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

precision = precision_score(y_test_np, y_pred_classes)
recall = recall_score(y_test_np, y_pred_classes)
f1 = f1_score(y_test_np, y_pred_classes)

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1 Score: {f1}')


Precision: 0.7894780242434237
Recall: 0.7949186316838259
F1 Score: 0.7921889868023664
