In [1]:
import pandas as pd

# Load the dataset
file_path = "all-data.csv"  # Update if the file is in a different location
data = pd.read_csv(file_path, encoding="latin1", header=None)

# Assign meaningful column names
data.columns = ["Sentiment", "Text"]

# Check the first few rows
print(data.head())


  Sentiment                                               Text
0   neutral  According to Gran , the company has no plans t...
1   neutral  Technopolis plans to develop in stages an area...
2  negative  The international electronic industry company ...
3  positive  With the new production plant the company woul...
4  positive  According to the company 's updated strategy f...


In [2]:
# Drop duplicates and null values
data = data.drop_duplicates().dropna()

# Check for any remaining issues
print(data.info())

# Display a summary of sentiment distribution
print(data["Sentiment"].value_counts())


<class 'pandas.core.frame.DataFrame'>
Index: 4840 entries, 0 to 4845
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Sentiment  4840 non-null   object
 1   Text       4840 non-null   object
dtypes: object(2)
memory usage: 113.4+ KB
None
Sentiment
neutral     2873
positive    1363
negative     604
Name: count, dtype: int64


In [4]:
import spacy

# Load spaCy's English language model
nlp = spacy.load("en_core_web_sm")

# Preprocessing function with spaCy
def preprocess_text_spacy(text):
    # Process the text with spaCy
    doc = nlp(text)
    # Remove stopwords, punctuation, and convert to lowercase
    tokens = [token.text.lower() for token in doc if not token.is_stop and not token.is_punct]
    # Join tokens back into a single string
    return " ".join(tokens)

# Apply the preprocessing function
data["Processed_Text"] = data["Text"].apply(preprocess_text_spacy)

# Display the first few rows of processed data
print(data.head())


  Sentiment                                               Text  \
0   neutral  According to Gran , the company has no plans t...   
1   neutral  Technopolis plans to develop in stages an area...   
2  negative  The international electronic industry company ...   
3  positive  With the new production plant the company woul...   
4  positive  According to the company 's updated strategy f...   

                                      Processed_Text  
0  according gran company plans production russia...  
1  technopolis plans develop stages area 100,000 ...  
2  international electronic industry company elco...  
3  new production plant company increase capacity...  
4  according company updated strategy years 2009 ...  


In [10]:
label_mapping = {"positive": 2, "neutral": 1, "negative": 0}
data["Sentiment_Label"] = data["Sentiment"].map(label_mapping)


In [11]:
from sklearn.model_selection import train_test_split

X = data["Processed_Text"]
y = data["Sentiment_Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=["negative", "neutral", "positive"]))




Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.38      0.51       118
     neutral       0.73      0.95      0.83       563
    positive       0.76      0.46      0.57       287

    accuracy                           0.74       968
   macro avg       0.75      0.60      0.63       968
weighted avg       0.74      0.74      0.71       968



In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Step 1: Tokenise and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

max_len = 100  # Maximum sequence length
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding="post")
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding="post")

# Step 2: Build BiLSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_len))
model.add(Bidirectional(LSTM(64, return_sequences=False)))
model.add(Dense(3, activation="softmax"))  # 3 classes: Positive, Neutral, Negative

# Compile model
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])

# Step 3: Train the model
model.fit(X_train_pad, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Step 4: Evaluate model
y_pred_dl = model.predict(X_test_pad).argmax(axis=1)

# Step 5: Classification report with correct class names
# Define label mapping
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

# Invert the label mapping to get class names
target_names = {v: k for k, v in label_mapping.items()}

# Print classification report
print("BiLSTM Performance:")
print(classification_report(y_test, y_pred_dl, target_names=[target_names[i] for i in sorted(target_names)]))


Epoch 1/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 107ms/step - accuracy: 0.6042 - loss: 0.9205 - val_accuracy: 0.6606 - val_loss: 0.7821
Epoch 2/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 134ms/step - accuracy: 0.7478 - loss: 0.5989 - val_accuracy: 0.7316 - val_loss: 0.6840
Epoch 3/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 170ms/step - accuracy: 0.9136 - loss: 0.2683 - val_accuracy: 0.7071 - val_loss: 0.8079
Epoch 4/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 116ms/step - accuracy: 0.9587 - loss: 0.1349 - val_accuracy: 0.7123 - val_loss: 0.9720
Epoch 5/5
[1m97/97[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 107ms/step - accuracy: 0.9805 - loss: 0.0705 - val_accuracy: 0.6968 - val_loss: 1.1270
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 33ms/step
BiLSTM Performance:
              precision    recall  f1-score   support

    negative       0.52      0.51      0.

In [25]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import create_optimizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import tensorflow as tf
import numpy as np
import pandas as pd


In [26]:
# Step 1: Load Dataset
file_path = "all-data.csv"  # Update this with the correct path to your dataset
data = pd.read_csv(file_path, encoding="latin1", header=None)
data.columns = ["Sentiment", "Text"]


In [27]:
# Map sentiment labels to numerical values
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}
data["Sentiment_Label"] = data["Sentiment"].map(label_mapping)

# Step 2: Split Data
X = data["Text"]
y = data["Sentiment_Label"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [28]:
# Step 3: Tokenise Data for BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_data(texts, tokenizer, max_length=128):
    return tokenizer(
        list(texts),
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_tensors="tf",
    )

X_train_encoded = tokenize_data(X_train, tokenizer)
X_test_encoded = tokenize_data(X_test, tokenizer)

# Step 4: Load Pretrained BERT Model
bert_model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Define a TensorFlow-compatible AdamW optimiser
num_train_steps = len(X_train_encoded["input_ids"]) // 16 * 3  # Batch size = 16, epochs = 3
optimizer, lr_schedule = create_optimizer(
    init_lr=5e-5, num_train_steps=num_train_steps, num_warmup_steps=0, weight_decay_rate=0.01
)


In [30]:
# Compile the BERT model
bert_model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])


# Compile the model
bert_model.compile(optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"])


In [31]:
# Step 6: Train the Model
bert_model.fit(
    {"input_ids": X_train_encoded["input_ids"], "attention_mask": X_train_encoded["attention_mask"]},
    y_train,
    validation_split=0.2,
    epochs=3,
    batch_size=16,
)

Epoch 1/3


Epoch 2/3
Epoch 3/3


<tf_keras.src.callbacks.History at 0x274bd0aacc0>

In [32]:
# Step 7: Evaluate the Model
y_pred = np.argmax(
    bert_model.predict({"input_ids": X_test_encoded["input_ids"], "attention_mask": X_test_encoded["attention_mask"]}).logits,
    axis=1,
)

# Step 8: Classification Report
print("BERT Sentiment Analysis Performance:")
print(classification_report(y_test, y_pred, target_names=["negative", "neutral", "positive"]))

BERT Sentiment Analysis Performance:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00       110
     neutral       0.50      0.51      0.51       571
    positive       0.23      0.30      0.26       289

    accuracy                           0.39       970
   macro avg       0.24      0.27      0.26       970
weighted avg       0.36      0.39      0.38       970



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
