### **Sentiment Analysis – Deep Learning (Full dataset)**

This notebook trains and compares **LSTM/BiLSTM** and **DistilBERT** models on a large sentiment dataset (1.6M tweets), from which a random sample of 200,000 tweets is drawn for the experiments. The data is downloaded directly from a remote URL.

**Data loader**

In [None]:
%matplotlib inline

import pandas as pd
import requests, zipfile, io

url = "https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip"

# Download the zipped CSV into memory. raise_for_status() makes an HTTP error
# fail here, instead of surfacing later as a confusing "not a zip file" error.
r = requests.get(url)
r.raise_for_status()

# Dataset columns. They are supplied explicitly together with header=None:
# with pandas' default header inference the first data row would be consumed
# as column labels and silently dropped from the dataset.
# NOTE(review): this assumes the hosted CSV has no header row of its own — confirm.
column_names = ["target", "ids", "date", "flag", "user", "text"]

# Context managers close the archive and the member stream once parsing is done.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    with z.open("training.1600000.processed.noemoticon.csv") as csv_file:
        df = pd.read_csv(csv_file, encoding="latin-1", header=None, names=column_names)

# Work on a reproducible 200,000-row random sample and keep only the label and the tweet text.
df = df.sample(n=200000, random_state=42)
df = df[["target", "text"]].copy()

# Convert target from {0 = negative, 4 = positive} to {0, 1}
df["target"] = df["target"].replace({0: 0, 4: 1})

print(df.head())
print(df.shape)


**Text cleaning (preprocessing)**

In [None]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

# English stop words from the NLTK corpus, stored as a set for membership tests.
stop_words = set(stopwords.words("english"))
def clean_text(text):
    """Normalize one tweet for modelling.

    The text is lower-cased, then URLs starting with "http", @-mentions and
    every character that is not a lowercase ASCII letter or whitespace are
    removed (so digits, punctuation and apostrophes disappear). The remainder
    is split on whitespace, tokens found in ``stop_words`` are dropped, and the
    surviving tokens are re-joined with single spaces.

    Args:
        text: The raw tweet as a string.

    Returns:
        The cleaned tweet as a single space-separated string (possibly empty).
    """
    cleaned = text.lower()
    # Order matters: URLs and mentions must go before the catch-all character
    # filter, otherwise their letters would survive as ordinary tokens.
    for pattern in (r"http\S+", r"@\w+", r"[^a-z\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return " ".join(word for word in cleaned.split() if word not in stop_words)

# Clean every tweet and store the result next to the raw text.
cleaned_tweets = df["text"].apply(clean_text)
df["clean_text"] = cleaned_tweets
print(df["clean_text"].head())

**LSTM tokenization and model build**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Data processing
# max_words is passed to Tokenizer(num_words=...) and reused as the embedding
# input_dim; max_len is the length every sequence is padded to.
max_words, max_len = 20000, 100

# Binary labels as a NumPy array
y = df["target"].values

# Fit the word index on the cleaned tweets, encode them as integer sequences
# and pad them to a common length.
clean_corpus = df["clean_text"]
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(clean_corpus)
sequences = tokenizer.texts_to_sequences(clean_corpus)
X = pad_sequences(sequences, maxlen=max_len)

# LSTM model: embedding -> single LSTM layer -> small dense head with a
# sigmoid output for binary classification.
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128),
    LSTM(256, dropout=0.1, recurrent_dropout=0.1),
    Dense(64, activation='relu'),
    Dropout(0.1),
    Dense(1, activation='sigmoid')
])


# Build explicitly so the summary below can report layer shapes and parameter counts.
model.build(input_shape=(None, max_len))

optimizer = Adam(learning_rate=1e-3)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

# summary() prints the table itself and returns None, so wrapping it in
# print() would emit a stray "None" after the table.
model.summary()

**Model training**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the padded sequences as a test set (fixed seed for repeatability).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train for 5 epochs, with a further 20% validation split taken from the
# training data by Keras. The History object is kept under both names.
history_lstm = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=512,
    validation_split=0.2,
)
history = history_lstm

# Final loss/accuracy on the held-out test set.
test_metrics = model.evaluate(X_test, y_test)
loss, acc = test_metrics
print(f"Test Accuracy: {acc:.4f}")

**Confusion matrix**

In [None]:
# Prediction (Keras LSTM): sigmoid outputs for the held-out test set
predicted_probabilities = model.predict(X_test)

# Threshold the probabilities at 0.5 and flatten to a 1-D array of 0/1 classes
y_pred = (predicted_probabilities > 0.5).astype(int).flatten()

# Create confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

cm = confusion_matrix(y_test, y_pred)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix (LSTM)")
plt.show()


## **DistilBERT**

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import pandas as pd
import torch

# Data processing: cleaned tweets and their labels as plain Python lists
labels = df["target"].tolist()
texts = df["clean_text"].tolist()

# 80/20 train/validation split with a fixed seed
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
train_texts, val_texts, train_labels, val_labels = split_parts

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Both splits are encoded with the same truncation/padding settings.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)

# Preparing the Dataset in PyTorch format
class NewsDataset(torch.utils.data.Dataset):
    """Map-style PyTorch dataset pairing tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to an indexable collection holding
            one entry per example (anything exposing ``.items()``).
        labels: Indexable, sized collection with one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including a "labels" entry."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Wrap each split's encodings and labels in a PyTorch dataset.
train_dataset = NewsDataset(encodings=train_encodings, labels=train_labels)
val_dataset = NewsDataset(encodings=val_encodings, labels=val_labels)

# DistilBERT model with a two-class sequence-classification head
bert_checkpoint = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(bert_checkpoint, num_labels=2)

print(model)

**Model training**

In [None]:
# install evaluate if not already
!pip install evaluate -q

from transformers import Trainer, TrainingArguments
import transformers
import numpy as np
import evaluate
import traceback

# Record the installed transformers version in the output for reproducibility.
print(f"transformers version: {transformers.__version__}")

# Keyword arguments for TrainingArguments, kept in a plain dict so that
# they can be adjusted before the arguments object is constructed.
training_kwargs = {
    "output_dir": "./results",
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "learning_rate": 5e-5,
    "per_device_train_batch_size": 64,
    "per_device_eval_batch_size": 64,
    "num_train_epochs": 5,
    "weight_decay": 0.01,
    "logging_dir": "./logs",
    "logging_steps": 200,
    "report_to": "none",
}

# Flexible: Tries to create TrainingArguments and does fallback between eval_strategy <-> evaluation_strategy
def make_training_args(kwargs):
    """Build ``TrainingArguments`` from ``kwargs``, tolerating the eval-strategy rename.

    The arguments are first passed through unchanged. If that raises
    ``TypeError`` and exactly one of ``eval_strategy`` / ``evaluation_strategy``
    is present, the call is retried once with that key renamed to the other
    spelling. The caller's dict is never modified.

    Args:
        kwargs: Dict of keyword arguments for ``TrainingArguments``.

    Returns:
        The constructed ``TrainingArguments`` instance.

    Raises:
        TypeError: Re-raised when both or neither spelling is present (the
            rename cannot be the cause), or raised by the retry itself.
    """
    from transformers import TrainingArguments

    short_key, long_key = "eval_strategy", "evaluation_strategy"
    try:
        return TrainingArguments(**kwargs)
    except TypeError:
        has_short = short_key in kwargs
        has_long = long_key in kwargs
        if has_short == has_long:
            # Both spellings or neither: swapping names would not help.
            raise
        source, target = (short_key, long_key) if has_short else (long_key, short_key)
        # Copy everything except the rejected key, then append it under the other name.
        retry_kwargs = {key: value for key, value in kwargs.items() if key != source}
        retry_kwargs[target] = kwargs[source]
        return TrainingArguments(**retry_kwargs)

# Build the TrainingArguments; on failure, print a short banner and the
# traceback before letting the exception propagate.
try:
    training_args = make_training_args(training_kwargs)
except Exception:
    print("Failed to create TrainingArguments:")
    traceback.print_exc()
    raise

# Load the accuracy metric from the evaluate library
accuracy_metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A two-element (logits, labels) pair.

    Returns:
        The result of ``accuracy_metric.compute`` for the arg-max class of
        each row of logits against the reference labels.
    """
    raw_scores, references = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return accuracy_metric.compute(predictions=predicted_classes, references=references)

# Create the Trainer from the model, the training arguments, both datasets and the metric function
trainer = Trainer(
    model=model, args=training_args,
    train_dataset=train_dataset, eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate on the eval dataset and report accuracy; if the expected key is
# missing, dump what was returned so the problem can be diagnosed.
eval_results = trainer.evaluate()
if 'eval_accuracy' not in eval_results:
    print("eval_results keys:", eval_results.keys())
    print("eval_results:", eval_results)
else:
    print(f"Test Accuracy: {eval_results['eval_accuracy']:.4f}")


**Confusion matrix**

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Run the fine-tuned model over val_dataset
predictions = trainer.predict(val_dataset)

# Reference labels, and the predicted class (arg-max over the model outputs)
y_true = predictions.label_ids
y_pred = np.argmax(predictions.predictions, axis=1)

# Create confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix (DistilBERT)")
plt.show()

## **Performance Comparison**

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# LSTM accuracy on its 20% hold-out: `acc` is the value returned by
# model.evaluate(X_test, y_test) right after LSTM training. Taking
# max(history_lstm.history['val_accuracy']) instead would report the best
# epoch on an inner validation split, which is an optimistically selected
# number measured on different tweets than the DistilBERT score below.
# Both train_test_split calls use test_size=0.2 and random_state=42 on inputs
# derived from df in the same order, so the two hold-out sets should contain
# the same tweets.
lstm_acc = acc

# DistilBERT accuracy of the final fine-tuned model on its 20% hold-out
eval_results = trainer.evaluate(val_dataset)
bert_acc = eval_results['eval_accuracy']

print(f"LSTM Held-out Accuracy: {lstm_acc:.4f}")
print(f"DistilBERT Held-out Accuracy: {bert_acc:.4f}")

# Side-by-side bar chart of the two hold-out accuracies
plt.bar(['LSTM', 'DistilBERT'], [lstm_acc, bert_acc], color=['#66FFFF', '#99FF99'])
plt.title("Held-out Accuracy Comparison")
plt.ylabel("Accuracy")
plt.show()