# Sentiment Analysis – Deep Learning (Full dataset)

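This notebook trains and compares an **LSTM** baseline and a fine-tuned **DistilBERT** classifier on the large sentiment dataset (1.6M tweets). Note that the recurrent model built below is a single unidirectional LSTM; wrap the layer in `Bidirectional` if you want a BiLSTM. The data is downloaded directly from the remote URL, so no local copy is needed.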

In [None]:
# Install required libraries (run this first in Colab or local environment)
!pip install --quiet transformers datasets nltk tensorflow scikit-learn matplotlib pandas

# Note: Installing may take a few minutes.

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from transformers import DistilBertTokenizerFast, TFDistilBertForSequenceClassification
from datasets import Dataset


In [None]:
import requests, zipfile, io

# Download the zipped CSV, read its first member into a DataFrame and keep
# only the tweet text and its sentiment label.
url = 'https://nyc3.digitaloceanspaces.com/ml-files-distro/v1/sentiment-analysis-is-bad/data/training.1600000.processed.noemoticon.csv.zip'
print('Downloading dataset (this may take a few minutes)...')
# The archive is fully buffered in memory via r.content, so streaming is not
# needed. The timeout stops the cell from hanging forever on a stalled server.
r = requests.get(url, timeout=60)
# Fail here with a clear HTTP error instead of a confusing BadZipFile below.
r.raise_for_status()
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    csv_filename = z.namelist()[0]
    with z.open(csv_filename) as csv_file:
        df = pd.read_csv(csv_file, encoding='latin-1', header=None)
df.columns = ['sentiment','id','date','query','user','text']
df = df[['text','sentiment']]
# Map sentiment: 0 -> negative, 2 -> neutral, 4 -> positive
df['sentiment'] = df['sentiment'].map({0:0, 2:1, 4:2})
# .map() turns any label outside {0, 2, 4} into NaN; stop early rather than
# feeding NaN targets to the models.
if df['sentiment'].isna().any():
    raise ValueError('Unexpected sentiment label(s) found; expected only 0, 2 or 4.')
print('Loaded dataframe shape:', df.shape)
df.head()


In [None]:
def clean_text(text):
    """Normalise one tweet into a lower-case, letters-only string.

    The input is coerced with ``str`` and lower-cased. Every run that starts
    with ``http`` and continues up to the next whitespace is removed, then
    every character that is not ``a-z`` or whitespace is dropped. Finally the
    words present in the module-level ``stop_words`` set are filtered out and
    the remaining words are joined with single spaces.
    """
    lowered = str(text).lower()
    without_urls = re.sub(r'http\S+','', lowered)
    letters_only = re.sub(r'[^a-z\s]','', without_urls)
    kept_words = filter(lambda word: word not in stop_words, letters_only.split())
    return ' '.join(kept_words)

# Run clean_text over every tweet and store the result in a new column;
# the last expression previews the cleaned text next to its label.
print('Cleaning texts (this may take several minutes for 1.6M rows)...')
df['clean_text'] = df['text'].map(clean_text)
df.loc[:, ['clean_text','sentiment']].head()


In [None]:
# LSTM tokenization and model build
vocab_size = 20000   # passed as num_words to the tokenizer and as the embedding input_dim
max_len = 50         # every sequence is padded/truncated to this length

y = df['sentiment'].values

# Pick the train/validation rows *before* fitting the tokenizer so that the
# vocabulary is learned from training tweets only (no validation leakage).
# Splitting row positions with the same test_size/random_state should select
# the same rows as splitting X and y directly.
train_idx, val_idx = train_test_split(np.arange(len(df)), test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(df['clean_text'].iloc[train_idx])
sequences = tokenizer.texts_to_sequences(df['clean_text'])
X = pad_sequences(sequences, maxlen=max_len)

print('X shape, y shape:', X.shape, y.shape)

X_train, X_val = X[train_idx], X[val_idx]
y_train, y_val = y[train_idx], y[val_idx]

# Embedding -> LSTM -> 3-way softmax. The deprecated `input_length` argument
# is omitted; the input shape is fixed by the explicit build() call below.
model_lstm = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128),
    LSTM(128),
    Dense(3, activation='softmax')
])
model_lstm.build(input_shape=(None, max_len))
# Labels are integer class ids, hence the sparse categorical loss.
model_lstm.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model_lstm.summary()


In [None]:
# Fit the LSTM on the training split and evaluate on the validation split
# after each epoch (tune epochs/batch size to the hardware available).
lstm_epochs = 3
lstm_batch = 256
history_lstm = model_lstm.fit(
    X_train,
    y_train,
    batch_size=lstm_batch,
    epochs=lstm_epochs,
    validation_data=(X_val, y_val),
)


In [None]:
# Prepare dataset for DistilBERT using Hugging Face datasets (efficient format)
ds = Dataset.from_pandas(df[['clean_text','sentiment']].rename(columns={'sentiment':'label'}))
print('Dataset length =', len(ds))
# Seed the shuffle so the train/validation split is the same on every run;
# without it the reported validation accuracy is not reproducible.
ds = ds.train_test_split(test_size=0.2, seed=42)

tokenizer_bert = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
def tokenize_fn(batch):
    """Tokenize a batch of cleaned tweets, padding/truncating each to 128 tokens."""
    return tokenizer_bert(batch['clean_text'], padding='max_length', truncation=True, max_length=128)

print('Tokenizing dataset (this will take time)...')
# The raw text column is dropped once it has been converted to token ids.
tokenized = ds.map(tokenize_fn, batched=True, remove_columns=['clean_text'])
tokenized = tokenized.rename_column('label','labels')
tokenized.set_format(type='tensorflow', columns=['input_ids','attention_mask','labels'])

# Only the training stream is shuffled; validation keeps a fixed order.
tf_train = tokenized['train'].to_tf_dataset(columns=['input_ids','attention_mask'], label_cols='labels', shuffle=True, batch_size=16)
tf_val   = tokenized['test'].to_tf_dataset(columns=['input_ids','attention_mask'], label_cols='labels', shuffle=False, batch_size=16)
print('tf_train, tf_val created')


In [None]:
# Load pretrained DistilBERT with a 3-class classification head and compile it.
# from_logits=True is set on the loss, so the loss is told to expect raw logits.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model_bert = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)
model_bert.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=['accuracy'],
)
model_bert.summary()


In [None]:
# Fine-tune DistilBERT. This step is compute-heavy, so a GPU/TPU runtime
# (e.g. in Colab) is strongly recommended.
bert_epochs = 1
history_bert = model_bert.fit(
    tf_train,
    epochs=bert_epochs,
    validation_data=tf_val,
)


In [None]:
# Report the last-epoch validation accuracy of each model and plot them
# side by side. A missing metric is printed as None and drawn as a 0 bar.
lstm_val_acc = history_lstm.history.get('val_accuracy', [None])[-1]
bert_val_acc = history_bert.history.get('val_accuracy', [None])[-1]
print('LSTM validation accuracy:', lstm_val_acc)
print('BERT validation accuracy:', bert_val_acc)

model_names = ['LSTM','DistilBERT']
bar_heights = [lstm_val_acc or 0, bert_val_acc or 0]

fig, ax = plt.subplots(figsize=(6,4))
ax.bar(model_names, bar_heights)
ax.set_ylim(0,1)
ax.set_title('Validation Accuracy Comparison')
ax.set_ylabel('Accuracy')
plt.show()


## Conclusion
- The LSTM (or a BiLSTM variant) provides a good baseline and is usually cheaper to train than DistilBERT, although actual training time depends on the hardware available.
- DistilBERT typically achieves higher accuracy and better generalization but requires significantly more memory and compute.
- For full 1.6M training, use a powerful GPU/TPU runtime (Colab Pro/TPU or dedicated cloud instances).

### Notes
- If you run into memory issues, reduce batch sizes or train on a subset when experimenting locally.
- You can upload this notebook directly to your GitHub repository and open it in Colab via an `Open in Colab` badge or link.
