In [3]:
!pip install transformers torch scikit-learn



#a. Basic Sentiment Analysis using Logistic Regression

In [9]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
data = {
    "review": [
        "I loved this movie, it was fantastic!",
        "Worst film ever. Waste of time.",
        "Absolutely brilliant acting!",
        "Terrible script and bad direction.",
        "The plot was engaging and exciting.",
        "Horrible movie, I hated it.",
        "An excellent performance by the cast.",
        "Not worth watching, very disappointing."
    ],
    "sentiment": ["positive", "negative", "positive", "negative",
                  "positive", "negative", "positive", "negative"]
}
df = pd.DataFrame(data)

def clean_text(text):
    return text.lower()

df['cleaned_text'] = df['review'].apply(clean_text)
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

vectorizer = TfidfVectorizer(max_features=500)
X = vectorizer.fit_transform(df['cleaned_text'])
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred))

sample_review = "The movie was boring and slow."
sample_clean = clean_text(sample_review)
sample_vector = vectorizer.transform([sample_clean])
sample_pred = model.predict(sample_vector)[0]

print("\nSample Review:", sample_review)
print("Predicted Sentiment:", "Positive (1)" if sample_pred == 1 else "Negative (0)")


Accuracy: 0.00

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00       2.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0


Sample Review: The movie was boring and slow.
Predicted Sentiment: Positive (1)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# b. Twitter Sentiment Analysis using LSTM and GloVe Embeddings

In [None]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.initializers import Constant
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import os
df = pd.read_csv('/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv', 
                 encoding='latin1', header=None,
                 names=['sentiment', 'id', 'date', 'flag', 'user', 'text'])
df.head()
df = df.sample(100000, random_state=42)

# Clean tweets
def clean_tweet(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Mentions/hashtags
    text = re.sub(r'[^\w\s]', '', text)  # Punctuation
    return text

df['cleaned_text'] = df['text'].apply(clean_tweet)
df['sentiment'] = df['sentiment'].replace({0:0, 2:1})
# Check unique sentiment values
print("Unique sentiment values:", df['sentiment'].unique())
# Tokenization
max_len = 50
max_words = 20000
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
X = pad_sequences(sequences, maxlen=max_len)
y = to_categorical(df['sentiment'])
# Verify number of classes
num_classes = y.shape[1]
print(f"Number of classes: {num_classes}")
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Load GloVe embeddings (pre-uploaded to Kaggle)
embeddings_index = {}
glove_path = '/kaggle/input/glove6b100dtxt/glove.6B.100d.txt'

with open(glove_path) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs
# Prepare embedding matrix
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
# Build LSTM model - NOW WITH 2 OUTPUT CLASSES
model = Sequential([
    Embedding(max_words, embedding_dim, 
              embeddings_initializer=Constant(embedding_matrix),
              input_length=max_len, trainable=False),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(num_classes, activation='softmax')  # Dynamic based on num_classes
])

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])
# Train model
history = model.fit(X_train, y_train,
                    batch_size=128,
                    epochs=5,
                    validation_split=0.1)
# Evaluate with correct target names
y_pred = np.argmax(model.predict(X_test), axis=1)
y_true = np.argmax(y_test, axis=1)

# Use labels parameter matching your actual classes
print(classification_report(y_true, y_pred, 
                           target_names=['Negative', 'Positive'],  # Only 2 classes
                           labels=[0, 1]))  # Explicitly specify your label values
def predict_sentiment(text):
    text = clean_tweet(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    return 'Positive' if np.argmax(pred) == 1 else 'Negative'# Test
test_tweets = [
    "I love this product!",
    "This is okay I guess",
    "Terrible experience, never buying again"
]

for tweet in test_tweets:
    print(f"Tweet: {tweet}")
    print(f"Sentiment: {predict_sentiment(tweet)}\n")

#c. Movie Reviews Sentiment Classification with BERT

In [10]:
import pandas as pd, torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

data = {
    "review": [
        "I loved this movie, it was fantastic!",
        "The film was terrible and boring.",
        "What a great experience, highly recommend.",
        "Worst acting I have ever seen.",
        "An absolute masterpiece, brilliant!",
        "I did not enjoy the film at all."
    ],
    "sentiment": ["positive","negative","positive","negative","positive","negative"]
}
df = pd.DataFrame(data)
df["sentiment"] = df["sentiment"].map({"negative":0, "positive":1})

train_texts, val_texts, y_train, y_val = train_test_split(
    df["review"], df["sentiment"], test_size=0.2, random_state=42
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

tok = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2).to(device)

def encode(texts, labels):
    enc = tok(list(texts), truncation=True, padding="max_length", max_length=64, return_tensors="pt")
    return TensorDataset(enc["input_ids"], enc["attention_mask"], torch.tensor(labels.values))

train_loader = DataLoader(encode(train_texts, y_train), batch_size=2, shuffle=True)
val_loader   = DataLoader(encode(val_texts, y_val), batch_size=2)

opt = AdamW(model.parameters(), lr=2e-5)
for epoch in range(2):
    model.train()
    for b in train_loader:
        b = [t.to(device) for t in b]; opt.zero_grad()
        loss = model(input_ids=b[0], attention_mask=b[1], labels=b[2]).loss
        loss.backward(); opt.step()
    print(f"Epoch {epoch+1} done")

model.eval(); preds, true = [], []
for b in val_loader:
    b = [t.to(device) for t in b]
    with torch.no_grad(): out = model(b[0], attention_mask=b[1])
    preds += out.logits.argmax(1).cpu().tolist(); true += b[2].cpu().tolist()

print(classification_report(true, preds, target_names=["Neg","Pos"]))

def predict(txt):
    enc = tok(txt, return_tensors="pt", truncation=True, padding="max_length", max_length=64).to(device)
    return "Positive" if model(**enc).logits.argmax().item() else "Negative"

print(predict("The movie was boring and uninteresting."))


Using device: cpu


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 done
Epoch 2 done


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


              precision    recall  f1-score   support

         Neg       0.50      1.00      0.67         1
         Pos       0.00      0.00      0.00         1

    accuracy                           0.50         2
   macro avg       0.25      0.50      0.33         2
weighted avg       0.25      0.50      0.33         2

Negative
