A) Basic Sentiment Analysis using Logistic Regression

In [3]:
#a
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
import re
import string

# Step 1: Load the dataset
# The labelled movie-review CSV is read from the Kaggle input directory;
# point `reviews_csv` somewhere else if the file lives in another location.
reviews_csv = '/kaggle/input/movie-review/labelled_full_dataset.csv'
df = pd.read_csv(reviews_csv)

# Show the first rows to confirm which columns the frame provides
preview = df.head()
print(preview)

# Step 2: Data Preprocessing
# Clean text function
def clean_text(text):
    """Normalise a raw review for bag-of-words vectorisation.

    Steps, in order: strip ASCII punctuation, lower-case, remove runs of
    digits. Whitespace is left untouched, so removed characters can leave
    consecutive or trailing spaces behind.

    Args:
        text: The raw review. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty review; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing cell without needing pandas/numpy inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Remove digits
    text = re.sub(r'\d+', '', text)
    return text

# Apply cleaning to the 'review' column
df['cleaned_text'] = df['review'].apply(clean_text)
y = df['label']  # Use the 'label' column for sentiment labels

# Step 3: Split the cleaned text into training and testing sets
# The split happens BEFORE vectorisation so that the TF-IDF vocabulary and IDF
# weights are learned from the training reviews only. Fitting the vectorizer
# on the whole corpus would let test-set statistics leak into the features and
# make the reported accuracy optimistic.
text_train, text_test, y_train, y_test = train_test_split(
    df['cleaned_text'], y, test_size=0.2, random_state=42
)

# Step 4: Convert text data into numerical vectors
vectorizer = TfidfVectorizer(max_features=5000)  # Use TF-IDF for vectorization
X_train = vectorizer.fit_transform(text_train)   # learn vocabulary + IDF on train only
X_test = vectorizer.transform(text_test)         # reuse the fitted vocabulary

# Step 5: Build and train the logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Step 6: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print(classification_report(y_test, y_pred))

# Sample Input (Review Text)
# The sample goes through the same cleaning and the already-fitted vectorizer
# as the training data.
sample_review = "This movie was amazing! I loved every minute of it."
sample_cleaned = clean_text(sample_review)
sample_vectorized = vectorizer.transform([sample_cleaned])

# Predict sentiment for the sample input
sample_prediction = model.predict(sample_vectorized)
print(f'Sample Review Sentiment: {"Positive" if sample_prediction[0] == 1 else "Negative"}')


   label                                             review
0      0  Once again Mr. Costner has dragged out a movie...
1      0  This is an example of why the majority of acti...
2      0  First of all I hate those moronic rappers, who...
3      0  Not even the Beatles could write songs everyon...
4      0  Brass pictures (movies is not a fitting word f...
Accuracy: 0.89
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      5022
           1       0.88      0.90      0.89      4978

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

Sample Review Sentiment: Positive


B) Twitter Sentiment Analysis Using LSTM and GloVe

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# Step 1: Load and preprocess the dataset
# Read the tweet CSV; lines pandas cannot parse are skipped instead of raising
tweets_csv = '/kaggle/input/twitter-sentiment/Sentiment Analysis Dataset 2.csv'
df = pd.read_csv(tweets_csv, on_bad_lines='skip')

# Show the first rows to confirm which columns the frame provides
preview = df.head()
print(preview)

# Clean text function
def clean_text(text):
    """Normalise a raw tweet before tokenisation.

    Steps, in order: drop URLs, drop @mentions, strip ASCII punctuation,
    lower-case. Hashtag words are kept; only the leading '#' disappears,
    because '#' is part of ``string.punctuation``. Whitespace is left
    untouched, so removed tokens can leave consecutive spaces behind.

    Args:
        text: The raw tweet. ``None`` and float NaN (how pandas represents a
            missing cell) are treated as an empty tweet; any other non-string
            value is converted with ``str()`` before cleaning.

    Returns:
        str: The cleaned text.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing cell without needing pandas/numpy inside the helper.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # Remove URLs ('http\S+' already matches 'https...' links)
    text = re.sub(r'http\S+|www\S+', '', text)
    # Remove mentions; must run before the punctuation pass strips the '@'
    text = re.sub(r'@\w+', '', text)
    # Remove punctuation (this also strips the '#' of hashtags)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    return text

# Apply cleaning to the 'SentimentText' column
# fillna('') keeps a missing tweet from reaching the str-based cleaner as NaN.
df['cleaned_text'] = df['SentimentText'].fillna('').apply(clean_text)

# Convert sentiment labels to contiguous class ids 0 .. num_classes-1.
# factorize(sort=True) works for any labelling scheme (0/1 as well as 0/2/4)
# and, unlike overwriting df['Sentiment'] with a remap, leaves the frame
# untouched so re-running this cell cannot map the labels a second time.
label_ids, label_values = pd.factorize(df['Sentiment'], sort=True)
if (label_ids < 0).any():
    # factorize marks missing values with -1
    raise ValueError("The 'Sentiment' column contains missing labels")
num_classes = len(label_values)

# Step 2: Split data into training, validation, and testing sets (80/10/10)
# Splitting the raw text first means the tokenizer vocabulary below is built
# from training tweets only.
max_length = 100   # Maximum length of sequences
num_words = 5000   # Vocabulary cap: only the most frequent words get an id
texts = df['cleaned_text'].values
y = label_ids
texts_train, texts_temp, y_train, y_temp = train_test_split(texts, y, test_size=0.2, random_state=42)
texts_val, texts_test, y_val, y_test = train_test_split(texts_temp, y_temp, test_size=0.5, random_state=42)  # Split the temp set into validation and test sets

# Step 3: Tokenize text and pad sequences
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(texts_train)


def _texts_to_padded(texts):
    """Turn an iterable of cleaned texts into a (n, max_length) id matrix.

    Padding goes in FRONT of the tokens. No mask is used, so with trailing
    padding the LSTM would have to step through a long run of zero vectors
    after the last real word before emitting its final state. That is the
    probable reason the recorded run of this notebook stayed at ~0.50
    accuracy / ~0.69 loss.
    """
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=max_length, padding='pre')


X_train = _texts_to_padded(texts_train)
X_val = _texts_to_padded(texts_val)
X_test = _texts_to_padded(texts_test)

# Step 4: Load pre-trained GloVe embeddings
embeddings_index = {}
glove_file = '/kaggle/input/glove-embeddings/glove.6B.100d.txt'  # Adjust the path to your GloVe file
with open(glove_file, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ... <vN>"
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Create embedding matrix
# The tokenizer only ever emits ids below num_words, so the matrix needs one
# row per usable id (row 0 is the padding id), not one per word it has seen.
embedding_dim = 100
vocab_size = min(num_words, len(tokenizer.word_index) + 1)
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        continue  # id is never produced by texts_to_sequences
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words without a GloVe vector keep their all-zero row
        embedding_matrix[i] = embedding_vector

# Step 5: Build LSTM model with GloVe embeddings
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    trainable=False))  # keep the pre-trained vectors frozen
model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(100))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))  # one unit per label found in the data

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Step 6: Train the model with validation data
# patience must be smaller than the epoch budget for early stopping to be able
# to fire at all; restore_best_weights keeps the best validation checkpoint.
early_stopping = EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)
history = model.fit(X_train, y_train, epochs=3, batch_size=64,
                    validation_data=(X_val, y_val), callbacks=[early_stopping])

# Step 7: Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Accuracy: {accuracy:.2f}')

# Sample Input (Tweet Text)
# Goes through the same cleaning, tokenizer and padding as the training data
sample_tweet = "I really enjoyed the movie, it was fantastic!"
sample_cleaned = clean_text(sample_tweet)
sample_padded = _texts_to_padded([sample_cleaned])

# Predict sentiment for the sample input
sample_prediction = model.predict(sample_padded)
print(f'Sample Tweet Sentiment: {np.argmax(sample_prediction)}')  # Output the sentiment class


   ItemID  Sentiment SentimentSource  \
0       1          0    Sentiment140   
1       2          0    Sentiment140   
2       3          1    Sentiment140   
3       4          0    Sentiment140   
4       5          0    Sentiment140   

                                       SentimentText  
0                       is so sad for my APL frie...  
1                     I missed the New Moon trail...  
2                            omg its already 7:30 :O  
3            .. Omgaga. Im sooo  im gunna CRy. I'...  
4           i think mi bf is cheating on me!!!   ...  




Epoch 1/3
[1m19111/19733[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m2:02[0m 197ms/step - accuracy: 0.4987 - loss: 0.6969

In [5]:
import os

# List the entries available under the Kaggle input directory
kaggle_input_dir = '/kaggle/input/'
print(os.listdir(kaggle_input_dir))

['using-word-embeddings-for-sentiment-analysis', 'imdb-review', 'twitter-sentiment', 'movie-review', 'twitter-entity-sentiment-analysis', 'd', 'twitter-sentiment-analysis-using-tensorflow']


C) Movie Reviews Sentiment Classification with BERT

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load and preprocess the dataset
# Load the dataset
df = pd.read_csv('/kaggle/input/imdb-review/imdb_reviews.csv')  # Adjust to your actual dataset path

# Display the first few rows of the dataset
print(df.head())

# Convert sentiment labels to numerical format (0 for negative, 1 for positive)
# Series.map turns every value that is missing from the dict into NaN, so an
# unexpected spelling (or re-running this on already-mapped data) would
# silently corrupt the labels. Fail here with a clear message instead.
sentiment_ids = df['sentiment'].map({'neg': 0, 'pos': 1})
unmapped = df.loc[sentiment_ids.isna(), 'sentiment'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Unexpected values in 'sentiment' (expected 'neg'/'pos'): {list(unmapped)[:5]}"
    )
df['sentiment'] = sentiment_ids.astype(int)

# Clean text function (optional, based on dataset specifics)
def clean_text(text):
    """Return the review exactly as received.

    Placeholder hook: no cleaning is applied at present. Dataset-specific
    steps (e.g., removing special characters) can be added here.
    """
    cleaned = text
    return cleaned

# Apply text cleaning
# The train/test split later in this cell reads df['review'], so the cleaned
# text is stored under that name. This line previously read and wrote
# df['text'], which either raised KeyError or produced a column the split
# never used; 'text' is now only the fallback source when 'review' is absent.
text_column = 'review' if 'review' in df.columns else 'text'
df['review'] = df[text_column].apply(clean_text)

# Step 2: Tokenize and encode reviews using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

class IMDBDataset(Dataset):
    """Torch dataset that tokenizes one review per item, on demand.

    Args:
        reviews: Indexable sequence of review texts.
        labels: Indexable sequence of integer class labels, aligned with
            ``reviews``.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
            When omitted, the module-level ``tokenizer`` is looked up each
            time an item is fetched, which is the original behaviour; passing
            it explicitly pins the dataset to one tokenizer even if that
            global is rebound later.
        max_length: Length every encoded review is padded or truncated to.

    Raises:
        ValueError: If ``reviews`` and ``labels`` differ in length.
    """

    def __init__(self, reviews, labels, tokenizer=None, max_length=256):
        if len(reviews) != len(labels):
            raise ValueError(
                f'reviews and labels must have the same length '
                f'({len(reviews)} != {len(labels)})'
            )
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Encode review ``idx``.

        Returns:
            dict: ``input_ids`` and ``attention_mask`` flattened to 1-D
            tensors, plus ``labels`` as a ``torch.long`` scalar tensor.
        """
        review = self.reviews[idx]
        label = self.labels[idx]

        # Fall back to the module-level tokenizer when none was injected
        active_tokenizer = self.tokenizer if self.tokenizer is not None else tokenizer

        # Tokenize and encode the review
        encoding = active_tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis of size 1;
            # flatten() removes it so the DataLoader can stack items itself
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.2, random_state=42)

# Create datasets
# to_numpy() drops the shuffled pandas index so items can be addressed 0..n-1
train_dataset = IMDBDataset(X_train.to_numpy(), y_train.to_numpy())
test_dataset = IMDBDataset(X_test.to_numpy(), y_test.to_numpy())

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Step 4: Load pre-trained BERT model and fine-tune for sentiment classification
# Prefer a local copy of the BERT files when one is mounted; otherwise fall
# back to the hub id. The '/kaggle/input' listing recorded in this notebook
# does not contain this directory, so loading from it unconditionally would
# fail there. Tokenizer and model are loaded from the same source so they
# always match.
import os

bert_local_dir = '/kaggle/input/bert-base-uncased'
bert_source = bert_local_dir if os.path.isdir(bert_local_dir) else 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_source)
model = BertForSequenceClassification.from_pretrained(bert_source, num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

# Set up the optimizer
# NOTE(review): AdamW here is the one imported from `transformers`; confirm
# the installed transformers version still provides it.
optimizer = AdamW(model.parameters(), lr=2e-5)

# Train the model
num_epochs = 3  # Adjust number of epochs as necessary
for epoch in range(num_epochs):
    # Set training mode at the start of every epoch so the loop stays correct
    # even if evaluation is later added between epochs
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    # Mean loss per batch for this epoch
    print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}')

# Step 5: Evaluate the model
model.eval()
predictions, true_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu().numpy()

        predictions.extend(preds)
        true_labels.extend(batch['labels'].cpu().numpy())

# Calculate accuracy and classification report
accuracy = accuracy_score(true_labels, predictions)
print(f'Accuracy: {accuracy}')
print(classification_report(true_labels, predictions, target_names=['Negative', 'Positive']))

# Sample Input
# Encoded with the same settings the dataset uses for training items
sample_review = "The movie was boring and uninteresting."
encoded_sample = tokenizer.encode_plus(
    sample_review,
    add_special_tokens=True,
    max_length=256,
    return_token_type_ids=False,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt',
)

# Make prediction on the sample input
model.eval()
with torch.no_grad():
    input_ids = encoded_sample['input_ids'].to(device)
    attention_mask = encoded_sample['attention_mask'].to(device)
    output = model(input_ids, attention_mask=attention_mask)
    prediction = torch.argmax(output.logits, dim=1).cpu().numpy()

print(f'Sample Input: "{sample_review}"')
# The value printed below is the model's prediction for the sample
print(f'Expected Output: {"Positive" if prediction[0] == 1 else "Negative"}')
