In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from collections import Counter
from gensim.models import Word2Vec

from transformers import BertTokenizer, BertModel
import torch

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/murat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/murat/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/murat/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/murat/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
# Load data
df = pd.read_csv('IMDB_Dataset.csv') 
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
# Check data size
print("Dataset Size:")
print(len(df))

Dataset Size:
50000


In [4]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove special characters and digits
    text = re.sub(r'\W|\d', ' ', text)
    # Tokenization
    tokens = word_tokenize(text)
    # Remove stopwords and stemming
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing
df['cleaned_review'] = df['review'].apply(preprocess_text)
df['cleaned_review']

0        one reviewer mentioned watching oz episode you...
1        wonderful little production br br filming tech...
2        thought wonderful way spend time hot summer we...
3        basically there family little boy jake think t...
4        petter matteis love time money visually stunni...
                               ...                        
49995    thought movie right good job wasnt creative or...
49996    bad plot bad dialogue bad acting idiotic direc...
49997    catholic taught parochial elementary school nu...
49998    im going disagree previous comment side maltin...
49999    one expects star trek movie high art fan expec...
Name: cleaned_review, Length: 50000, dtype: object

In [5]:
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

# Apply preprocessing to all reviews
df['tokens'] = df['cleaned_review'].apply(tokenize_text)

9- Feature Extraction for Sentiment Classification: Convert the text reviews into numerical representations suitable for
machine learning models. First, apply the Bag of Words (BoW) method, which represents the text based on word frequency
without considering word order. Next, implement TF-IDF to assign higher importance to less frequent but more meaningful words in the reviews. Finally, explore word embeddings such as Word2Vec, GloVe, or BERT to capture more advanced and
contextual word representations, providing richer semantic information for the sentiment classification models.

In [6]:
# Create Bag of Words (BoW) model
vectorizer_bow = CountVectorizer(max_features=5000)  # Limit to 5000 most frequent words
X_bow = vectorizer_bow.fit_transform(df['cleaned_review']).toarray()

# Check BoW features
print("BoW Feature Shape:", X_bow.shape)

BoW Feature Shape: (50000, 5000)


In [7]:
# Create TF-IDF model
vectorizer_tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer_tfidf.fit_transform(df['cleaned_review']).toarray()

# Check TF-IDF features
print("TF-IDF Feature Shape:", X_tfidf.shape)

TF-IDF Feature Shape: (50000, 5000)


In [8]:
# Train Word2Vec model
word2vec_model = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=5, workers=4)
X_word2vec = np.array([np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv] or [np.zeros(100)], axis=0) for words in df['tokens']])

# Check Word2Vec features
print("Word2Vec Feature Shape:", X_word2vec.shape)

Word2Vec Feature Shape: (50000, 100)


In [9]:
import gensim.downloader as api

# Load pre-trained GloVe embeddings
glove_model = api.load("glove-wiki-gigaword-100")  # 100-dimensional embeddings

# Convert reviews to GloVe vectors
def get_glove_embeddings(review):
    words = review.split()
    return np.mean([glove_model[word] for word in words if word in glove_model] or [np.zeros(100)], axis=0)

X_glove = np.array([get_glove_embeddings(review) for review in df['cleaned_review']])

# Check GloVe features
print("GloVe Feature Shape:", X_glove.shape)

GloVe Feature Shape: (50000, 100)


In [10]:
# Initialize BERT tokenizer and model
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Function to get BERT embeddings
def get_bert_embeddings(review):
    inputs = bert_tokenizer(review, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
    outputs = bert_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).detach().numpy()

# Convert reviews to BERT embeddings
X_bert = np.array([get_bert_embeddings(review)[0] for review in df['cleaned_review']])

# Check BERT features
print("BERT Feature Shape:", X_bert.shape)

KeyboardInterrupt: 

10- Sentiment Prediction Using Extracted Features: Build a sentiment classification model using the features extracted in
Task 9. Train the model on the training dataset using features extracted via Bag of Words (BoW), TF-IDF, and word
embeddings such as Word2Vec, GloVe, or BERT. After training, evaluate the performance of the model on the test dataset.
The goal is to predict whether a review is positive or negative based on these numerical representations. You are required to
compare the performance of various classifiers, including Logistic Regression, Support Vector Machines (SVM), Random
Forest, and Deep Learning models (LSTM or CNN). Each classifier will be applied to BoW, TF-IDF and word embeddings,
and the results should be evaluated using metrics such as accuracy, precision, recall, and F1-score.

In [11]:
# Function to print evaluation metrics
def evaluate_model(true_labels, predicted_labels):
    accuracy = accuracy_score(true_labels, predicted_labels)
    precision = precision_score(true_labels, predicted_labels, pos_label='positive')
    recall = recall_score(true_labels, predicted_labels, pos_label='positive')
    f1 = f1_score(true_labels, predicted_labels, pos_label='positive')
    
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")


# BoW

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['sentiment'], test_size=0.5, random_state=42)

In [14]:
# Train a Logistic Regression model
clf_bow = LogisticRegression(max_iter=1000)
clf_bow.fit(X_train, y_train)
print("Logistic Regression with BoW:")
evaluate_model(y_test, clf_bow.predict(X_test))

Logistic Regression with BoW:
Accuracy: 0.86
Precision: 0.86
Recall: 0.86
F1-Score: 0.86


In [None]:
# Train a Support Vector Machine (SVM)
svm_bow = SVC(kernel='linear')
svm_bow.fit(X_train, y_train)
print("SVM with BoW:")
evaluate_model(y_test, svm_bow.predict(X_test))

In [13]:
# Train a RF Classifier
rf_bow = RandomForestClassifier(n_estimators=100)
rf_bow.fit(X_train, y_train)
print("Random Forest with BoW:")
evaluate_model(y_test, rf_bow.predict(X_test))

Random Forest with BoW:
Accuracy: 0.84
Precision: 0.85
Recall: 0.84
F1-Score: 0.84


In [None]:
# LSTM model:
lstm_model_bow = Sequential()
lstm_model_bow.add(Embedding(input_dim=5000, output_dim=128, input_length=X_train.shape[1])) 
lstm_model_bow.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_bow.add(Dense(1, activation='sigmoid'))

lstm_model_bow.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_bow.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with BoW:")
evaluate_model(y_test, lstm_model_bow.predict(X_test))

Epoch 1/5


2024-11-02 01:19:18.651315: W tensorflow/core/framework/op_kernel.cc:1816] OP_REQUIRES failed at cast_op.cc:122 : UNIMPLEMENTED: Cast string to float is not supported


UnimplementedError: Graph execution error:

Detected at node compile_loss/binary_crossentropy/Cast defined at (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main

  File "<frozen runpy>", line 88, in _run_code

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/kernelapp.py", line 701, in start

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/asyncio/base_events.py", line 639, in run_forever

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/asyncio/base_events.py", line 1985, in _run_once

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/asyncio/events.py", line 88, in _run

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 534, in dispatch_queue

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 523, in process_one

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 429, in dispatch_shell

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/kernelbase.py", line 767, in execute_request

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 429, in do_execute

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3075, in run_cell

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3130, in _run_cell

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3334, in run_cell_async

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3517, in run_ast_nodes

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/IPython/core/interactiveshell.py", line 3577, in run_code

  File "/var/folders/xz/tpjwm4l52hjdcg11dtxxgyxr0000gn/T/ipykernel_50666/1377160367.py", line 8, in <module>

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 320, in fit

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 121, in one_step_on_iterator

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 108, in one_step_on_data

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/backend/tensorflow/trainer.py", line 54, in train_step

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/trainers/trainer.py", line 398, in _compute_loss

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/trainers/trainer.py", line 366, in compute_loss

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/trainers/compile_utils.py", line 618, in __call__

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/trainers/compile_utils.py", line 659, in call

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/losses/loss.py", line 56, in __call__

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/tree/tree_api.py", line 148, in map_structure

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/tree/optree_impl.py", line 79, in map_structure

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/optree/ops.py", line 747, in tree_map

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/losses/loss.py", line 57, in <lambda>

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/ops/core.py", line 917, in convert_to_tensor

  File "/Users/murat/anaconda3/envs/nlp/lib/python3.12/site-packages/keras/src/backend/tensorflow/core.py", line 132, in convert_to_tensor

Cast string to float is not supported
	 [[{{node compile_loss/binary_crossentropy/Cast}}]] [Op:__inference_one_step_on_iterator_9117]

# TF-IDF

In [18]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, df['sentiment'], test_size=0.5, random_state=42)

In [20]:
# Train a Logistic Regression model
clf_tfidf = LogisticRegression(max_iter=1000)
clf_tfidf.fit(X_train, y_train)
print("Logistic Regression model with tf-idf:")
evaluate_model(y_test, clf_tfidf.predict(X_test))

Logistic Regression model with tf-idf:
Accuracy: 0.88
Precision: 0.87
Recall: 0.90
F1-Score: 0.88


In [None]:
# Train a Support Vector Machine (SVM)
svm_tfidf = SVC(kernel='linear')
svm_tfidf.fit(X_train, y_train)
print("SVM with tf-idf:")
evaluate_model(y_test, svm_tfidf.predict(X_test))

In [None]:
# Train a RF Classifier
rf_tfidf = RandomForestClassifier(n_estimators=100)
rf_tfidf.fit(X_train, y_train)
print("Random Forest with tf-idf:")
evaluate_model(y_test, rf_tfidf.predict(X_test))

Accuracy: 0.88
Precision: 0.87
Recall: 0.90
F1-Score: 0.89


In [None]:
# LSTM model:
lstm_model_tfidf = Sequential()
lstm_model_tfidf.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_tfidf.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_tfidf.add(Dense(1, activation='sigmoid'))

lstm_model_tfidf.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_tfidf.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with tf-idf:")
evaluate_model(y_test, lstm_model_tfidf.predict(X_test))

# Word2Vec

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_word2vec, df['sentiment'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_word2vec = LogisticRegression(max_iter=1000)
clf_word2vec.fit(X_train, y_train)
print("Logistic Regression model with word2vec")
evaluate_model(y_test, clf_word2vec.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_word2vec = SVC(kernel='linear')
svm_word2vec.fit(X_train, y_train)
print("SVM with word2vec:")
evaluate_model(y_test, svm_word2vec.predict(X_test))

In [None]:
# Train a RF Classifier
rf_word2vec = RandomForestClassifier(n_estimators=100)
rf_word2vec.fit(X_train, y_train)
print("Random Forest with word2vec:")
evaluate_model(y_test, rf_word2vec.predict(X_test))

Accuracy: 0.85
Precision: 0.85
Recall: 0.86
F1-Score: 0.86


In [None]:
# LSTM model:
lstm_model_word2vec = Sequential()
lstm_model_word2vec.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_word2vec.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_word2vec.add(Dense(1, activation='sigmoid'))

lstm_model_word2vec.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_word2vec.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with word2vec:")
evaluate_model(y_test, lstm_model_word2vec.predict(X_test))

# GloVe

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_glove, df['sentiment'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_glove = LogisticRegression(max_iter=1000)
clf_glove.fit(X_train, y_train)
print("Logistic Regression model with glove")
evaluate_model(y_test, clf_glove.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_glove = SVC(kernel='linear')
svm_glove.fit(X_train, y_train)
print("SVM with glove:")
evaluate_model(y_test, svm_glove.predict(X_test))

In [None]:
# Train a RF Classifier
rf_glove = RandomForestClassifier(n_estimators=100)
rf_glove.fit(X_train, y_train)
print("Random Forest with glove:")
evaluate_model(y_test, rf_glove.predict(X_test))

Accuracy: 0.76
Precision: 0.76
Recall: 0.77
F1-Score: 0.76


In [None]:
# LSTM model:
lstm_model_glove = Sequential()
lstm_model_glove.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_glove.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_glove.add(Dense(1, activation='sigmoid'))

lstm_model_glove.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_glove.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with glove:")
evaluate_model(y_test, lstm_model_glove.predict(X_test))

# BERT

In [None]:
# Split data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X_bert, df['sentiment'], test_size=0.5, random_state=42)

In [None]:
# Train a Logistic Regression model
clf_bert = LogisticRegression(max_iter=1000)
clf_bert.fit(X_train, y_train)
print("Logistic Regression model with BERT")
evaluate_model(y_test, clf_bert.predict(X_test))

In [None]:
# Train a Support Vector Machine (SVM)
svm_bert = SVC(kernel='linear')
svm_bert.fit(X_train, y_train)
print("SVM with BERT:")
evaluate_model(y_test, svm_bert.predict(X_test))

In [None]:
# Train a RF Classifier
rf_bert = RandomForestClassifier(n_estimators=100)
rf_bert.fit(X_train, y_train)
print("Random Forest with BERT:")
evaluate_model(y_test, rf_bert.predict(X_test))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy: 0.80
Precision: 0.81
Recall: 0.80
F1-Score: 0.81


In [None]:
# LSTM model:
lstm_model_bert = Sequential()
lstm_model_bert.add(Embedding(input_dim=word2vec_model.wv.vectors.shape[0], output_dim=100, input_length=100)) 
lstm_model_bert.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2))
lstm_model_bert.add(Dense(1, activation='sigmoid'))

lstm_model_bert.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
lstm_model_bert.fit(X_train, y_train, epochs=5, batch_size=32)

print("LSTM with glove:")
evaluate_model(y_test, lstm_model_bert.predict(X_test))