# Simple Sentiment Analyser

Matin Mahmoodkhani - 99522095

In [3]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [4]:
nltk.download('movie_reviews')

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load IMDB Movie Reviews dataset from NLTK
movie_reviews_data = [(movie_reviews.raw(fileid), category) for category in movie_reviews.categories() for fileid in movie_reviews.fileids(category)]

# Extract texts and labels
texts, labels = zip(*movie_reviews_data)

# Convert labels to binary (0 for negative, 1 for positive)
labels = [0 if i == "neg" else 1 for i in labels]

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [5]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        doc = nlp(text)
        # Average word vectors to get document vector
        doc_vector = np.mean([token.vector for token in doc if token.has_vector], axis=0)
        embeddings.append(doc_vector)
    return np.array(embeddings)

# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [04:09<00:00,  6.42it/s]
100%|██████████| 400/400 [00:56<00:00,  7.13it/s]


In [6]:
# Build a pipeline with TruncatedSVD and SVM classifier
# Specify the best n_components hyperparameter.
best_n = 0
best_accuracy = 0
for i in range(1, 97):
    model = make_pipeline(TruncatedSVD(n_components=i), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    if (accuracy > best_accuracy):
      best_n = i
      best_accuracy = accuracy
    # print(f"n_components: {i} Accuracy: {accuracy}")

print(f"Best n : {best_n}, accuracy : {best_accuracy}")

Best n : 76, accuracy : 0.7


#More accurate one

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
def calculate_embeddings(texts):
    embeddings = []
    for text in tqdm(texts):
        # Tokenize the text
        tokens = tokenizer(text, return_tensors="pt", truncation=True, padding=True)

        # Get BERT model output
        with torch.no_grad():
            model_output = bert_model(**tokens)

        # Use mean pooling to get sentence embeddings
        doc_vector = torch.mean(model_output.last_hidden_state, dim=1).numpy()
        embeddings.append(doc_vector)

    return np.array(embeddings)


# Calculate embeddings for training and testing sets
X_train_embeddings = calculate_embeddings(X_train)
X_test_embeddings = calculate_embeddings(X_test)

100%|██████████| 1600/1600 [51:35<00:00,  1.93s/it]
100%|██████████| 400/400 [12:24<00:00,  1.86s/it]


In [15]:
# # Build a pipeline with TruncatedSVD and SVM classifier
# # Specify the best n_components hyperparameter.
best_n = 0
best_accuracy = 0
X_train_embeddings_reshaped = X_train_embeddings.reshape((X_train_embeddings.shape[0], -1))
X_test_embeddings_reshaped = X_test_embeddings.reshape((X_test_embeddings.shape[0], -1))

for i in range(1, 97):
    model = make_pipeline(TruncatedSVD(n_components=i), SVC())

    # Fit the model on training data
    model.fit(X_train_embeddings_reshaped, y_train)

    # Make predictions on test data
    predictions = model.predict(X_test_embeddings_reshaped)

    # Evaluate the accuracy
    accuracy = accuracy_score(y_test, predictions)
    if (accuracy > best_accuracy):
      best_n = i
      best_accuracy = accuracy
    # print(f"n_components: {i} Accuracy: {accuracy}")

print(f"Best n : {best_n}, accuracy : {best_accuracy}")



Best n : 86, accuracy : 0.81
