# Simple Sentiment Analyser

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from tqdm import tqdm
import nltk
import spacy

In [3]:
# Make sure the NLTK movie review corpus is available locally
nltk.download('movie_reviews')

# spaCy English pipeline, used further down to embed the reviews
nlp = spacy.load("en_core_web_sm")

# Pair every review's raw text with its category, one category at a time
movie_reviews_data = [
    (movie_reviews.raw(review_id), sentiment)
    for sentiment in movie_reviews.categories()
    for review_id in movie_reviews.fileids(sentiment)
]

# Unzip the pairs into parallel tuples of texts and string labels
texts, labels = zip(*movie_reviews_data)

# Encode the labels as integers: 'neg' -> 0, everything else -> 1
labels = [int(raw_label != 'neg') for raw_label in labels]

# Hold out 20% of the reviews as the test set (seeded split)
X_train, X_test, y_train, y_test = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


In [5]:
# Function to calculate document embeddings using spaCy
def calculate_embeddings(texts):
    """Embed each text as the mean of its spaCy token vectors.

    Every text is run through the module-level ``nlp`` pipeline and the
    vectors of the tokens that report ``has_vector`` are averaged.

    Args:
        texts: Iterable of strings to embed.

    Returns:
        np.ndarray with one row per input text. A text that yields no
        token vectors (for example an empty string) gets an all-zero row
        instead of NaN, so the result is always a regular 2-D array.
    """
    doc_vectors = []
    for text in tqdm(texts):
        doc = nlp(text)
        token_vectors = [token.vector for token in doc if token.has_vector]
        # np.mean over an empty list returns a scalar NaN (with a warning),
        # which would make the stacked array ragged; mark the row and fill
        # it once the embedding width is known.
        doc_vectors.append(np.mean(token_vectors, axis=0) if token_vectors else None)

    # Take the embedding width and dtype from the first document that had vectors.
    reference = next((vec for vec in doc_vectors if vec is not None), None)
    if reference is None:
        # Nothing could be embedded: one zero-width row per input text.
        return np.zeros((len(doc_vectors), 0), dtype=np.float32)

    zero_vector = np.zeros_like(reference)
    return np.array([vec if vec is not None else zero_vector for vec in doc_vectors])

# Embed both splits with the spaCy-based calculate_embeddings defined above:
# first the training reviews, then the held-out test reviews.
X_train_embeddings = calculate_embeddings(texts=X_train)
X_test_embeddings = calculate_embeddings(texts=X_test)

100%|██████████| 1600/1600 [02:59<00:00,  8.92it/s]
100%|██████████| 400/400 [00:43<00:00,  9.15it/s]


In [6]:
# Carve a validation split out of the training embeddings so that the SVD
# dimensionality is chosen without looking at the test set.
X_train_tuning, X_val, y_train_tuning, y_val = train_test_split(X_train_embeddings, y_train, test_size=0.2, random_state=42)
num_components = [10, 20, 30, 40, 50, 60, 70, 80, 90]

best_accuracy = 0
best_num_components = None

for n_components in num_components:
    # TruncatedSVD's default solver is randomized; seed it so the selected
    # n_components and the reported accuracies are reproducible across runs.
    model = make_pipeline(TruncatedSVD(n_components=n_components, random_state=42), SVC())

    model.fit(X_train_tuning, y_train_tuning)

    predictions = model.predict(X_val)

    accuracy = accuracy_score(y_val, predictions)

    print(f"Accuracy for n_components={n_components}: {accuracy}")

    # Strict '>' keeps the smallest n_components when accuracies tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_num_components = n_components

print(f"Best n_components: {best_num_components}, Best Accuracy: {best_accuracy}")

# Rebuild the pipeline with the selected dimensionality (same seed as above)
model = make_pipeline(TruncatedSVD(n_components=best_num_components, random_state=42), SVC())

# Fit the model on the full training data (tuning + validation parts)
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy on the held-out test set
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy for n_components=10: 0.64375
Accuracy for n_components=20: 0.65625
Accuracy for n_components=30: 0.68125
Accuracy for n_components=40: 0.675
Accuracy for n_components=50: 0.665625
Accuracy for n_components=60: 0.671875
Accuracy for n_components=70: 0.68125
Accuracy for n_components=80: 0.68125
Accuracy for n_components=90: 0.678125
Accuracy for n_components=95: 0.678125
Best n_components: 30, Best Accuracy: 0.68125
Accuracy: 0.6825


# More accurate version: BERT embeddings

In [7]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from transformers import AutoTokenizer, AutoModel
import torch

In [8]:
# Load BERT tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
bert_model = AutoModel.from_pretrained("bert-base-uncased")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [9]:
def calculate_embeddings(texts, batch_size=1):
    """Embed each text as the mean of its BERT last-layer token vectors.

    Uses the module-level ``tokenizer`` and ``bert_model``. Texts longer
    than the tokenizer's maximum length are truncated (``truncation=True``).

    Args:
        texts: Iterable of strings to embed.
        batch_size: Number of texts sent through the model per forward
            pass. The default of 1 processes one text at a time; larger
            values batch several texts together, using the attention mask
            so padding tokens do not affect the mean.

    Returns:
        np.ndarray of shape ``(len(texts), hidden_size)``; an empty input
        yields an array of shape ``(0, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # Materialise so the input can be sliced into batches
    texts = list(texts)
    if not texts:
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in tqdm(range(0, len(texts), batch_size)):
        batch = texts[start:start + batch_size]

        # Tokenize the batch; padding=True pads to the longest text in it
        tokens = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)

        # Get BERT model output without tracking gradients
        with torch.no_grad():
            model_output = bert_model(**tokens)

        # Mask-aware mean pooling: only real tokens contribute, so padded
        # positions in a batch cannot dilute the document vector.
        hidden = model_output.last_hidden_state
        mask = tokens["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
        embeddings.append(pooled.numpy())

    # Each entry is (batch, hidden_size); stack into (len(texts), hidden_size)
    return np.concatenate(embeddings, axis=0)


# Embed both splits with the BERT-based calculate_embeddings defined above:
# first the training reviews, then the held-out test reviews.
X_train_embeddings = calculate_embeddings(texts=X_train)
X_test_embeddings = calculate_embeddings(texts=X_test)

100%|██████████| 1600/1600 [40:40<00:00,  1.53s/it]
100%|██████████| 400/400 [09:36<00:00,  1.44s/it]


In [11]:
# Carve a validation split out of the training embeddings so that the SVD
# dimensionality is chosen without looking at the test set.
X_train_tuning, X_val, y_train_tuning, y_val = train_test_split(X_train_embeddings, y_train, test_size=0.2, random_state=42)

num_components = [10, 20, 30, 40, 50, 60, 70, 80, 90]

best_accuracy = 0
best_num_components = None

for n_components in num_components:
    # TruncatedSVD's default solver is randomized; seed it so the selected
    # n_components and the reported accuracies are reproducible across runs.
    model = make_pipeline(TruncatedSVD(n_components=n_components, random_state=42), SVC())

    model.fit(X_train_tuning, y_train_tuning)

    predictions = model.predict(X_val)

    accuracy = accuracy_score(y_val, predictions)

    print(f"Accuracy for n_components={n_components}: {accuracy}")

    # Strict '>' keeps the smallest n_components when accuracies tie.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_num_components = n_components

print(f"Best n_components: {best_num_components}, Best Accuracy: {best_accuracy}")

# Rebuild the pipeline with the selected dimensionality (same seed as above)
model = make_pipeline(TruncatedSVD(n_components=best_num_components, random_state=42), SVC())

# Fit the model on the full training data (tuning + validation parts)
model.fit(X_train_embeddings, y_train)

# Make predictions on test data
predictions = model.predict(X_test_embeddings)

# Evaluate the accuracy on the held-out test set
accuracy = accuracy_score(y_test, predictions)
print(f"Accuracy: {accuracy}")

Accuracy for n_components=10: 0.65
Accuracy for n_components=20: 0.740625
Accuracy for n_components=30: 0.734375
Accuracy for n_components=40: 0.78125
Accuracy for n_components=50: 0.76875
Accuracy for n_components=60: 0.778125
Accuracy for n_components=70: 0.778125
Accuracy for n_components=80: 0.778125
Accuracy for n_components=90: 0.784375
Accuracy for n_components=95: 0.784375
Best n_components: 90, Best Accuracy: 0.784375
Accuracy: 0.8025
