# Main

This notebook serves as the main entry point to train, evaluate, and compare the performance of different NLP models implemented in separate Python modules.

In [7]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
base_path = ".."

## I. Base model : TF-IDF and Naive Bayes

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from src.models.tf_idf import TfidfClassifier

In [9]:
datasets_path = os.path.join(base_path, "aclImdb")
df_train = pd.read_csv(os.path.join(datasets_path, "df_train.csv"))
df_test = pd.read_csv(os.path.join(datasets_path, "df_test.csv"))

X = df_train['comment']
y = df_train['sentiment']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
tfidf_classifier = TfidfClassifier(X_train=X_train, X_val=X_val, y_train=y_train, y_val=y_val, 
                                   train_file_path=os.path.join(datasets_path, "df_train.csv"), test_file_path=os.path.join(datasets_path, "df_test.csv"))

tfidf_classifier.run_experiments(
    max_features_list=[1000, 2000, 5000],
    use_idf_list=[True, False],
    alpha_list=[0.1, 1.0, 10.0]
)

tfidf_classifier.get_best_config()

{'max_features': 5000,
 'use_idf': True,
 'alpha': 1.0,
 'train_accuracy': 0.86855,
 'val_accuracy': 0.8498}

In [11]:
#Evaluate perf on test set
train_accuracy, test_accuracy = tfidf_classifier.evaluate_on_test(config = tfidf_classifier.best_config)

print("======== Accuracy on train set for TF-IDF & Naive Bayes ========\n", train_accuracy)
print("======== Accuracy on test set for TF-IDF & Naive Bayes ========\n", test_accuracy)

 0.865
 0.84056


## II. Word2vec and SVC

In [12]:
from src.models.word2vec import ReviewTokenizer, Word2VecEmbedder, SentimentClassifier
from sklearn.svm import LinearSVC

In [9]:
df_train = pd.read_csv(os.path.join(datasets_path, "df_train.csv"))
df_test = pd.read_csv(os.path.join(datasets_path, "df_test.csv"))

# tokenisation
tokenized_reviews_train = [ReviewTokenizer.tokenize(text) for text in df_train['comment']]

# # train embeddings
embedder = Word2VecEmbedder()
embedder.train(tokenized_reviews_train)
X_embeddings = embedder.embed_reviews(tokenized_reviews_train)
embedder.save_embeddings(X_embeddings, os.path.join(base_path, 'aclImdb/embeddings/X_train_word2vec_embeddings.pkl'))

# train/val split
X_train, X_val, y_train, y_val = train_test_split(
    X_embeddings, df_train['sentiment'], test_size=0.2, random_state=42
)

# train and evaluate classifier
clf = SentimentClassifier(classifier=LinearSVC())
clf.train(X_train, y_train)
clf.evaluate(X_val, y_val)
print("======== Accuracy on val set for Word2vec and SVC ========\n", train_accuracy)

 0.865


In [10]:
#Check performance on test set 
clf.train(X_embeddings, df_train['sentiment'])

tokenized_reviews_test = [ReviewTokenizer.tokenize(text) for text in df_test['comment']]

X_test_embeddings = embedder.embed_reviews(tokenized_reviews_test)
test_accuracy, test_report = clf.evaluate(X_test_embeddings, df_test['sentiment'])
train_accuracy, train_report = clf.evaluate(X_train, y_train)

print("======== Accuracy on train set for Word2vec and SVC ========\n", train_accuracy)
print("======== Accuracy on test set for Word2vec and SVC ========\n", test_accuracy)

 0.87965
 0.86836


## III- RoBERTa
For more info on training have a look at `roberta_training.ipynb`

In [11]:
import gdown 

In [13]:
url ='https://drive.google.com/file/d/1HcY9WA-ded5y5YLDNtGwH9cnO0F6aFjo/view?usp=sharing' #because model size too large
gdown.download(url, 'roberta_imbd_model.pt', fuzzy=True)

Downloading...
From (original): https://drive.google.com/uc?id=1HcY9WA-ded5y5YLDNtGwH9cnO0F6aFjo
From (redirected): https://drive.google.com/uc?id=1HcY9WA-ded5y5YLDNtGwH9cnO0F6aFjo&confirm=t&uuid=4286792a-9f88-4a43-9fe7-3ced26ff56c4
To: /Users/suzie/NLP-sentiment-analysis/notebooks/roberta_imbd_model.pt
100%|██████████| 499M/499M [00:43<00:00, 11.4MB/s] 


'roberta_imbd_model.pt'

In [14]:
from src.models.roberta import RobertaTokenizer_imdb, RobertaModel_imdb
import tqdm 
dataset_manager = RobertaTokenizer_imdb("roberta-base")
df_train = pd.read_csv(os.path.join(datasets_path, "df_train.csv"))
df_train = dataset_manager.prepare_dataset(df_train)
train_loader = dataset_manager.create_dataloader(df_train)

df_test = pd.read_csv(os.path.join(datasets_path, "df_test.csv"))
df_test = dataset_manager.prepare_dataset(df_test)
test_loader = dataset_manager.create_dataloader(df_test)
model = RobertaModel_imdb("roberta-base", num_labels=2)
model.load_model(filepath="roberta_imbd_model.pt")

train_accuracy = model.evaluate(train_loader)
test_accuracy = model.evaluate(test_loader)

print("======== Accuracy on train set for RoBERTa ========\n", train_accuracy)
print("======== Accuracy on test set for RoBERTa ========\n", test_accuracy)

Map: 100%|██████████| 25000/25000 [00:24<00:00, 1005.49 examples/s]
Map: 100%|██████████| 25000/25000 [00:24<00:00, 1027.93 examples/s]


Using device: mps


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Evaluating on Validation: 100%|██████████| 3125/3125 [15:03<00:00,  3.46it/s]


Validation Accuracy: 0.9776


Evaluating on Validation: 100%|██████████| 3125/3125 [15:39<00:00,  3.33it/s]  

Validation Accuracy: 0.9487
 {'accuracy': 0.97764}
 {'accuracy': 0.94872}



