# Main

This notebook serves as the main entry point to train, evaluate, and compare the performance of different NLP models implemented in separate Python modules.

## I. Baseline model: TF-IDF and Naive Bayes

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from TF_IDF import TfidfClassifier

In [5]:
# Read the pre-built train and test frames from the aclImdb directory.
df_train = pd.read_csv('aclImdb/df_train.csv')
df_test = pd.read_csv('aclImdb/df_test.csv')

# Features are the raw review text; the target is the sentiment label.
X, y = df_train['comment'], df_train['sentiment']

# Hold out 20% of the training frame for validation; the fixed seed keeps
# the split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Build the TF-IDF classifier from the train/validation split; the CSV paths
# are passed along as well (how the class uses them is defined in TF_IDF.py).
tfidf_classifier = TfidfClassifier(
    X_train=X_train,
    X_val=X_val,
    y_train=y_train,
    y_val=y_val,
    train_file_path='aclImdb/df_train.csv',
    test_file_path='aclImdb/df_test.csv',
)

# Hyper-parameter values handed to run_experiments.
tfidf_grid = {
    'max_features_list': [1000, 2000, 5000],
    'use_idf_list': [True, False],
    'alpha_list': [0.1, 1.0, 10.0],
}
tfidf_classifier.run_experiments(**tfidf_grid)

# Last expression of the cell: displays the best configuration found.
tfidf_classifier.get_best_config()

{'max_features': 5000,
 'use_idf': True,
 'alpha': 1.0,
 'train_accuracy': 0.86855,
 'val_accuracy': 0.8498}

In [8]:
# Evaluate the best TF-IDF configuration on the held-out test set.
tfidf_scores = tfidf_classifier.evaluate_on_test(
    config=tfidf_classifier.best_config
)
train_accuracy, test_accuracy = tfidf_scores

print("======== Accuracy on train set for TF-IDF & Naive Bayes ========\n", train_accuracy)
print("======== Accuracy on test set for TF-IDF & Naive Bayes ========\n", test_accuracy)

 0.865
 0.84056


## II. Word2vec and SVC

In [7]:
from word2vec import ReviewTokenizer, Word2VecEmbedder, SentimentClassifier
from sklearn.svm import LinearSVC

In [None]:
# Load the train and test frames.
df_train = pd.read_csv('aclImdb/df_train.csv')
df_test = pd.read_csv('aclImdb/df_test.csv')

# Tokenise every training review.
tokenized_reviews_train = [ReviewTokenizer.tokenize(text) for text in df_train['comment']]

# Train the embedder on the tokenised training reviews, embed them, and
# persist the resulting embeddings to disk.
embedder = Word2VecEmbedder()
embedder.train(tokenized_reviews_train)
X_embeddings = embedder.embed_reviews(tokenized_reviews_train)
embedder.save_embeddings(X_embeddings, 'aclImdb/embeddings/X_train_word2vec_embeddings.pkl')

# Train/validation split: 20% held out, fixed seed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X_embeddings, df_train['sentiment'], test_size=0.2, random_state=42
)

# Fit the classifier on the training split and score it on the validation split.
clf = SentimentClassifier(classifier=LinearSVC())
clf.train(X_train, y_train)

# evaluate() returns an (accuracy, report) pair. Capture it and print the
# validation accuracy itself; the previous code discarded the result and
# printed `train_accuracy`, a name left over from the TF-IDF section (or
# undefined on a fresh kernel).
val_accuracy, val_report = clf.evaluate(X_val, y_val)
print("======== Accuracy on val set for Word2vec and SVC ========\n", val_accuracy)



 0.8782


In [10]:
# Check performance on the test set.
# Refit the classifier on every training embedding (train + validation splits).
full_train_labels = df_train['sentiment']
clf.train(X_embeddings, full_train_labels)

# Tokenise the test reviews and embed them with the already-trained embedder.
tokenized_reviews_test = list(map(ReviewTokenizer.tokenize, df_test['comment']))
X_test_embeddings = embedder.embed_reviews(tokenized_reviews_test)

# evaluate() returns an (accuracy, report) pair.
test_accuracy, test_report = clf.evaluate(X_test_embeddings, df_test['sentiment'])
# NOTE(review): the "train" score is computed on the 80% split (X_train, y_train)
# although the classifier was refitted on all of X_embeddings above — confirm
# this is the intended training-set metric.
train_accuracy, train_report = clf.evaluate(X_train, y_train)

print("======== Accuracy on train set for Word2vec and SVC ========\n", train_accuracy)
print("======== Accuracy on test set for Word2vec and SVC ========\n", test_accuracy)



 0.8788
 0.86944
