### This notebook illustrates the steps in building a Naive Bayes model for classifying IMDB Movie Review sentiment (negative/positive). 

In [None]:
import pandas as pd

#### LOAD DATASETS ####

# Paths of the two CSV splits read by this notebook.
train_data_file, test_data_file = "train.csv", "test.csv"

# Read each split into its own data frame (train first, then test).
train_data_df, test_data_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_file, test_data_file)
)

# Report the number of rows in each split.
for split_label, split_df in (("Train set: ", train_data_df), ("Test set: ", test_data_df)):
    print(split_label, len(split_df))

# Preview the first 5 rows of the train set.
display(train_data_df.head(5))

### Count-based feature extraction using scikit-learn CountVectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Look at what CountVectorizer extracts from a tiny sample: the first three
# training reviews.
text = train_data_df["review"].iloc[:3]
print(text, "\n")

# (min_n, max_n) of the n-grams to extract; adjust this to observe different n-grams.
demo_ngram_range = (1, 1)
vectorizer = CountVectorizer(ngram_range=demo_ngram_range)

# Fitting tokenizes the sample and builds the vocabulary, a mapping from each
# n-gram to the integer index of its feature column.
vectorizer.fit(text)
print(vectorizer.vocabulary_, "\n")

# Encode the same reviews as a sparse matrix with one row per review and one
# column per vocabulary entry.
vector = vectorizer.transform(text)
print(vector.shape, "\n")

# Dense view of the complete count vectors.
print(vector.toarray())

Interpret the feature vectors. What do they tell you about the individual examples?

### Feature extraction using CountVectorizer for training and testing data on reviews


In [None]:
# The review text is the input used for model building.
train_text, test_text = train_data_df["review"], test_data_df["review"]

# n-gram range for the features; (1, 1) keeps single words only.
vectorizer = CountVectorizer(ngram_range=(1, 1))

# Training representation: the vocabulary is learned from the training reviews
# here. The reviews are cast to unicode strings before being vectorized.
train_docs = train_text.values.astype('U')
train_data_cv = vectorizer.fit_transform(train_docs)
print(train_data_cv.shape, "\n")

# Test representation: encoded with the vocabulary learned above, so only
# transform() is called.
test_docs = test_text.values.astype('U')
test_data_cv = vectorizer.transform(test_docs)
print(test_data_cv.shape, "\n")

Note the difference between how training and test data are vectorized (`fit_transform` vs. `transform`). 
Why is this needed?

## Building a Naive Bayes model using unigram features (bag-of-words)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Features are the count vectors; targets are the "sentiment" column of each split.
x_train, x_test = train_data_cv, test_data_cv
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

# Fit a multinomial Naive Bayes classifier on the training data
# (fit() returns the fitted estimator itself).
model = MultinomialNB().fit(x_train, y_train)

# Predicted labels for the test data.
predictions = model.predict(x_test)

### Saving and Loading the model

In [None]:
import pickle

# Save the model to disk. The `with` block closes the file handle (flushing
# the pickled bytes) even if pickling raises, so the file is complete before
# it is read back below.
filename = 'NaiveBayesModelBOW.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(model, model_file)

# some time later...
# Load the model from disk.
# NOTE: pickle.load can execute arbitrary code while deserializing; only
# unpickle files you created yourself or otherwise trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)

# Score the reloaded model on the test data to check that it survived the
# round trip; for a classifier, score() is the mean accuracy.
result = loaded_model.score(x_test, y_test)
print(result)

### Evaluating the model

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix 

# Overall accuracy of the predictions against the true test labels.
test_accuracy = accuracy_score(y_test, predictions)
print("Accuracy score: ", test_accuracy)

# Per-label precision, recall and F1.
print("Individual label performance: ")
per_label_report = classification_report(y_test, predictions)
print(per_label_report)

# Confusion matrix: rows are true labels, columns are predicted labels.
label_confusion = confusion_matrix(y_test, predictions)
print(label_confusion)

### Inspecting the reviews that the model misclassified

In [None]:
# True labels and raw review text for the test set; `predictions` holds the
# labels predicted by the most recently run model.
labels = test_data_df["sentiment"]
inputs = test_data_df["review"]

# Walk the test set in order and report every review whose predicted label
# differs from the true one. `idx` is the 0-based position of the review in
# the test set.
for idx, (review, prediction, label) in enumerate(zip(inputs, predictions, labels)):
    if prediction != label:
        print("Sample", idx, 'has been classified as', prediction, 'and should be', label)
        print(review)
        print("\n")
### Feature extraction using TfidfVectorizer 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same small sample as before (the first three training reviews), this time
# vectorized with TF-IDF instead of raw counts.
text = train_data_df["review"].iloc[:3]
print(text, "\n")

# Build the vocabulary from the sample (fit() returns the fitted vectorizer).
tf = TfidfVectorizer().fit(text)
print(tf.vocabulary_, "\n")

# Encode the sample documents with the fitted vectorizer.
data = tf.transform(text)

# Summarize the encoded vectors: matrix shape first, then the dense values.
print(data.shape, "\n")
print(data.toarray())

### Feature extraction using TfidfVectorizer for training and testing data on reviews

In [None]:
# Training representation: refit the TF-IDF vectorizer on all training reviews.
# The reviews are cast to unicode strings exactly as for the count-based
# features, so both pipelines accept the same input; a missing review (NaN) is
# not a valid document for the vectorizer.
train_data_tfidf = tf.fit_transform(train_text.values.astype('U'))
print(train_data_tfidf.shape, "\n")

# Test representation: encoded with the vocabulary and IDF weights learned
# from the training reviews, so only transform() is called.
test_data_tfidf = tf.transform(test_text.values.astype('U'))
print(test_data_tfidf.shape, "\n")

# One IDF weight per vocabulary term, in feature-index order.
idf = tf.idf_

# Print feature names (the words) with their IDF values. Only a short preview
# is shown because the vocabulary of the whole training set can be very large;
# raise `idf_preview_size` to see more.
idf_preview_size = 20
feature_names = tf.get_feature_names_out()
print(dict(zip(feature_names[:idf_preview_size], idf[:idf_preview_size])))

## Building and evaluating a Naive Bayes model using the tf-idf representation

In [None]:
# Use the TF-IDF vectors as features; targets are the "sentiment" column of
# each split.
x_train, x_test = train_data_tfidf, test_data_tfidf
y_train, y_test = train_data_df["sentiment"], test_data_df["sentiment"]

# Fit a fresh multinomial Naive Bayes model and predict the test labels.
# Note that `model` and `predictions` are rebound here.
model = MultinomialNB().fit(x_train, y_train)
predictions = model.predict(x_test)

# Same evaluation as before: accuracy, per-label report, confusion matrix.
tfidf_accuracy = accuracy_score(y_test, predictions)
print("Accuracy score: ", tfidf_accuracy)
print("Individual label performance: ")
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))

What can you say about the performance of Naive Bayes models created using count vectors and tf-idf vectors?

## Exploring EmoLex lexicon

You can try **EmoLex**'s interactive playground here: https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm under the heading **An Interactive Visualizer**. 

How can we incorporate the information in EmoLex in our classification models?