In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

In [None]:
# Load datasets
covid_train = pd.read_csv('covid-tweets-train.csv')
covid_test = pd.read_csv('covid-tweets-test.csv')

In [None]:
# Drop missing labels or sentiment scores that are strings
covid_train = covid_train.dropna()
covid_test = covid_test.dropna()
# The original Sentiment type of train is Obj while test is Int64
covid_train = covid_train[covid_train['Sentiment'].isin(['0', '1', '2', 0, 1, 2])]
covid_test = covid_test[covid_test['Sentiment'].isin(['0', '1', '2', 0, 1, 2])]

# Convert Sentiment column from string to integer
covid_train["Sentiment"] = covid_train["Sentiment"].astype(int)
covid_test["Sentiment"] = covid_test["Sentiment"].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  covid_train["Sentiment"] = covid_train["Sentiment"].astype(int)


In [None]:
covid_train.head()

Unnamed: 0.1,Unnamed: 0,OriginalTweet,Sentiment
0,0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,1
1,1,advice Talk to your neighbours family to excha...,2
2,2,Coronavirus Australia: Woolworths to give elde...,2
3,3,My food stock is not the only one which is emp...,2
4,4,"Me, ready to go at supermarket during the #COV...",0


In [None]:
covid_test.head()

Unnamed: 0.1,Unnamed: 0,OriginalTweet,Sentiment
0,0,TRENDING: New Yorkers encounter empty supermar...,0
1,1,When I couldn't find hand sanitizer at Fred Me...,2
2,2,Find out how you can protect yourself and love...,2
3,3,#Panic buying hits #NewYork City as anxious sh...,0
4,4,#toiletpaper #dunnypaper #coronavirus #coronav...,1


# Part 1

### A. Consider the training data. What is the balance between the three classes? In other words, what proportion of the observations (in the training set) belong to each class?


In [None]:
# Function that determines the balance between the three classes
def class_count(df):
  return df["Sentiment"].value_counts(normalize=True)

# See the proportion of the observations in the training set
print(f"Class distribution in the training set:\n{class_count(covid_train)}")

Class distribution in the training set:
Sentiment
2    0.438434
0    0.374159
1    0.187407
Name: proportion, dtype: float64


### B. Tokenize the tweets. In other words, for each observation, convert the tweet from a single string of running text into a list of individual tokens (possibly with punctuation), splitting on whitespace. The result should be that each observation (tweet) is a list of individual tokens.


In [None]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Create a new column in our DF that contains token lists instead of raw text
def tokenize_text(df):
  df['tokens'] =  df["OriginalTweet"].apply(lambda x: x.split())

# tokenize_text(covid_train)
# print(covid_train['tokens'].head(5))

### C. Using a regular expression, remove any URL tokens from each of the observations.

In [None]:
# Remove any URL tokens
def remove_url(df):
  df['tokens'] = df['tokens'].apply(lambda tokens: [word for word in tokens if not re.match(r'http\S+', word)])

# remove_url(covid_train)
# print(covid_train['tokens'].head(5))

### D. Remove all punctuation (,.?!;:’") and special characters(@, #, +, &, =, $, etc). Also, convert all tokens to lowercase only. Can you think of a scenario when you might want to keep some forms of punctuation?


The use of exclamation mark: When exclamation mark is added to a sentence, the emotion of that sentence may appear to be more extreme. So it's more likely to be positive or negative rather than neutral.

In [None]:
def add_cleaned_tokens(df):
    cleaned_tokens = []
    for row in df['tokens']:
      cleaned_tokens.append([re.sub(r'[^a-zA-Z0-9]', '', t).lower() for t in row if re.sub(r'[^a-zA-Z0-9]', '', t)])
    df['cleaned_tokens'] = cleaned_tokens

# add_cleaned_tokens(covid_train)
# print(covid_train['cleaned_tokens'].head(5))

### E. Now stem your tokens. This will have the effect of converting similar word forms into identical tokens (e.g. run, runs, running → run). Please specify which stemmer you use.

We will be using the Porter stemmer

In [None]:
# Import the Porter stemmer
from nltk.stem.porter import *

In [None]:
# Stemm tokens by the Porter stememr
def stem_tokens(df):
  stemmer = PorterStemmer()
  df['stemmed_tokens'] = df['cleaned_tokens'].apply(lambda tokens: [stemmer.stem(t) for t in tokens])

# stem_tokens(covid_train)
# print(covid_train['stemmed_tokens'].head(5))

### F. Remove stopwords. Using the english stopwords list from nltk, remove these common words from your observations. This list is very long (I think almost 200 words), so remove only the first 100 stopwords in the list.

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# print the top 100 most popular english words
#sw = stopwords.words('english')[:100]
sw = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
sw

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [None]:
def remove_stopwords(df):
  tokens_no_sw = []
  for row in df['stemmed_tokens']:
    tokens_no_sw.append([t for t in row if t not in sw])
  df['tokens_no_sw'] = tokens_no_sw

# remove_stopwords(covid_train)
# print(covid_train['tokens_no_sw'].head(5))

In [None]:
def process_data(df):
  tokenize_text(df)
  remove_url(df)
  add_cleaned_tokens(df)
  stem_tokens(df)
  remove_stopwords(df)
  return df

train_data = process_data(covid_train)
test_data = process_data(covid_test)

In [None]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,OriginalTweet,Sentiment,tokens,cleaned_tokens,stemmed_tokens,tokens_no_sw
0,0,TRENDING: New Yorkers encounter empty supermar...,0,"[TRENDING:, New, Yorkers, encounter, empty, su...","[trending, new, yorkers, encounter, empty, sup...","[trend, new, yorker, encount, empti, supermark...","[trend, new, yorker, encount, empti, supermark..."
1,1,When I couldn't find hand sanitizer at Fred Me...,2,"[When, I, couldn't, find, hand, sanitizer, at,...","[when, i, couldnt, find, hand, sanitizer, at, ...","[when, i, couldnt, find, hand, sanit, at, fred...","[couldnt, find, hand, sanit, fred, meyer, turn..."
2,2,Find out how you can protect yourself and love...,2,"[Find, out, how, you, can, protect, yourself, ...","[find, out, how, you, can, protect, yourself, ...","[find, out, how, you, can, protect, yourself, ...","[find, protect, love, one, coronaviru]"
3,3,#Panic buying hits #NewYork City as anxious sh...,0,"[#Panic, buying, hits, #NewYork, City, as, anx...","[panic, buying, hits, newyork, city, as, anxio...","[panic, buy, hit, newyork, citi, as, anxiou, s...","[panic, buy, hit, newyork, citi, anxiou, shopp..."
4,4,#toiletpaper #dunnypaper #coronavirus #coronav...,1,"[#toiletpaper, #dunnypaper, #coronavirus, #cor...","[toiletpaper, dunnypaper, coronavirus, coronav...","[toiletpap, dunnypap, coronaviru, coronavirusa...","[toiletpap, dunnypap, coronaviru, coronavirusa..."


### G. Now convert your lists of words into vectors of word counts. You may find Scikit-learn’s CountVectorizer useful here. What is the length of your vocabulary?


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [None]:
from tkinter.constants import X
from itertools import count
def override_fcn(doc):
  # We expect a list of tokens as input
  return doc

# Count Vectorizer
def count_vectorizer(df1, df2):
  X_df1, y_df1 = df1['tokens_no_sw'].to_numpy(), df1['Sentiment'].to_numpy()
  X_df2, y_df2 = df2['tokens_no_sw'].to_numpy(), df2['Sentiment'].to_numpy()
  X_df = np.concatenate((X_df1, X_df2))
  y_df = np.concatenate((y_df1, y_df2))
  vocab_count = X_df.shape[0]

  print(f"The length of vocabulary is: {vocab_count}")

  count_vec = CountVectorizer(
    analyzer='word',
    tokenizer= override_fcn,
    preprocessor= override_fcn,
    token_pattern= None,
    max_features = vocab_count)

  counts_combined = count_vec.fit_transform(X_df)
  counts1 = counts_combined[:len(X_df1)]  # First part: Training data
  counts2 = counts_combined[len(X_df1):]  # Second part: Test data

  print(f"vec_train: {counts1.toarray()}")
  print(f"vec_test: {counts2.toarray()}")
  print(f"vec_train shape: {counts1.shape}")
  print(f"vec_test shape: {counts2.shape}")
  return counts1, counts2, count_vec

In [None]:
# Apply Count Vectorizer to the train data
vec_train, vec_test, count_vec= count_vectorizer(train_data, test_data)

The length of vocabulary is: 44949
vec_train: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
vec_test: [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
vec_train shape: (41151, 44949)
vec_test shape: (3798, 44949)


### H. Fit a Naive Bayes model to your data. Report the training and test error of the model. Use accuracy as the error metric. Also, report the 5 most probable words in each class, along with their counts. You might find Scikit-learn’s MultinomialNB() transformer useful. Use Laplace smoothing to prevent probabilities of zero.


In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, roc_curve
import matplotlib.pyplot as plt

In [None]:
y_train = train_data['Sentiment']
y_test = test_data['Sentiment']

In [None]:
# Let's fit the Naive Bayes model to our training data
# Set alpha = 1 to prevent probabilities of zero
nb = MultinomialNB(alpha=1)
# Fit model to training data
nb.fit(vec_train, y_train)
# Predict on train data
y_preds_train = nb.predict(vec_train)
# Predict on test data
y_preds_test = nb.predict(vec_test)

print('Train accuracy with simple Naive Bayes:', accuracy_score(y_train,y_preds_train))
print('Test accuracy with simple Naive Bayes:', accuracy_score(y_test,y_preds_test))

Train accuracy with simple Naive Bayes: 0.7869796602755704
Test accuracy with simple Naive Bayes: 0.6735123749341759


In [None]:
feature_names = count_vec.get_feature_names_out()
class_probs = nb.feature_log_prob_ # The model calculates the conditional log-probabilities of words given each class

# For each class, sort words by their probability and get the top 5 words
for i, class_prob in enumerate(class_probs):
    sorted_idx = np.argsort(class_prob)[::-1]  # Sort indices in descending order of probability
    top_words_idx = sorted_idx[:5]  # Get top 5 words
    top_words = [(feature_names[idx], np.exp(class_prob[idx])) for idx in top_words_idx]  # Convert log probs back to actual probs

    print(f"Class {i} most probable words and counts:")
    for word, prob in top_words:
        print(f"{word}: {prob:.4f}")

Class 0 most probable words and counts:
coronaviru: 0.0193
covid19: 0.0175
price: 0.0125
food: 0.0104
thi: 0.0092
Class 1 most probable words and counts:
coronaviru: 0.0242
covid19: 0.0217
store: 0.0101
supermarket: 0.0092
price: 0.0087
Class 2 most probable words and counts:
coronaviru: 0.0185
covid19: 0.0183
store: 0.0097
thi: 0.0094
price: 0.0083


### I. Would it be appropriate to fit an ROC curve in this scenario? If yes, explain why. If no, explain why not.

The ROC curve is designed for binary classification, comparing the True Positive Rate (TPR) against the False Positive Rate (FPR), which are defined based on a single positive class and a single negative class. However, in this case, we have a multiclass classification problem with three sentiment labels (0, 1, 2).

While it is possible to apply the ROC curve by converting the problem into a one-vs-rest approach—treating one class as positive and the other two as negative—this may not be the most appropriate or optimal evaluation metric. Converting to binary classification restricts the evaluation scope, reducing the ability to capture relationships between all three classes simultaneously.


### J. Redo parts G-H using TF-IDF vectors instead of count vectors. You might find Scikitlearn’s TfidfVectorizer() transformer useful. Report the training and test accuracy. How does this compare to the accuracy using count vectors?  


Redo part G

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

# TF-IDF Vectorizer
def tfidf_vectorizer(counts1, counts2):
    tfidf_train = tfidf.fit_transform(counts1)
    tfidf_test = tfidf.fit_transform(counts2)
    print(f"TF-IDF vec_train shape: {tfidf_train.shape}")
    print(f"TF-IDF vec_test shape: {tfidf_test.shape}")

    return tfidf_train, tfidf_test

In [None]:
# Apply TF-IDF Vectorizer to the train and test data
tfidf_train, tfidf_test = tfidf_vectorizer(vec_train, vec_test)

TF-IDF vec_train shape: (41151, 44949)
TF-IDF vec_test shape: (3798, 44949)


Redo part H

In [None]:
# Let's fit the Naive Bayes model to our training data
nb = MultinomialNB(alpha=1)
# Fit model to training data
nb.fit(tfidf_train, y_train)
# Predict on train data
y_preds_train = nb.predict(tfidf_train)
# Predict on test data
y_preds_test = nb.predict(tfidf_test)

print('Train accuracy with simple Naive Bayes:', accuracy_score(y_train,y_preds_train))
print('Test accuracy with simple Naive Bayes:', accuracy_score(y_test,y_preds_test))

Train accuracy with simple Naive Bayes: 0.7161186848436247
Test accuracy with simple Naive Bayes: 0.6327014218009479


In [None]:
feature_names = count_vec.get_feature_names_out()
class_probs = nb.feature_log_prob_

# For each class, sort words by their probability and get the top 5 words
for i, class_prob in enumerate(class_probs):
    sorted_idx = np.argsort(class_prob)[::-1]  # Sort indices in descending order of probability
    top_words_idx = sorted_idx[:5]  # Get top 5 words
    top_words = [(feature_names[idx], np.exp(class_prob[idx])) for idx in top_words_idx]  # Convert log probs back to actual probs

    print(f"Class {i} most probable words and counts:")
    for word, prob in top_words:
        print(f"{word}: {prob:.4f}")

Class 0 most probable words and counts:
coronaviru: 0.0047
covid19: 0.0043
price: 0.0040
food: 0.0038
thi: 0.0031
Class 1 most probable words and counts:
coronaviru: 0.0047
covid19: 0.0044
store: 0.0030
supermarket: 0.0026
groceri: 0.0026
Class 2 most probable words and counts:
covid19: 0.0047
coronaviru: 0.0047
store: 0.0036
thi: 0.0033
groceri: 0.0032


### K. Recall lemmatization converts each word to its base form, which is a bit stronger than simply taking the stem. Redo parts E-H using TF-IDF vectors instead of count vectors. This time use lemmatization instead of stemming. Report train and test accuracy. How does the accuracy with lemmatization compare to the accuracy with stemming?


In [None]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
def lemmatize_tokens(df):
    lemmatizer = WordNetLemmatizer()
    df['lemmatized_tokens'] = df['cleaned_tokens'].apply(lambda tokens: [lemmatizer.lemmatize(t) for t in tokens])

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')
# print the top 100 most popular english words
#sw = stopwords.words('english')[:100]
sw = stopwords.words('english')
def remove_lemmatized_stopwords(df):
  tokens_no_sw = []
  for row in df['lemmatized_tokens']:
    tokens_no_sw.append([t for t in row if t not in sw])
  df['lemmatized_tokens_no_sw'] = tokens_no_sw

def process_data(df):
  tokenize_text(df)
  remove_url(df)
  add_cleaned_tokens(df)
  lemmatize_tokens(df)
  remove_lemmatized_stopwords(df)
  return df

train_data = process_data(covid_train)
test_data = process_data(covid_test)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,OriginalTweet,Sentiment,tokens,cleaned_tokens,stemmed_tokens,tokens_no_sw,lemmatized_tokens,lemmatized_tokens_no_sw
0,0,TRENDING: New Yorkers encounter empty supermar...,0,"[TRENDING:, New, Yorkers, encounter, empty, su...","[trending, new, yorkers, encounter, empty, sup...","[trend, new, yorker, encount, empti, supermark...","[trend, new, yorker, encount, empti, supermark...","[trending, new, yorkers, encounter, empty, sup...","[trending, new, yorkers, encounter, empty, sup..."
1,1,When I couldn't find hand sanitizer at Fred Me...,2,"[When, I, couldn't, find, hand, sanitizer, at,...","[when, i, couldnt, find, hand, sanitizer, at, ...","[when, i, couldnt, find, hand, sanit, at, fred...","[couldnt, find, hand, sanit, fred, meyer, turn...","[when, i, couldnt, find, hand, sanitizer, at, ...","[couldnt, find, hand, sanitizer, fred, meyer, ..."
2,2,Find out how you can protect yourself and love...,2,"[Find, out, how, you, can, protect, yourself, ...","[find, out, how, you, can, protect, yourself, ...","[find, out, how, you, can, protect, yourself, ...","[find, protect, love, one, coronaviru]","[find, out, how, you, can, protect, yourself, ...","[find, protect, loved, one, coronavirus]"
3,3,#Panic buying hits #NewYork City as anxious sh...,0,"[#Panic, buying, hits, #NewYork, City, as, anx...","[panic, buying, hits, newyork, city, as, anxio...","[panic, buy, hit, newyork, citi, as, anxiou, s...","[panic, buy, hit, newyork, citi, anxiou, shopp...","[panic, buying, hit, newyork, city, a, anxious...","[panic, buying, hit, newyork, city, anxious, s..."
4,4,#toiletpaper #dunnypaper #coronavirus #coronav...,1,"[#toiletpaper, #dunnypaper, #coronavirus, #cor...","[toiletpaper, dunnypaper, coronavirus, coronav...","[toiletpap, dunnypap, coronaviru, coronavirusa...","[toiletpap, dunnypap, coronaviru, coronavirusa...","[toiletpaper, dunnypaper, coronavirus, coronav...","[toiletpaper, dunnypaper, coronavirus, coronav..."


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

def override_fcn(doc):
    # We expect a list of tokens as input
    return doc

# TF-IDF Vectorizer
def tfidf_vectorizer(df1, df2):
    X_df1, y_df1 = df1['lemmatized_tokens_no_sw'].to_numpy(), df1['Sentiment'].to_numpy()
    X_df2, y_df2 = df2['lemmatized_tokens_no_sw'].to_numpy(), df2['Sentiment'].to_numpy()
    X_df = np.concatenate((X_df1, X_df2))
    y_df = np.concatenate((y_df1, y_df2))
    vocab_count = X_df.shape[0]

    print(f"Vocabulary size: {vocab_count}")

    tfidf_vec = TfidfVectorizer(
        analyzer='word',
        tokenizer=override_fcn,
        preprocessor=override_fcn,
        token_pattern=None,
        max_features=vocab_count)

    tfidf_combined = tfidf_vec.fit_transform(X_df)
    tfidf1 = tfidf_combined[:len(X_df1)]  # First part: Training data
    tfidf2 = tfidf_combined[len(X_df1):]  # Second part: Test data

    print(f"TF-IDF vec_train shape: {tfidf1.shape}")
    print(f"TF-IDF vec_test shape: {tfidf2.shape}")
    return tfidf1, tfidf2, tfidf_vec

# Apply TF-IDF Vectorizer to the train and test data
tfidf_train, tfidf_test, tfidf_vec = tfidf_vectorizer(train_data, test_data)

Vocabulary size: 44949
TF-IDF vec_train shape: (41151, 44949)
TF-IDF vec_test shape: (3798, 44949)


In [None]:
# Let's fit the Naive Bayes model to our training data
nb = MultinomialNB(alpha=1)
# Fit model to training data
nb.fit(tfidf_train, y_train)
# Predict on train data
y_preds_train = nb.predict(tfidf_train)
# Predict on test data
y_preds_test = nb.predict(tfidf_test)

print('Train accuracy with simple Naive Bayes:',accuracy_score(y_train,y_preds_train))
print('Test accuracy with simple Naive Bayes:',accuracy_score(y_test,y_preds_test))

Train accuracy with simple Naive Bayes: 0.726324998177444
Test accuracy with simple Naive Bayes: 0.6408636124275935


In [None]:
feature_names = count_vec.get_feature_names_out()
class_probs = nb.feature_log_prob_

# For each class, sort words by their probability and get the top 5 words
for i, class_prob in enumerate(class_probs):
    sorted_idx = np.argsort(class_prob)[::-1]  # Sort indices in descending order of probability
    top_words_idx = sorted_idx[:5]  # Get top 5 words
    top_words = [(feature_names[idx], np.exp(class_prob[idx])) for idx in top_words_idx]  # Convert log probs back to actual probs

    print(f"Class {i} most probable words and counts:")
    for word, prob in top_words:
        print(f"{word}: {prob:.4f}")

Class 0 most probable words and counts:
clickshopperkit: 0.0046
compartmentalis: 0.0041
resuscit: 0.0039
followgtgt: 0.0036
stonecold2050: 0.0029
Class 1 most probable words and counts:
clickshopperkit: 0.0047
compartmentalis: 0.0042
std: 0.0029
stonecold2050: 0.0026
gunk: 0.0025
Class 2 most probable words and counts:
clickshopperkit: 0.0046
compartmentalis: 0.0045
std: 0.0034
gunk: 0.0031
stonecold2050: 0.0029


### Bonus:  Is the Naive Bayes model generative or discriminative? Explain your response.

-> The Naive Bayes model is a generative model.

-> The Naive Bayes in this context tries to learn how the words are generated given the sentimental level. They estimate the probability of sentimental labels and the likelihood of words given the label, and then use Bayes' theorem to estimate the probability of the label given the words. This principle matches with how a generative model works - learning the likelihood of P(X|Y) and making it generative for P(Y|X).