# Naive Bayes Model

In [None]:
import pandas as pd

# Read the feature CSVs for the training, validation and test splits.
# map() is consumed by the unpacking, so the files are read in listed order.
training_data, validation_data, test_data = map(
    pd.read_csv,
    (
        'data/training_data_features.csv',
        'data/validation_data_features.csv',
        'data/test_data_features.csv',
    ),
)

### Naive Bayes with TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Labels for the two splits used in this cell
y_training_data = training_data['reliable']
y_validation_data = validation_data['reliable']

# The vectorizer is fitted on the training text only; the validation text is
# then mapped through the already-fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer()
tfidf_training_matrix = tfidf_vectorizer.fit_transform(
    training_data['content_stem']
)
tfidf_validation_matrix = tfidf_vectorizer.transform(
    validation_data['content_stem']
)

# Naive Bayes trained on the TF-IDF features alone (fit() returns the estimator)
nb_classifier_tfidf = MultinomialNB().fit(tfidf_training_matrix, y_training_data)

# Validation-set predictions from the TF-IDF features
y_pred_tfidf = nb_classifier_tfidf.predict(tfidf_validation_matrix)

# Header line followed by the per-class report
print(
    "NAIVE BAYES w/ TF-IDF",
    classification_report(y_validation_data, y_pred_tfidf),
    sep="\n",
)

##### Test on Fake News

In [None]:
# Ground-truth labels of the test split
y_test_data = test_data['reliable']

# Reuse the vectorizer fitted on the training text; it is not refitted here
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'])

# Test-set predictions from the TF-IDF-only classifier
y_pred = nb_classifier_tfidf.predict(tfidf_test_matrix)

# Header line followed by the per-class report
print(
    "NAIVE BAYES w/ TF-IDF on Fake News",
    classification_report(y_test_data, y_pred),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the test labels against the current predictions
conf_matrix = confusion_matrix(y_true=y_test_data, y_pred=y_pred)

# Draw the matrix; plot() hands back the display object it was called on.
# NOTE(review): display_labels assumes the two classes sort as (false, true)
# -- confirm against the encoding of the 'reliable' column.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('NAIVE BAYES w/ TF-IDF on Fake News')
plt.show()

##### Test on LIAR

In [None]:
# Rebind test_data to the LIAR test features (replaces the earlier test split)
test_data = pd.read_csv('data/liar_dataset/test_features.csv')

# Ground-truth labels of the LIAR test set
y_test_data = test_data['reliable']

# Reuse the vectorizer fitted on the training text; it is not refitted here
tfidf_test_matrix = tfidf_vectorizer.transform(test_data['content_stem'])

# LIAR predictions from the TF-IDF-only classifier
y_pred = nb_classifier_tfidf.predict(tfidf_test_matrix)

# Header line followed by the per-class report
print(
    "NAIVE BAYES w/ TF-IDF on LIAR",
    classification_report(y_test_data, y_pred),
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the test labels against the current predictions
conf_matrix = confusion_matrix(y_true=y_test_data, y_pred=y_pred)

# Draw the matrix; plot() hands back the display object it was called on.
# NOTE(review): display_labels assumes the two classes sort as (false, true)
# -- confirm against the encoding of the 'reliable' column.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('NAIVE BAYES w/ TF-IDF on LIAR')
plt.show()

### Naive Bayes with Transformer

In [None]:
import pandas as pd

# Load the embedding tables. `src` is rebound to each path just before the
# read, so it always names the file most recently requested.

# training embeddings
training_embeddings = pd.read_csv(src := 'data/training_data_embeddings.csv')

# validation embeddings
validation_embeddings = pd.read_csv(src := 'data/validation_data_embeddings.csv')

# test features are read again here: the TF-IDF LIAR cell rebinds `test_data`
# to the LIAR set, and the labels below must come from the Fake News test split
test_data = pd.read_csv(src := 'data/test_data_features.csv')

# test embeddings
test_embeddings = pd.read_csv(src := 'data/test_data_embeddings.csv')

In [None]:
# REF: https://scikit-learn.org/stable/tutorial/text_analytics/working_with_text_data.html

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler

# Features come from the embedding tables, labels from the feature tables.
# NOTE(review): assumes the rows of each embeddings CSV line up one-to-one
# with the rows of the matching features CSV -- TODO confirm.
X_train, Y_train = training_embeddings, training_data['reliable']
X_val, Y_val = validation_embeddings, validation_data['reliable']

# Scaler to account for negative values in the embeddings: every feature is
# rescaled using the per-feature minimum and maximum of the training split.
# REF: https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
# NOTE(review): validation values outside the training range still map outside
# [0, 1]; MinMaxScaler(clip=True) would bound them -- confirm whether wanted.
scaler = MinMaxScaler()
# fit_transform learns the min/max from X_train and scales it in one call
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train))
# the validation split only reuses the statistics learned above
X_val_scaled = pd.DataFrame(scaler.transform(X_val))

# Naive Bayes model on the scaled embeddings
clf = MultinomialNB()
clf.fit(X_train_scaled, Y_train)

# Validation predictions, then header line followed by the per-class report
y_pred = clf.predict(X_val_scaled)
print(
    'NAIVE BAYES w/ TRANSFORMER',
    classification_report(Y_val, y_pred),
    sep='\n',
)

##### Test on Fake News

In [None]:
# Test features (embeddings) and their labels
X_test, Y_test = test_embeddings, test_data['reliable']

# Apply the scaler fitted on the training embeddings; it is not refitted here
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

# Test predictions, then header line followed by the per-class report
nbayes_pred = clf.predict(X_test_scaled)
print(
    'NAIVE BAYES w/ TRANSFORMER on Fake News',
    classification_report(Y_test, nbayes_pred),
    sep='\n',
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the test labels against the Naive Bayes predictions
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=nbayes_pred)

# Draw the matrix; plot() hands back the display object it was called on.
# NOTE(review): display_labels assumes the two classes sort as (false, true)
# -- confirm against the encoding of the 'reliable' column.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('NAIVE BAYES w/ TRANSFORMER on Fake News')
plt.show()

##### Test on LIAR

In [None]:

# Rebind the test tables to the LIAR dataset. `src` is rebound to each path
# just before the read, so it ends up naming the embeddings file.
test_data = pd.read_csv(src := 'data/liar_dataset/test_features.csv')
test_embeddings = pd.read_csv(src := 'data/liar_dataset/test_embeddings.csv')

# LIAR features (embeddings) and their labels
X_test, Y_test = test_embeddings, test_data['reliable']

# Apply the scaler fitted on the training embeddings; it is not refitted here
X_test_scaled = pd.DataFrame(scaler.transform(X_test))

# LIAR predictions, then header line followed by the per-class report
nbayes_pred = clf.predict(X_test_scaled)
print(
    'NAIVE BAYES w/ TRANSFORMER on Liar',
    classification_report(Y_test, nbayes_pred),
    sep='\n',
)

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the test labels against the Naive Bayes predictions
conf_matrix = confusion_matrix(y_true=Y_test, y_pred=nbayes_pred)

# Draw the matrix; plot() hands back the display object it was called on.
# NOTE(review): display_labels assumes the two classes sort as (false, true)
# -- confirm against the encoding of the 'reliable' column.
disp = ConfusionMatrixDisplay(
    conf_matrix,
    display_labels=['false', 'true'],
).plot()
plt.title('NAIVE BAYES w/ TRANSFORMER on LIAR')
plt.show()