In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

In [2]:
# Read the two-column CSV into a DataFrame. The first line of the file is
# always skipped (it is assumed to be a header row -- TODO confirm against the
# data file) and the columns are named explicitly instead.
file_path = 'text_labels.csv'
msg = pd.read_csv(
    file_path,
    names=['message', 'label'],
    skiprows=1,
)

In [3]:
# Count the null entries in each column so gaps in the data are visible
# before any modelling is done
missing_values = msg.isna().sum()
print(f"Missing values:\n{missing_values}")

Missing values:
message    0
label      0
dtype: int64


In [4]:
# Map the string labels to integers: 'pos' -> 1, 'neg' -> 0.
# Series.map returns NaN for any value that is not a key of the dict, so an
# unexpected label (typo, different casing, stray whitespace) would otherwise
# slip through silently and only surface later as a NaN target at fit time.
# Fail fast here with a message that names the offending values.
label_to_num = {'pos': 1, 'neg': 0}
msg['labelnum'] = msg['label'].map(label_to_num)

unmapped_labels = msg.loc[msg['labelnum'].isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unrecognised label value(s) {list(unmapped_labels)}; "
        f"expected one of {list(label_to_num)}"
    )

In [5]:
# Separate the raw message text (model input) from the numeric label (target)
X, y = msg['message'], msg['labelnum']

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split the same from one run to the next
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each label partition
print('The total number of Training Data:', ytrain.shape)
print('The total number of Test Data:', ytest.shape)

The total number of Training Data: (14,)
The total number of Test Data: (4,)


In [7]:
# Build bag-of-words count matrices. The vocabulary is learned from the
# training messages only; the test messages are then encoded with that same
# fitted vectorizer.
count_vect = CountVectorizer()
xtrain_dtm = count_vect.fit_transform(xtrain)
xtest_dtm = count_vect.transform(xtest)

# Show the vocabulary learned from the training set
vocabulary = count_vect.get_feature_names_out()
print('\nThe words or Tokens in the text documents\n')
print(vocabulary)



The words or Tokens in the text documents

['about' 'am' 'an' 'and' 'awesome' 'bad' 'beers' 'best' 'boss' 'can'
 'dance' 'deal' 'do' 'enemy' 'feel' 'fun' 'good' 'great' 'have' 'holiday'
 'horrible' 'house' 'is' 'juice' 'like' 'locality' 'love' 'my' 'not' 'of'
 'place' 'sick' 'stay' 'stuff' 'taste' 'that' 'the' 'these' 'this' 'tired'
 'to' 'today' 'tomorrow' 'very' 'view' 'we' 'went' 'what' 'will' 'with'
 'work']


In [8]:
# Fit a multinomial Naive Bayes model on the training document-term matrix,
# then predict labels for the held-out test matrix
clf = MultinomialNB()
clf.fit(xtrain_dtm, ytrain)
predicted = clf.predict(xtest_dtm)

In [9]:
# Score the predictions against the true test labels. Each metric is computed
# immediately before it is printed so the output order matches the report order.
accuracy = metrics.accuracy_score(ytest, predicted)
print('\nAccuracy of the classifier is', accuracy)

conf_matrix = metrics.confusion_matrix(ytest, predicted)
print('\nConfusion matrix')
print(conf_matrix)

precision = metrics.precision_score(ytest, predicted)
print('\nThe value of Precision', precision)

recall = metrics.recall_score(ytest, predicted)
print('\nThe value of Recall', recall)


Accuracy of the classifier is 1.0

Confusion matrix
[[2 0]
 [0 2]]

The value of Precision 1.0

The value of Recall 1.0
