In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import joblib
import os

# Fetch the NLTK data used by the preprocessing cell below: 'punkt' backs
# word_tokenize, 'stopwords' provides the English stopword list and 'wordnet'
# backs WordNetLemmatizer.
NLTK_PACKAGES = ('punkt', 'stopwords', 'wordnet')

# nltk.download() signals failure through its boolean return value instead of
# raising, so collect the failures here rather than letting them surface later
# as a LookupError in the middle of the per-row preprocessing.
failed_packages = [package for package in NLTK_PACKAGES if not nltk.download(package)]
if failed_packages:
    # A previously installed copy may still be usable (e.g. when offline),
    # so warn instead of aborting the notebook.
    print(
        "WARNING: NLTK could not download {}; preprocessing will fail if "
        "these packages are not already installed.".format(failed_packages)
    )


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### **Text preprocessing**

In [3]:
# Holds the NLTK objects shared by every preprocess_text call. They are built
# lazily on first use so that stopwords.words('english') is not called and a
# WordNetLemmatizer is not constructed once per row.
_PREPROCESS_CACHE = {}

# Translation table that deletes every character in string.punctuation
# (ASCII punctuation only).
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def _get_stop_words():
    """Return the English stopword set, loading it from NLTK on first use only."""
    if 'stop_words' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stop_words'] = frozenset(stopwords.words('english'))
    return _PREPROCESS_CACHE['stop_words']


def _get_lemmatizer():
    """Return a shared WordNetLemmatizer, creating it on first use only."""
    if 'lemmatizer' not in _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_CACHE['lemmatizer']


def preprocess_text(text):
    """Clean a single document for vectorization.

    Steps, in order:
      1. Delete every character found in ``string.punctuation``.
      2. Tokenize with ``word_tokenize`` and drop tokens whose lowercase form
         is an English stopword (the surviving tokens keep their original case).
      3. Remove the typographic apostrophe ``’``, which is not part of
         ``string.punctuation`` and therefore survives step 1.
      4. Tokenize again and lemmatize each token with ``WordNetLemmatizer``.

    Args:
        text: The raw document as a ``str``.

    Returns:
        The processed tokens joined by single spaces.

    NOTE(review): the saved vectorizer loaded later in this notebook was
    presumably fitted on text produced by these exact steps - confirm before
    changing their order or behavior.
    """
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove stopwords
    stop_words = _get_stop_words()
    tokens = [word for word in word_tokenize(text) if word.lower() not in stop_words]
    text = ' '.join(tokens)

    # Remove special characters
    text = text.replace('’', '')

    # Lemmatize text. The string is re-tokenized because stripping the
    # apostrophe above can change where the token boundaries fall.
    lemmatizer = _get_lemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in word_tokenize(text)]

    return ' '.join(tokens)


In [5]:

# Read the evaluation dataset, discard rows that have no text, and replace the
# 'text' column with its preprocessed form in a single chained expression.
file_path = '/content/drive/MyDrive/notebooks/new_dataset.csv'
df = (
    pd.read_csv(file_path)
    .dropna(subset=['text'])
    .assign(text=lambda frame: frame['text'].apply(preprocess_text))
)


### **Load and evaluate the model**

In [6]:
# Restore the persisted classifier and its matching vectorizer from Drive.
# Both artifacts live in the same directory; build the two paths together.
preprocessing_model_dir = '/content/drive/MyDrive/models/naive_bayes/'
model_path, vectorizer_path = (
    os.path.join(preprocessing_model_dir, artifact_name)
    for artifact_name in (
        'naive_bayes_preprocessing.joblib',
        'vectorizer_preprocessing.joblib',
    )
)

model, vectorizer = joblib.load(model_path), joblib.load(vectorizer_path)


In [7]:
# Turn the preprocessed documents into the feature matrix the model consumes,
# using the already-fitted vectorizer (transform only, no re-fitting).
preprocessed_texts = df['text']
X_new = vectorizer.transform(preprocessed_texts)


In [8]:

# Compare the model's predictions against the ground-truth 'label' column and
# print per-class precision / recall / F1.
class_names = ['Class 0', 'Class 1']
true_labels = df['label'].values
predictions = model.predict(X_new)

report = classification_report(
    true_labels,
    predictions,
    target_names=class_names,
)
print(report)


              precision    recall  f1-score   support

     Class 0       0.62      0.83      0.71     21417
     Class 1       0.78      0.54      0.64     23481

    accuracy                           0.68     44898
   macro avg       0.70      0.69      0.68     44898
weighted avg       0.70      0.68      0.67     44898

