-Create a function that applies 3-5 basic text cleaning methods, and apply it to the text column.

-Create BOW features for the given text data.

-Train Naive Bayes, Random Forest, and XGBoost models.
    Train each model once with cleaning and once without cleaning.

-Repeat the same with TF-IDF.
-Explore ensemble methods for classification.

In [1]:
import numpy as np
import pandas as pd

In [7]:
# Load the spam dataset, decoding the file as latin1.
# NOTE: this is an absolute, machine-specific path; the cell only runs where
# the file exists at exactly this location.
spam_csv_path = r'C:\Users\laava\Desktop\sem 6\AOML\spam.csv'
df = pd.read_csv(spam_csv_path, encoding='latin1')
df.head(2)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,


In [8]:
# Drop the three extra "Unnamed" columns; nothing below uses them and the
# previewed rows show them empty.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the columns are already gone no
# longer raises KeyError.
df = df.drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], errors='ignore')
df.head(2)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...


In [9]:
import re
import string

def clean_text(text):
    """Normalize one raw message before it is vectorized.

    The steps run in this order: lowercase, strip every character in
    ``string.punctuation``, remove runs of digits, and collapse any
    whitespace run into a single space (leading/trailing whitespace is
    trimmed as part of that).

    Args:
        text: The message to clean. Missing values (``None`` or a float
            NaN, which is how pandas represents an empty cell) produce an
            empty string; any other non-string value is converted with
            ``str()`` first.

    Returns:
        The cleaned text as a ``str``.
    """
    # NaN is the only float that is not equal to itself, so this detects a
    # missing cell without needing pandas/numpy inside the function.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation (characters are deleted, not replaced by a space)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove extra whitespace
    return ' '.join(text.split())

# Build the cleaned text column by running clean_text over every message in v2
raw_messages = df['v2']
df['v2_cleaned'] = raw_messages.map(clean_text)
df.head(2)

Unnamed: 0,v1,v2,v2_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only in ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni


In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words vectorizer with default settings
vectorizer = CountVectorizer()

# Learn the vocabulary from the cleaned messages and count term occurrences
X_bow = vectorizer.fit_transform(df['v2_cleaned'])

# Preview the counts as a labelled table.
# toarray() materializes the whole sparse matrix as a dense array, so this
# frame is only meant for inspection, not for training.
vocabulary = vectorizer.get_feature_names_out()
bow_df = pd.DataFrame(data=X_bow.toarray(), columns=vocabulary)
bow_df.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathilove,aathiwhere,ab,abbey,abdomen,abeg,...,ìï,ìïll,ûthanks,ûªm,ûªt,ûªve,ûï,ûïharry,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

# Encode the string class labels in `v1` as integers for the classifiers
label_encoder = LabelEncoder()
df['v1_encoded'] = label_encoder.fit_transform(df['v1'])

# Hold out 20% of the rows for evaluation (fixed seed so the split is repeatable).
# NOTE(review): X_bow comes from a vectorizer that was fitted on the full
# corpus before this split, so vocabulary from the test messages leaks into
# the features. Left unchanged here so the numbers stay comparable with the
# other experiments in this notebook; consider splitting the text first and
# fitting the vectorizer on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X_bow, df['v1_encoded'], test_size=0.2, random_state=42)

# Train a Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_predictions = nb_model.predict(X_test)
print("Naive Bayes Accuracy:", accuracy_score(y_test, nb_predictions))
print("Naive Bayes Classification Report:\n", classification_report(y_test, nb_predictions))

# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict(X_test)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_predictions))
print("Random Forest Classification Report:\n", classification_report(y_test, rf_predictions))

# Train an XGBoost model.
# `use_label_encoder` is dropped: the installed XGBoost ignores it and only
# prints 'Parameters: { "use_label_encoder" } are not used.'.
# The target has two classes, so the binary 'logloss' metric replaces the
# multiclass 'mlogloss'; no eval_set is passed, so the fitted model and its
# predictions are unaffected by the metric choice.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train, y_train)
xgb_predictions = xgb_model.predict(X_test)
print("XGBoost Accuracy:", accuracy_score(y_test, xgb_predictions))
print("XGBoost Classification Report:\n", classification_report(y_test, xgb_predictions))

Naive Bayes Accuracy: 0.9704035874439462
Naive Bayes Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.98       965
           1       0.88      0.91      0.89       150

    accuracy                           0.97      1115
   macro avg       0.93      0.94      0.94      1115
weighted avg       0.97      0.97      0.97      1115

Random Forest Accuracy: 0.968609865470852
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.77      0.87       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.93      1115
weighted avg       0.97      0.97      0.97      1115

XGBoost Accuracy: 0.9775784753363229
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           

Parameters: { "use_label_encoder" } are not used.



In [13]:
# Create BOW features for the uncleaned text data.
# A dedicated vectorizer is used instead of refitting the shared `vectorizer`:
# refitting would silently replace the vocabulary that X_bow / bow_df were
# built from, leaving that object out of sync with the cleaned features.
# Note that CountVectorizer's default settings lowercase the text and
# tokenize on word characters, so part of the cleaning still happens
# implicitly in this "uncleaned" run.
vectorizer_uncleaned = CountVectorizer()
X_bow_uncleaned = vectorizer_uncleaned.fit_transform(df['v2'])

# Split the data into training and testing sets (same size and seed as the
# cleaned run).
# NOTE(review): as in the cleaned run, the vectorizer is fitted before the
# split, so test vocabulary leaks into the features.
X_train_uncleaned, X_test_uncleaned, y_train_uncleaned, y_test_uncleaned = train_test_split(X_bow_uncleaned, df['v1_encoded'], test_size=0.2, random_state=42)

# Train a Naive Bayes model on uncleaned data
nb_model_uncleaned = MultinomialNB()
nb_model_uncleaned.fit(X_train_uncleaned, y_train_uncleaned)
nb_predictions_uncleaned = nb_model_uncleaned.predict(X_test_uncleaned)
print("Naive Bayes Accuracy (Uncleaned):", accuracy_score(y_test_uncleaned, nb_predictions_uncleaned))
print("Naive Bayes Classification Report (Uncleaned):\n", classification_report(y_test_uncleaned, nb_predictions_uncleaned))

# Train a Random Forest model on uncleaned data
rf_model_uncleaned = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_uncleaned.fit(X_train_uncleaned, y_train_uncleaned)
rf_predictions_uncleaned = rf_model_uncleaned.predict(X_test_uncleaned)
print("Random Forest Accuracy (Uncleaned):", accuracy_score(y_test_uncleaned, rf_predictions_uncleaned))
print("Random Forest Classification Report (Uncleaned):\n", classification_report(y_test_uncleaned, rf_predictions_uncleaned))

# Train an XGBoost model on uncleaned data.
# `use_label_encoder` is dropped because the installed XGBoost ignores it and
# only emits a "Parameters ... are not used" warning; 'logloss' is the metric
# matching the two-class target (no eval_set is passed, so predictions are
# unaffected by the metric choice).
xgb_model_uncleaned = XGBClassifier(eval_metric='logloss')
xgb_model_uncleaned.fit(X_train_uncleaned, y_train_uncleaned)
xgb_predictions_uncleaned = xgb_model_uncleaned.predict(X_test_uncleaned)
print("XGBoost Accuracy (Uncleaned):", accuracy_score(y_test_uncleaned, xgb_predictions_uncleaned))
print("XGBoost Classification Report (Uncleaned):\n", classification_report(y_test_uncleaned, xgb_predictions_uncleaned))

Naive Bayes Accuracy (Uncleaned): 0.97847533632287
Naive Bayes Classification Report (Uncleaned):
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.91      0.93      0.92       150

    accuracy                           0.98      1115
   macro avg       0.95      0.96      0.95      1115
weighted avg       0.98      0.98      0.98      1115

Random Forest Accuracy (Uncleaned): 0.9748878923766816
Random Forest Classification Report (Uncleaned):
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       965
           1       1.00      0.81      0.90       150

    accuracy                           0.97      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

XGBoost Accuracy (Uncleaned): 0.9775784753363229
XGBoost Classification Report (Uncleaned):
               precision    recall  f1-score   sup

Parameters: { "use_label_encoder" } are not used.



In [14]:
# Side-by-side test accuracy of each model, trained with and without text cleaning
model_names = ['Naive Bayes', 'Random Forest', 'XGBoost']
cleaned_runs = (nb_predictions, rf_predictions, xgb_predictions)
uncleaned_runs = (nb_predictions_uncleaned, rf_predictions_uncleaned, xgb_predictions_uncleaned)

results = {
    'Model': model_names,
    'Cleaned Accuracy': [accuracy_score(y_test, preds) for preds in cleaned_runs],
    'Uncleaned Accuracy': [accuracy_score(y_test_uncleaned, preds) for preds in uncleaned_runs],
}

results_df = pd.DataFrame(results)
print(results_df)

           Model  Cleaned Accuracy  Uncleaned Accuracy
0    Naive Bayes          0.970404            0.978475
1  Random Forest          0.968610            0.974888
2        XGBoost          0.977578            0.977578


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the v2_cleaned column.
# NOTE(review): the vectorizer is fitted on every message before the split
# below, so the IDF weights also reflect the test messages (data leakage).
# Left unchanged so the numbers stay comparable with the BOW experiments;
# consider splitting the text first and fitting on the training part only.
X_tfidf = tfidf_vectorizer.fit_transform(df['v2_cleaned'])

# Split the data into training and testing sets (same size and seed as the
# BOW runs)
X_train_tfidf, X_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(X_tfidf, df['v1_encoded'], test_size=0.2, random_state=42)

# Train a Naive Bayes model
nb_model_tfidf = MultinomialNB()
nb_model_tfidf.fit(X_train_tfidf, y_train_tfidf)
nb_predictions_tfidf = nb_model_tfidf.predict(X_test_tfidf)
print("Naive Bayes Accuracy (TF-IDF):", accuracy_score(y_test_tfidf, nb_predictions_tfidf))
print("Naive Bayes Classification Report (TF-IDF):\n", classification_report(y_test_tfidf, nb_predictions_tfidf))

# Train a Random Forest model
rf_model_tfidf = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_tfidf.fit(X_train_tfidf, y_train_tfidf)
rf_predictions_tfidf = rf_model_tfidf.predict(X_test_tfidf)
print("Random Forest Accuracy (TF-IDF):", accuracy_score(y_test_tfidf, rf_predictions_tfidf))
print("Random Forest Classification Report (TF-IDF):\n", classification_report(y_test_tfidf, rf_predictions_tfidf))

# Train an XGBoost model.
# `use_label_encoder` is dropped because the installed XGBoost ignores it and
# only emits a "Parameters ... are not used" warning; 'logloss' is the metric
# matching the two-class target (no eval_set is passed, so predictions are
# unaffected by the metric choice).
xgb_model_tfidf = XGBClassifier(eval_metric='logloss')
xgb_model_tfidf.fit(X_train_tfidf, y_train_tfidf)
xgb_predictions_tfidf = xgb_model_tfidf.predict(X_test_tfidf)
print("XGBoost Accuracy (TF-IDF):", accuracy_score(y_test_tfidf, xgb_predictions_tfidf))
print("XGBoost Classification Report (TF-IDF):\n", classification_report(y_test_tfidf, xgb_predictions_tfidf))

Naive Bayes Accuracy (TF-IDF): 0.9506726457399103
Naive Bayes Classification Report (TF-IDF):
               precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       1.00      0.63      0.78       150

    accuracy                           0.95      1115
   macro avg       0.97      0.82      0.87      1115
weighted avg       0.95      0.95      0.95      1115

Random Forest Accuracy (TF-IDF): 0.9704035874439462
Random Forest Classification Report (TF-IDF):
               precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.78      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.89      0.93      1115
weighted avg       0.97      0.97      0.97      1115



Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy (TF-IDF): 0.9811659192825112
XGBoost Classification Report (TF-IDF):
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.98      0.88      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [16]:
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, VotingClassifier

# Bagging with Random Forest as the base learner.
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn and the old name was later
# removed, whereas the first positional argument works on both sides of that
# change.
bagging_model = BaggingClassifier(RandomForestClassifier(n_estimators=100, random_state=42), n_estimators=10, random_state=42)
bagging_model.fit(X_train, y_train)
bagging_predictions = bagging_model.predict(X_test)
print("Bagging Accuracy:", accuracy_score(y_test, bagging_predictions))
print("Bagging Classification Report:\n", classification_report(y_test, bagging_predictions))

# Boosting with AdaBoost
adaboost_model = AdaBoostClassifier(n_estimators=100, random_state=42)
adaboost_model.fit(X_train, y_train)
adaboost_predictions = adaboost_model.predict(X_test)
print("AdaBoost Accuracy:", accuracy_score(y_test, adaboost_predictions))
print("AdaBoost Classification Report:\n", classification_report(y_test, adaboost_predictions))

# Voting Classifier (majority vote over the three base models).
# VotingClassifier fits clones of the estimators it is given, so fresh,
# unfitted instances with the same settings are passed instead of the models
# fitted in an earlier cell. This also keeps the ignored `use_label_encoder`
# XGBoost parameter (and its warning) out of the ensemble.
voting_model = VotingClassifier(estimators=[
    ('nb', MultinomialNB()),
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('xgb', XGBClassifier(eval_metric='logloss'))
], voting='hard')
voting_model.fit(X_train, y_train)
voting_predictions = voting_model.predict(X_test)
print("Voting Classifier Accuracy:", accuracy_score(y_test, voting_predictions))
print("Voting Classifier Classification Report:\n", classification_report(y_test, voting_predictions))



Bagging Accuracy: 0.9668161434977578
Bagging Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115

AdaBoost Accuracy: 0.9695067264573991
AdaBoost Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98       965
           1       0.93      0.83      0.88       150

    accuracy                           0.97      1115
   macro avg       0.95      0.91      0.93      1115
weighted avg       0.97      0.97      0.97      1115



Parameters: { "use_label_encoder" } are not used.



Voting Classifier Accuracy: 0.9802690582959641
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       965
           1       0.99      0.86      0.92       150

    accuracy                           0.98      1115
   macro avg       0.99      0.93      0.96      1115
weighted avg       0.98      0.98      0.98      1115

