In [1]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [16]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [3]:
# Read the data
df = pd.read_csv('/content/drive/MyDrive/ColabNotebooks/AAI540/FinalProject/CEAS_08_cleaned.csv')
df.head(5)

Unnamed: 0,sender,subject,body,label,urls
0,Young Esposito <Young@iworld.de>,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1
1,Mok <ipline's1983@icable.ph>,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1
3,Michael Parker <ivqrnai@pobox.com>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1
4,Gretchen Suggs <externalsep1@loanofficertool.com>,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1


In [4]:
df.shape

(39154, 5)

In [5]:
# shorten data for now
df = df[0:5000]

In [6]:
df.shape

(5000, 5)

In [7]:
# Concatenate the 'subject' and 'body' columns
df['text'] = df['subject'] + ' ' + df['body']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['subject'] + ' ' + df['body']


In [8]:
df.head(5)

Unnamed: 0,sender,subject,body,label,urls,text
0,Young Esposito <Young@iworld.de>,Never agree to be a loser,"Buck up, your troubles caused by small dimensi...",1,1,"Never agree to be a loser Buck up, your troubl..."
1,Mok <ipline's1983@icable.ph>,Befriend Jenna Jameson,\nUpgrade your sex and pleasures with these te...,1,1,Befriend Jenna Jameson \nUpgrade your sex and ...
2,Daily Top 10 <Karmandeep-opengevl@universalnet...,CNN.com Daily Top 10,>+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+=+...,1,1,CNN.com Daily Top 10 >+=+=+=+=+=+=+=+=+=+=+=+=...
3,Michael Parker <ivqrnai@pobox.com>,Re: svn commit: r619753 - in /spamassassin/tru...,Would anyone object to removing .so from this ...,0,1,Re: svn commit: r619753 - in /spamassassin/tru...
4,Gretchen Suggs <externalsep1@loanofficertool.com>,SpecialPricesPharmMoreinfo,\nWelcomeFastShippingCustomerSupport\nhttp://7...,1,1,SpecialPricesPharmMoreinfo \nWelcomeFastShippi...


In [9]:
# Split into independent and dependent variables
X = df['body']
y = df['label']
print(f"X shape = {X.shape}")
print(f"y shape = {y.shape}")

X shape = (5000,)
y shape = (5000,)


In [10]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [11]:
# Create the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Data preprocessing
train_data = vectorizer.fit_transform(X_train)
test_data = vectorizer.transform(X_test)

# Convert to arrays for training
train_data = train_data.toarray()
test_data = test_data.toarray()

In [12]:
print(f"X_train shape = {X_train.shape}")
print(f"y_train shape = {y_train.shape}")

X_train shape = (3750,)
y_train shape = (3750,)


In [13]:
# Build a Naive Bayes Classifier
model = GaussianNB()

# Model training
model.fit(train_data, y_train)

In [19]:
# Predictions
y_pred = model.predict(test_data)

# Performance Metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

Accuracy: 0.9600
Precision: 0.9985
Recall: 0.9325
F1-score: 0.9644
