In [1]:
pip install scikit-learn pandas matplotlib seaborn google-cloud-aiplatform openai


Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Keep only the two useful columns of the raw file and give them readable names.
raw_to_clean_columns = {"v1": "label", "v2": "message"}
df = (
    pd.read_csv("spam.csv", encoding="latin-1")[list(raw_to_clean_columns)]
    .rename(columns=raw_to_clean_columns)
)

# Numeric target for the classifiers: ham -> 0, spam -> 1.
df["label_num"] = df["label"].map({"ham": 0, "spam": 1})


In [3]:
# Preview the first rows to confirm the renamed columns and the numeric label.
df.head()

Unnamed: 0,label,message,label_num
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Hold out 20% of the messages for evaluation; the fixed seed makes the split repeatable.
sms_messages, sms_labels = df['message'], df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    sms_messages,
    sms_labels,
    test_size=0.2,
    random_state=42,
)

# Bag-of-words counts: the vocabulary is learned on the training split only and
# then reused unchanged to encode the test split.
count_vec = CountVectorizer(stop_words='english')
X_train_vec = count_vec.fit_transform(X_train)
X_test_vec = count_vec.transform(X_test)

# fit() returns the estimator itself, so construction and training can be chained.
naive_bayes = MultinomialNB().fit(X_train_vec, y_train)

# Score the held-out messages and print the per-class report.
predictions = naive_bayes.predict(X_test_vec)
nb_report = classification_report(y_test, predictions)
print("=== Naive Bayes ===")
print(nb_report)


=== Naive Bayes ===
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       965
           1       0.96      0.92      0.94       150

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [6]:
import pandas as pd
import vertexai
from vertexai.generative_models import GenerativeModel, Part, HarmCategory, HarmBlockThreshold
from google.api_core.exceptions import NotFound

# --- Configuration for the Vertex AI experiment ---
PROJECT_ID = "our-pursuit-426205-c0"  # project handed to vertexai.init
REGION = "us-central1"                # location handed to vertexai.init
MODEL_NAME = "gemini-2.5-flash-lite"  # model name handed to GenerativeModel
SAMPLE_SIZE = 50                      # upper bound on messages sampled per class


# Point the Vertex AI SDK at the configured project/region and build the model
# handle used by the classification loop below.
try:
    vertexai.init(project=PROJECT_ID, location=REGION)
    model = GenerativeModel(MODEL_NAME)
except Exception as e:
    print(f"Failed to initialize Vertex AI: {e}")
    print("Please ensure your project ID and region are correct.")
    print("You may also need to run 'gcloud auth application-default login' in your terminal.")
    # Re-raise instead of calling exit(): inside a Jupyter kernel exit() asks the
    # kernel to shut down rather than raising, so the remaining statements of the
    # cell would still run (with `model` undefined) before the kernel dies.
    raise


# Load the SMS dataset and draw a class-balanced sample for the LLM evaluation.
#
# NOTE: this rebinds `df` to a frame with columns ['label', 'text']; the frame
# built in the first loading cell (columns 'label', 'message', 'label_num') is
# no longer reachable under that name after this cell has run.
#
# Failures are re-raised rather than handled with exit(): in a Jupyter kernel
# exit() requests a kernel shutdown instead of raising, so the statements after
# it (for example the sampling after the empty-class guard) would still execute.
try:
    # header=0 discards the file's own header row and `names` supplies the column
    # names; assumes the CSV has five columns, of which the last three are dropped.
    df = pd.read_csv('spam.csv', encoding='latin-1', header=0, names=['label', 'text', 'col3', 'col4', 'col5'])
    df = df.drop(columns=['col3', 'col4', 'col5'])
    df['label'] = df['label'].str.lower()

    # Split by class once and reuse the subsets for both the report and the sampling.
    df_ham = df[df['label'] == 'ham']
    df_spam = df[df['label'] == 'spam']

    print("--- Debugging Loaded Data ---")
    print("DataFrame head:")
    print(df.head())
    print("\nUnique values in the 'label' column:")
    print(df['label'].unique())
    print(f"\nTotal 'ham' messages: {len(df_ham)}")
    print(f"Total 'spam' messages: {len(df_spam)}")
    print("-----------------------------\n")

    if df_ham.empty or df_spam.empty:
        raise ValueError("Error: One of the label classes ('ham' or 'spam') is empty.")

    # Never request more rows than a class actually has.
    num_ham_samples = min(len(df_ham), SAMPLE_SIZE)
    num_spam_samples = min(len(df_spam), SAMPLE_SIZE)

    print(f"Sampling {num_ham_samples} 'ham' messages and {num_spam_samples} 'spam' messages...")

    # Fixed seeds keep the evaluation sample identical between runs.
    df_ham_sample = df_ham.sample(n=num_ham_samples, random_state=42)
    df_spam_sample = df_spam.sample(n=num_spam_samples, random_state=42)

    # Ham rows first, then spam rows, re-indexed from 0.
    sample_df = pd.concat([df_ham_sample, df_spam_sample]).reset_index(drop=True)

except FileNotFoundError:
    print("Error: 'spam.csv' not found. Please ensure the file is in the same directory as this script.")
    raise
except Exception as e:
    print(f"An error occurred during data loading or processing: {e}")
    raise


# Few-shot prompt sent to the model for every message. The only placeholder is
# {sms_text}, filled in with str.format by the classification loop; the model is
# asked to answer with the single word "spam" or "ham".
# NOTE(review): Example 4 looks like a message from spam.csv itself (compare the
# second row of the data preview), so it could also be drawn into the evaluation
# sample - consider excluding the few-shot examples from the sample.
prompt_template = """
You are an expert SMS spam classifier. Your task is to classify a given SMS message
as either "spam" or "ham" (not spam).

Here are some examples to guide you:

Example 1:
Message: "WINNER!! As a valued network customer you have been selected to receive a £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only."
Classification: spam

Example 2:
Message: "I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today."
Classification: ham

Example 3:
Message: "URGENT! You have won a 1 week FREE membership in our £100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18."
Classification: spam

Example 4:
Message: "Ok lar... Joking wif u oni"
Classification: ham

Now, classify the following message. Respond with only the word "spam" or "ham".

Message: "{sms_text}"
Classification:
"""


def _normalize_prediction(raw_text):
    """Reduce a raw model reply to 'spam' or 'ham' when it is unambiguous.

    The reply is lower-cased and surrounding whitespace, quotes, markdown
    asterisks and punctuation are removed, so answers such as 'Spam.' or
    '"ham"' are still recognised. Any other reply is returned lower-cased and
    stripped of whitespace only, so it stays visible in the log and can be
    filtered out (and counted) during evaluation.
    """
    lowered = str(raw_text).strip().lower()
    bare = lowered.strip(" \t\r\n\"'`*.,:;!?")
    return bare if bare in ("spam", "ham") else lowered


# Parallel lists: predictions[i] is the model's answer for true_labels[i].
predictions = []
true_labels = []

# The request settings are identical for every message, so build them once.
generation_config = {"temperature": 0.0}
safety_settings = {
    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
}

print("Classifying messages...")
for raw_text, raw_label in zip(sample_df['text'], sample_df['label']):
    sms_text = str(raw_text)
    true_label = str(raw_label)
    full_prompt = prompt_template.format(sms_text=sms_text)

    try:
        response = model.generate_content(
            full_prompt,
            generation_config=generation_config,
            safety_settings=safety_settings,
        )
        prediction = _normalize_prediction(response.text)
        predictions.append(prediction)
        true_labels.append(true_label)

        print(f"Message: '{sms_text[:50]}...' -> Predicted: {prediction}, Actual: {true_label}")

    except NotFound as e:
        # A missing project/region/model is a configuration error: every later
        # request would fail the same way, so stop here instead of recording an
        # "error" for each remaining message.
        print(f"A NotFound error occurred. Please verify your project ID, region, and model name: {e}")
        raise
    except Exception as e:
        # Per-message failures are recorded as "error" so both lists stay
        # aligned; the evaluation step filters these entries out.
        print(f"An unexpected error occurred for message '{sms_text[:50]}...': {e}")
        predictions.append("error")
        true_labels.append(true_label)


from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Score only the messages for which the model produced a recognised label.
# Entries such as "error" or free-text replies are excluded, and the number of
# exclusions is reported so the accuracy is not read as covering the full sample.
valid_pairs = [
    (pred, true)
    for pred, true in zip(predictions, true_labels)
    if pred in ("spam", "ham")
]
valid_predictions = [pred for pred, _ in valid_pairs]
valid_true_labels = [true for _, true in valid_pairs]

num_excluded = len(predictions) - len(valid_pairs)
if num_excluded:
    print(f"\nExcluded {num_excluded} of {len(predictions)} messages without a valid 'spam'/'ham' prediction.")

if valid_true_labels:
    accuracy = accuracy_score(valid_true_labels, valid_predictions)
    # Fixed label order: row/column 0 is 'ham', row/column 1 is 'spam'.
    conf_matrix = confusion_matrix(valid_true_labels, valid_predictions, labels=['ham', 'spam'])

    print("\n--- Evaluation Results ---")
    print(f"Accuracy: {accuracy:.2f}")
    print("\nConfusion Matrix:")
    print(conf_matrix)
    print("\nLabels: ['ham' 'spam']")
else:
    print("\nNo valid predictions to evaluate. Check for errors during classification.")



--- Debugging Loaded Data ---
DataFrame head:
  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...

Unique values in the 'label' column:
['ham' 'spam']

Total 'ham' messages: 4825
Total 'spam' messages: 747
-----------------------------

Sampling 50 'ham' messages and 50 'spam' messages...
Classifying messages...
Message: 'I am late,so call you tomorrow morning.take care s...' -> Predicted: ham, Actual: ham
Message: 'U r too much close to my heart. If u go away i wil...' -> Predicted: ham, Actual: ham
Message: 'Wait  &lt;#&gt;  min.....' -> Predicted: ham, Actual: ham
Message: 'Can you call me plz. Your number shows out of cove...' -> Predicted: ham, Actual: ham
Message: 'MAYBE IF YOU WOKE 