In [None]:
from pathlib import Path

import pandas as pd

# Folder on Google Drive that holds both spam CSV exports.
DATA_DIR = Path('/content/drive/MyDrive/ColabML')

# The CSVs live on Google Drive. Mount it here if it is not mounted yet so this
# cell also works on a fresh kernel (Restart & Run All).
if not DATA_DIR.is_dir():
    from google.colab import drive
    drive.mount('/content/drive')

spam_1 = pd.read_csv(DATA_DIR / 'spam.csv', encoding="latin1")
spam_2 = pd.read_csv(DATA_DIR / 'SPAM text message 20170820 - Data.csv', encoding="latin1")

# Bring both sources to the same two-column schema: "label" and "text".
# Selecting with [[...]] raises a KeyError if an expected column is missing,
# instead of silently producing an all-NaN column.
dataset_1 = spam_1[["v1", "v2"]].rename(columns={"v1": "label", "v2": "text"})
dataset_2 = spam_2[["Category", "Message"]].rename(columns={"Category": "label", "Message": "text"})

# Keep only the rows labelled as spam in each source.
all_spam_1 = dataset_1[dataset_1["label"] == "spam"]
all_spam_2 = dataset_2[dataset_2["label"] == "spam"]

# ignore_index=True gives the combined frame a clean 0..n-1 index rather than
# repeating the row labels of the two source frames.
# NOTE(review): if the two files overlap, identical messages can later end up in
# both the train and the test split - consider de-duplicating on "text"; TODO confirm.
all_spam = pd.concat([all_spam_1, all_spam_2], ignore_index=True)
all_spam["text"]

In [None]:
# Mount Google Drive at /content/drive so that files under
# /content/drive/MyDrive (where the spam CSVs are read from) are accessible.
from google.colab import drive
drive.mount('/content/drive')


In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model by its checkpoint name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Encode every spam message in a single call, with the progress bar enabled.
embeddings = embedder.encode(
    all_spam["text"].to_list(),
    show_progress_bar=True,
)


In [None]:
import umap

# Project the sentence embeddings down to 10 UMAP components; the reduced
# vectors are what the clustering step consumes. random_state is pinned to 42.
dim_reducer = umap.UMAP(n_components=10, n_neighbors=15, random_state=42)
embeddings_reduced = dim_reducer.fit_transform(embeddings)

In [None]:
from sklearn.cluster import HDBSCAN

# Density-based clustering of the reduced embeddings.
clusterer = HDBSCAN(min_cluster_size=30, metric="euclidean", cluster_selection_method="eom")

# Fit the clusterer, then read the per-sample cluster ids it stores.
cluster_labels = clusterer.fit(embeddings_reduced).labels_

In [None]:
import numpy as np

# Attach each message's cluster id as the "c-label" column.
# NOTE(review): cluster_labels must have one entry per row of all_spam, so this
# cell fails if it is re-run after all_spam has been filtered further down.
all_spam["c-label"] = cluster_labels

# Distinct cluster ids produced by the clustering step, shown as the cell output.
labels = np.unique(cluster_labels)
labels

In [None]:
# Inspect the first 20 messages assigned to cluster 3.
all_spam.loc[all_spam["c-label"].eq(3)].head(20)

In [None]:
# Print the first 20 messages of cluster 7, numbered from 1, with a blank line
# after each one.
for i, msg in enumerate(all_spam.loc[all_spam["c-label"] == 7, "text"].head(20), start=1):
    print(f"{i}: {msg}\n")

1. Ringtone spam
2. Mobile phone scam
3. Bluetooth headset scam
4. Call-back scam
5. Cash scam
6. Video-related spam
7. Sex spam
8. Subscription scam
9. Call-now scam
10. Free-gift scam
11. Free-cash scam
12. Free-cash scam (a second, separate cluster)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
# Human-readable category names keyed by cluster id (the value in "c-label").
# The names are numbered from 1 in the order listed.
# NOTE(review): only ids 1-8 are named here; confirm whether higher cluster ids
# exist and should be named as well.
label_map = dict(
    enumerate(
        [
            "Ringtone spam",
            "Mobile phone scam",
            "Bluetooth headset scam",
            "Call-back scam",
            "Cash scam",
            "Video-related spam",
            "Sex spam",
            "Subscription scam",
        ],
        start=1,
    )
)

# Cluster ids without an entry in label_map become NaN in "category".
all_spam['category'] = all_spam['c-label'].map(label_map)

In [None]:
# Drop every row whose cluster id is -1 or 0, then renumber the index from 0.
# NOTE(review): the reason cluster 0 is excluded along with -1 is not stated in
# the notebook - confirm it is intentional.
all_spam = all_spam.loc[lambda frame: ~frame['c-label'].isin([-1, 0])].reset_index(drop=True)

In [None]:
# Show (rows, columns) of all_spam as the cell output.
all_spam.shape

In [None]:
# Hold out 20% of the messages for evaluation. The split is stratified on the
# cluster id and seeded with random_state=42.
X_train, X_test, y_train, y_test = train_test_split(
    all_spam['text'],
    all_spam['c-label'],
    stratify=all_spam['c-label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams, capped at 5000 features.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
)

# The vocabulary is learned from the training texts only; the test texts are
# transformed with that already-fitted vectorizer.
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# The default solver already fits a multinomial model for multiclass targets,
# so the deprecated `multi_class` argument is no longer passed.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_vect, y_train)

y_pred = model.predict(X_test_vect)

# Build the display names in the same order as the class ids handed to the
# report. Using `all_spam['category'].unique()` would list names in order of
# first appearance (and could include NaN), mislabelling the rows.
class_ids = model.classes_
class_names = [label_map.get(int(c), f"Cluster {c}") for c in class_ids]
print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

# Compute the confusion matrix with an explicit class order so that rows and
# columns can be labelled with the actual cluster ids.
class_ids = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Plot the matrix as an image.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(cm)
ax.set_title("Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")

# imshow places cell k at position k; relabel the ticks with the cluster ids
# so the axes do not show bare positions 0..n-1.
tick_positions = list(range(len(class_ids)))
ax.set_xticks(tick_positions)
ax.set_xticklabels(class_ids)
ax.set_yticks(tick_positions)
ax.set_yticklabels(class_ids)

# Annotate every cell with its count (x = column index, y = row index).
for (row, col), count in np.ndenumerate(cm):
    ax.text(col, row, count, ha='center', va='center')

fig.colorbar(im, ax=ax)
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
import numpy as np

# One-vs-rest ROC curve per spam category.
# label_map (cluster id -> category name) is defined once, in the cell that
# creates the "category" column, and is reused here rather than redefined.

# The columns of predict_proba follow model.classes_, so iterate over that
# order to keep every score column paired with the right class.
classes = model.classes_

# One-vs-rest indicator matrix with one column per class (also valid when
# there are only two classes).
y_test_bin = (np.asarray(y_test)[:, None] == classes[None, :]).astype(int)

# Predict probabilities
y_score = model.predict_proba(X_test_vect)

# Plot ROC curves with category names
fig, ax = plt.subplots(figsize=(12, 8))

for i, class_label in enumerate(classes):
    positives = y_test_bin[:, i]
    # A ROC curve needs both positive and negative samples for the class.
    if not 0 < positives.sum() < positives.size:
        continue

    fpr, tpr, _ = roc_curve(positives, y_score[:, i])
    roc_auc = auc(fpr, tpr)

    # Fall back to a generic name for cluster ids missing from label_map
    # instead of raising a KeyError.
    category_name = label_map.get(int(class_label), f"Cluster {class_label}")

    ax.plot(fpr, tpr, label=f"{category_name} (AUC = {roc_auc:.2f})")

# Random baseline
ax.plot([0, 1], [0, 1], linestyle="--")

ax.set_title("Multiclass ROC Curve by Spam Category")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()