In [None]:
# Colab-only step: mount Google Drive at /content/drive so that files stored
# there can be read through the local filesystem.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC

# Define the path to your dataset
path_to_dataset = '/content/drive/MyDrive/spam.csv'

# Encodings to try, strictest first. Order matters:
#   * 'utf-8' and 'cp1252' both reject some byte sequences, so a failure there
#     is a real signal that the guess was wrong.
#   * 'latin1' (the same codec as 'ISO-8859-1') maps every possible byte to a
#     character and therefore never raises UnicodeDecodeError. It has to come
#     last: anything listed after it would never be tried.
encodings = ['utf-8', 'cp1252', 'latin1']
df = None
last_error = None  # most recent decode failure, chained into the final error

for encoding in encodings:
    try:
        df = pd.read_csv(path_to_dataset, encoding=encoding)
    except UnicodeDecodeError as e:
        last_error = e
        print(f"Failed to load with encoding {encoding}: {e}")
    else:
        print(f"Loaded dataset with encoding: {encoding}")
        break

# Safety net in case the list above is edited so that no candidate succeeds.
if df is None:
    raise ValueError(
        "Unable to load the dataset with any of the tried encodings."
    ) from last_error

# Show a sample of the rows, then the column labels exactly as they were loaded
for heading, value in (
    ("Dataset preview:", df.head()),
    ("\nColumn names:", df.columns),
):
    print(heading)
    print(value)

# Map columns to the expected names.
# Only the first two columns are used below (label, then message text). The
# remaining columns get positional placeholder names, so the assignment works
# for any column count instead of requiring exactly five.
if df.shape[1] < 2:
    raise ValueError(
        f"Expected at least 2 columns (label, message), found {df.shape[1]}."
    )
df.columns = ['label', 'message'] + [
    f'Unnamed: {position}' for position in range(2, df.shape[1])
]

# Clean up the target variable: trim whitespace and lower-case it so that
# variants such as 'Ham' or ' spam' are still recognised.
labels = df['label'].astype(str).str.strip().str.lower()

# Keep only rows with a recognised label and a non-missing message. A NaN
# label or a NaN message would otherwise only surface later, inside the
# estimators, as a much less helpful error.
usable_rows = labels.isin(['ham', 'spam']) & df['message'].notna()
dropped_rows = int((~usable_rows).sum())
if dropped_rows:
    print(f"Dropping {dropped_rows} row(s) with a missing message or an unknown label.")
if not usable_rows.any():
    raise ValueError("No rows with a 'ham' or 'spam' label and a message were found.")

# Define features and target variable. Both are selected with the same mask,
# so they share an index and stay aligned row for row.
X = df.loc[usable_rows, 'message'].astype(str)
y = labels[usable_rows]  # contains only 'spam' or 'ham'

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Initialize the TF-IDF Vectorizer. It serves as an unfitted template only.
tfidf = TfidfVectorizer(stop_words='english')

# Create pipelines for each model.
# make_pipeline stores the object it is given rather than copying it, so
# passing `tfidf` itself to all three would make them share one vectorizer:
# refitting any one pipeline would silently change the vocabulary used by the
# other two. clone() gives each pipeline its own unfitted copy with the same
# parameters. After fitting, a pipeline's vectorizer is available as e.g.
# nb_pipeline.named_steps['tfidfvectorizer'].
nb_pipeline = make_pipeline(clone(tfidf), MultinomialNB())
lr_pipeline = make_pipeline(clone(tfidf), LogisticRegression(max_iter=1000))
svm_pipeline = make_pipeline(clone(tfidf), SVC())

def _fit_and_report(heading, pipeline, train_text, train_labels, test_text, test_labels):
    """Fit *pipeline* on the training split, print its test metrics, return its predictions.

    Prints, in order: the heading, the accuracy, the confusion matrix and the
    classification report, each computed from *test_labels* and the
    pipeline's predictions for *test_text*.
    """
    pipeline.fit(train_text, train_labels)
    predictions = pipeline.predict(test_text)
    print(heading)
    print("Accuracy:", accuracy_score(test_labels, predictions))
    print("Confusion Matrix:\n", confusion_matrix(test_labels, predictions))
    print("Classification Report:\n", classification_report(test_labels, predictions))
    return predictions


# Train and evaluate each model in turn on the same train/test split
nb_preds = _fit_and_report(
    "Naive Bayes:", nb_pipeline, X_train, y_train, X_test, y_test
)

lr_preds = _fit_and_report(
    "Logistic Regression:", lr_pipeline, X_train, y_train, X_test, y_test
)

svm_preds = _fit_and_report(
    "Support Vector Machine:", svm_pipeline, X_train, y_train, X_test, y_test
)
