In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [3]:
# Fetch the Spambase data straight from the UCI repository (file has no header row)
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"
data = pd.read_csv(url, header=None)

# Every column but the last is a feature; the last column is the label
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 25% of the rows for testing (fixed seed for a reproducible split)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Standardise features: statistics are learned from the training rows only and
# then applied to both partitions (SVM and Logistic Regression are scale-sensitive)
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build, train and score one SVM per kernel, keeping each fitted model by kernel name
kernels = ['linear', 'poly', 'rbf']
svm_models = {}
for kernel in kernels:
    model = SVC(kernel=kernel, random_state=42)
    svm_models[kernel] = model
    # fit() returns the estimator itself, so prediction can be chained onto it
    y_pred = model.fit(X_train_scaled, y_train).predict(X_test_scaled)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with {kernel} kernel: {accuracy:.4f}")

# Logistic Regression baseline (max_iter raised above the library default)
lr_model = LogisticRegression(max_iter=2000, random_state=42).fit(X_train_scaled, y_train)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy with Logistic Regression (scaled): {lr_accuracy:.4f}")


Accuracy with linear kernel: 0.9288
Accuracy with poly kernel: 0.7715
Accuracy with rbf kernel: 0.9366
Accuracy with Logistic Regression (scaled): 0.9227


In [4]:
# For Bonus 3 Points I am comparing SVM Models with Random Forest classifier
#
# This cell builds on the previous cell instead of repeating it: it reuses the
# train/test split (X_train, X_test, y_train, y_test), the scaled test matrix
# (X_test_scaled) and the already-fitted models (svm_models, lr_model) that the
# previous cell left in the kernel namespace. That avoids downloading the
# dataset a second time and re-training four models just to add one more.
# Run the previous cell first.

# Re-report the SVM accuracies so every model appears side by side in one output
for kernel, model in svm_models.items():
    y_pred = model.predict(X_test_scaled)  # models were trained on scaled data
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy with {kernel} kernel: {accuracy:.4f}")

# Logistic Regression (fitted in the previous cell on the scaled training data)
y_pred_lr = lr_model.predict(X_test_scaled)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Accuracy with Logistic Regression (scaled): {lr_accuracy:.4f}")

# Random Forest Classifier: the only new model in this cell
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)  # Random Forest doesn't need scaling
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Accuracy with Random Forest: {rf_accuracy:.4f}")

Accuracy with linear kernel: 0.9288
Accuracy with poly kernel: 0.7715
Accuracy with rbf kernel: 0.9366
Accuracy with Logistic Regression (scaled): 0.9227
Accuracy with Random Forest: 0.9583
