In [1]:
import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV


In [2]:
column_names = [
    'word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
    'word_freq_our', 'word_freq_over', 'word_freq_remove', 'word_freq_internet',
    'word_freq_order', 'word_freq_mail', 'word_freq_receive', 'word_freq_will',
    'word_freq_people', 'word_freq_report', 'word_freq_addresses', 'word_freq_free',
    'word_freq_business', 'word_freq_email', 'word_freq_you', 'word_freq_credit',
    'word_freq_your', 'word_freq_font', 'word_freq_000', 'word_freq_money',
    'word_freq_hp', 'word_freq_hpl', 'word_freq_george', 'word_freq_650',
    'word_freq_lab', 'word_freq_labs', 'word_freq_telnet', 'word_freq_857',
    'word_freq_data', 'word_freq_415', 'word_freq_85', 'word_freq_technology',
    'word_freq_1999', 'word_freq_parts', 'word_freq_pm', 'word_freq_direct',
    'word_freq_cs', 'word_freq_meeting', 'word_freq_original', 'word_freq_project',
    'word_freq_re', 'word_freq_edu', 'word_freq_table', 'word_freq_conference',
    'char_freq_;', 'char_freq_(', 'char_freq_[', 'char_freq_!', 'char_freq_$',
    'char_freq_#', 'capital_run_length_average', 'capital_run_length_longest',
    'capital_run_length_total', 'is_spam'
]

In [3]:
# Assuming your data has a header, modify your data reading code
with open(r"/content/spam_mail/spambase.data") as file:
    data_lines = file.readlines()

column_names = [f"feature_{i}" for i in range(1, 58)] + ["is_spam"]
data = [list(map(float, line.strip().split(','))) for line in data_lines]
df = pd.DataFrame(data, columns=column_names)


In [4]:
X_train = df.drop(columns=['is_spam'])
y_train = df['is_spam']

In [5]:
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_49,feature_50,feature_51,feature_52,feature_53,feature_54,feature_55,feature_56,feature_57,is_spam
0,0.0,0.64,0.64,0.0,0.32,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.778,0.0,0.0,3.756,61.0,278.0,1.0
1,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101.0,1028.0,1.0
2,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485.0,2259.0,1.0
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40.0,191.0,1.0
4,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40.0,191.0,1.0


In [6]:
X_train_text = [' '.join(map(str, row)) for row in X_train.values]


In [7]:
# Create TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df=5, max_df=0.95)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)


In [8]:
print(f"X_train_tfidf shape: {X_train_tfidf.shape}")
print(f"y_train shape: {y_train.shape}")

X_train_tfidf shape: (4601, 805)
y_train shape: (4601,)


In [9]:
# If y_train has fewer elements than X_train_tfidf, duplicate y_train elements
if X_train_tfidf.shape[0] != y_train.shape[0]:
    y_train = y_train.iloc[:X_train_tfidf.shape[0]]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_train_tfidf, y_train, test_size=0.2, random_state=42)


In [11]:
# Perform GridSearchCV for hyperparameter tuning
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.01, 0.1, 1, 'auto'],
    'kernel': ['linear', 'rbf', 'poly']
}

In [23]:
svm_classifier = SVC()

grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

In [24]:
best_model = grid_search.best_estimator_

In [25]:
# Evaluate the best model on the test set
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

In [26]:
# Print evaluation metrics
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7372421281216069
Classification Report:
              precision    recall  f1-score   support

         0.0       0.73      0.86      0.79       531
         1.0       0.75      0.57      0.65       390

    accuracy                           0.74       921
   macro avg       0.74      0.72      0.72       921
weighted avg       0.74      0.74      0.73       921

Confusion Matrix:
[[455  76]
 [166 224]]


In [29]:
# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'tfidf_vect.joblib')

['tfidf_vect.joblib']

In [30]:
# Save the trained SVM classifier
joblib.dump(best_model, 'svm_model.joblib')


['svm_model.joblib']