In [129]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score

# Read the training and test CSV files from the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [130]:
# Preview the first rows of the training set (a `spam` label and the email `text`).
train_df.head()

Unnamed: 0,spam,text
0,0,subject institute international finance annual...
1,1,subject mortgage even worst credit zwzm detail...
2,1,subject partnership mr edward moko independenc...
3,1,subject de la part de enfants ama rue de marty...
4,0,subject synfuel option valuation lenny believe...


In [131]:
# Preview the first rows of the test set (same `spam` / `text` columns).
test_df.head()

Unnamed: 0,spam,text
0,1,subject perfect logo charset koi r thinking br...
1,0,subject storage model security stinson added t...
2,1,subject wall street micro news report homeland...
3,1,subject logo stationer website design much lt ...
4,0,subject video conference ross mcintyre vince r...


In [132]:
# Summarise the training set: row count, column dtypes and non-null counts.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spam    5000 non-null   int64 
 1   text    5000 non-null   object
dtypes: int64(1), object(1)
memory usage: 78.2+ KB


In [133]:
# Summarise the test set: row count, column dtypes and non-null counts.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   spam    226 non-null    int64 
 1   text    226 non-null    object
dtypes: int64(1), object(1)
memory usage: 3.7+ KB


In [134]:
# Hold out 20% of the training emails for validation; the fixed random_state
# makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_df['text'],
    train_df['spam'],
    test_size=0.2,
    random_state=42,
)

In [135]:
# Separate the test emails (features) from their spam labels (targets).
X_test, y_test = test_df['text'], test_df['spam']

In [136]:
# Fit the TF-IDF vectorizer on the training split only, then apply the already
# fitted vectorizer to the validation split so both share one feature space.
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)

In [137]:
# Train a linear-kernel SVC on the TF-IDF training features.
classifier = SVC(kernel='linear', random_state=42)
# Left as the cell's last expression, so the notebook displays its value.
classifier.fit(X_train_vectorized, y_train)

In [138]:
# Predict on the validation split and report accuracy plus the per-class
# precision / recall / F1 table.
y_pred_val = classifier.predict(X_val_vectorized)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred_val)
print("Validation accuracy:", accuracy)
print(
    "\nValidation classification report:\n",
    classification_report(y_true=y_val, y_pred=y_pred_val),
)

Validation accuracy: 0.997

Validation classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       593
           1       1.00      1.00      1.00       407

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000



In [139]:
# Validation confusion matrix: rows are true labels, columns are predicted
# labels, in label order 0 (ham) then 1 (spam).
print(confusion_matrix(y_val, y_pred_val))

[[592   1]
 [  2 405]]


In [140]:
# F1 score on the validation split (f1_score's default binary mode scores the
# positive label, 1 = spam). The printed label says "Validation" because the
# inputs are y_val / y_pred_val; the test-set F1 is computed in a later cell.
f1 = f1_score(y_val, y_pred_val)
print("Validation set F1 Score: ", f1)

Test set F1 Score:  0.9963099630996309


In [141]:
# Transform the test emails with the vectorizer fitted earlier; it is not
# re-fitted here.
X_test_vectorized = vectorizer.transform(X_test)

In [142]:
# Predict on the test set and report accuracy plus the per-class
# precision / recall / F1 table.
y_pred_test = classifier.predict(X_test_vectorized)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_test)
print("Test accuracy:", test_accuracy)
print(
    "\nTest classification report:\n",
    classification_report(y_true=y_test, y_pred=y_pred_test),
)

Test accuracy: 0.9867256637168141

Test classification report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       113
           1       0.98      0.99      0.99       113

    accuracy                           0.99       226
   macro avg       0.99      0.99      0.99       226
weighted avg       0.99      0.99      0.99       226



In [143]:
# Test-set confusion matrix: rows are true labels, columns are predicted
# labels, in label order 0 (ham) then 1 (spam).
print(confusion_matrix(y_test, y_pred_test))

[[111   2]
 [  1 112]]


In [144]:
# F1 score of the test-set predictions.
f1 = f1_score(y_true=y_test, y_pred=y_pred_test)
print("Test set F1 Score: ", f1)

Test set F1 Score:  0.9867841409691629


In [145]:
# Build a copy of the test frame carrying the numeric prediction and a
# human-readable label ('Spam' when the prediction is 1, otherwise 'Ham').
predicted_df = test_df.assign(predicted_spam=y_pred_test)
predicted_df['prediction'] = (
    predicted_df['predicted_spam'].eq(1).map({True: 'Spam', False: 'Ham'})
)
predicted_df

Unnamed: 0,spam,text,predicted_spam,prediction
0,1,subject perfect logo charset koi r thinking br...,1,Spam
1,0,subject storage model security stinson added t...,0,Ham
2,1,subject wall street micro news report homeland...,1,Spam
3,1,subject logo stationer website design much lt ...,1,Spam
4,0,subject video conference ross mcintyre vince r...,0,Ham
...,...,...,...,...
221,0,subject sorry see hyatt lobby vince j kaminski...,0,Ham
222,1,subject yyyy know hgh difference hello jm netn...,1,Spam
223,1,subject try ouut hello welcome pharmon content...,1,Spam
224,0,subject department energy deploying corporate ...,1,Spam


In [146]:
# Show the test emails whose predicted label differs from the true label.
mispredictions_df = predicted_df.loc[
    predicted_df['spam'].ne(predicted_df['predicted_spam'])
]
mispredictions_df

Unnamed: 0,spam,text,predicted_spam,prediction
83,1,subject delivery failure user antonio lambino ...,0,Ham
97,0,subject basic idea price offer matching clause...,1,Spam
224,0,subject department energy deploying corporate ...,1,Spam


In [147]:
# Export a readable sample of predictions: every second email among the first
# 50 test rows is printed in the notebook and written to a text file.
# A positional slice never raises if the frame has fewer than 50 rows.
sample_df = predicted_df.iloc[:50:2]

# Explicit UTF-8 so email text with non-ASCII characters cannot fail under a
# platform-default codec.
with open("SVM_formatted_example_email_spam_predictions.txt", "w", encoding="utf-8") as predictions_file:
    for email_text, label in zip(sample_df['text'], sample_df['prediction']):
        pred = "Email: " + email_text + ".\nPrediction: This is a " + label + " email.\n"
        print(pred)
        predictions_file.write(pred + '\n')

Email: subject perfect logo charset koi r thinking breathing new life business start revamping front end logo visuai identity loqodentity offer creative custom design logo stationery web site careful hand powerfui marketinq toois wiii bring breath fresh air business make stand among competitor click away future success click see sample artwork check price hot offer.
Prediction: This is a Spam email.

Email: subject wall street micro news report homeland security investment terror attack united state september changed security landscape foreseeable future physical logical security become paramount industry segment especially banking national resource government sector according giga wholly owned subsidiary forrester research worldwide demand information security product service set eclipse b homeland security investment newsletter dedicated providing reader information pertaining investment opportunity lucrative sector know event related homeland security happen lightning speed investor