In [28]:
import os
from pathlib import Path

import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Load the pre-split train / validation / test CSVs.
# The directory can be overridden with the SPAM_DATA_DIR environment variable so the
# notebook is not tied to one machine; when it is unset the original location is used.
DATA_DIR = Path(os.environ.get('SPAM_DATA_DIR', r'D:\code\AppliedML_assgn_01'))

df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_validation = pd.read_csv(DATA_DIR / 'validation.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')

In [4]:
# Preview the first rows of the training split: the raw `text`, the `spam` label,
# and the `cleaned_text` / `preprocessed_text` variants of the message.
# NOTE(review): the `Unnamed: 0` column looks like a row index written out by an
# earlier `to_csv` call -- confirm, and consider `index_col=0` when loading.
df_train.head()

Unnamed: 0,text,spam,cleaned_text,preprocessed_text
0,Subject: re : [ 1 ] save over $ 70 on this exq...,1,subject re save over on this exquisite s...,subject save exquisit softwar suit take contro...
1,Subject: re : carnegie mellon recruiting good...,0,subject re carnegie mellon recruiting good a...,subject carnegi mellon recruit good afternoon ...
2,Subject: accounting organizational changes in...,0,subject accounting organizational changes in ...,subject account organiz chang order support en...
3,"Subject: all graphics software available , che...",1,subject all graphics software available cheap...,subject graphic softwar avail cheap oem versio...
4,"Subject: visit to wharton , december 6 i woul...",0,subject visit to wharton december i would l...,subject visit wharton decemb would like invit ...


In [5]:
# Preview the first rows of the test split; it has the same columns as the
# training split (`text`, `spam`, `cleaned_text`, `preprocessed_text`).
df_test.head()

Unnamed: 0,text,spam,cleaned_text,preprocessed_text
0,"Subject: vmi agreements hi richard , here is ...",0,subject vmi agreements hi richard here is a ...,subject vmi agreement hi richard mark version ...
1,Subject: the future of continuing education s...,1,subject the future of continuing education se...,subject futur continu educ select state press ...
2,"Subject: re : visiting enron may 4 th susan ,...",0,subject re visiting enron may th susan th...,subject visit enron may th susan thank make se...
3,Subject: branded softs http : / / p ' s . mai...,1,subject branded softs http p s mainoemst...,subject brand soft http p mainoemstor com
4,Subject: you don _ t know how to attract custo...,1,subject you don t know how to attract custome...,subject know attract custom websit submit webs...


In [7]:
# TF-IDF vectorizer with library-default settings. It is fitted on the training
# text only and then reused to transform the validation and test text.
tfidf_vectorizer = TfidfVectorizer()

In [16]:
# Column names shared by all three splits.
TEXT_COL, LABEL_COL = 'preprocessed_text', 'spam'

# Learn the vocabulary / IDF weights from the training text only, then project the
# validation and test text into that same feature space.
X_train = tfidf_vectorizer.fit_transform(df_train[TEXT_COL])
X_val = tfidf_vectorizer.transform(df_validation[TEXT_COL])
X_test = tfidf_vectorizer.transform(df_test[TEXT_COL])

# Target labels for each split.
y_train, y_val, y_test = (
    split[LABEL_COL] for split in (df_train, df_validation, df_test)
)

In [17]:
# Sanity check: every split must share the same number of TF-IDF features, and each
# label vector must have as many entries as its feature matrix has rows.
feature_shapes = tuple(matrix.shape for matrix in (X_train, X_test, X_val))
label_shapes = tuple(labels.shape for labels in (y_train, y_test, y_val))
feature_shapes, label_shapes

(((4009, 23014), (573, 23014), (1146, 23014)), ((4009,), (573,), (1146,)))

1st Model: Random Forest (default hyper-parameters, evaluated on the validation set)

In [18]:
# Baseline Random Forest with default hyper-parameters; the seed is fixed so the
# fit is repeatable. `fit` returns the estimator itself, so it can be chained.
rf = RandomForestClassifier(random_state=1).fit(X_train, y_train)
rf

In [19]:
# Predict validation labels with the fitted Random Forest.
# `y_val_pred` is reassigned by the Decision Tree and XGBoost cells further down,
# so the metrics cell below must be run right after this one.
y_val_pred = rf.predict(X_val)

In [23]:
# Validation metrics for the Random Forest (`y_val_pred` holds its predictions here).
print('Accuracy:', accuracy_score(y_val, y_val_pred))
# The report is printed on its own lines: as a second argument to print() its header
# row was pushed right by the label and no longer lined up with the table columns.
print('Classification Report:')
print(classification_report(y_val, y_val_pred))

Accuracy: 0.9773123909249564
Classification Report:               precision    recall  f1-score   support

           0       0.97      1.00      0.99       884
           1       1.00      0.90      0.95       262

    accuracy                           0.98      1146
   macro avg       0.99      0.95      0.97      1146
weighted avg       0.98      0.98      0.98      1146



2nd Model: Decision Tree (default hyper-parameters, evaluated on the validation set)

In [25]:
# Baseline Decision Tree with default hyper-parameters and a fixed seed.
# `fit` returns the estimator itself, so construction and fitting are chained.
dt = DecisionTreeClassifier(random_state=1).fit(X_train, y_train)
dt

In [26]:
# Predict validation labels with the fitted Decision Tree.
# This overwrites the `y_val_pred` produced by the Random Forest cell above.
y_val_pred = dt.predict(X_val)

In [27]:
# Validation metrics for the Decision Tree (`y_val_pred` holds its predictions here).
print('Accuracy:', accuracy_score(y_val, y_val_pred))
# The report is printed on its own lines: as a second argument to print() its header
# row was pushed right by the label and no longer lined up with the table columns.
print('Classification Report:')
print(classification_report(y_val, y_val_pred))

Accuracy: 0.9406631762652705
Classification Report:               precision    recall  f1-score   support

           0       0.96      0.96      0.96       884
           1       0.88      0.86      0.87       262

    accuracy                           0.94      1146
   macro avg       0.92      0.91      0.92      1146
weighted avg       0.94      0.94      0.94      1146



3rd Model: XGBoost (default hyper-parameters, evaluated on the validation set)

In [29]:
# Baseline XGBoost classifier with a fixed seed, using log-loss as its evaluation metric.
# NOTE(review): `use_label_encoder` is deprecated in newer xgboost releases -- confirm
# against the installed version before removing it.
xgb_clf = xgb.XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=1,
).fit(X_train, y_train)
xgb_clf

In [30]:
# Predict validation labels with the fitted XGBoost classifier.
# This overwrites the `y_val_pred` produced by the Decision Tree cell above.
y_val_pred = xgb_clf.predict(X_val)

In [31]:
# Validation metrics for XGBoost (`y_val_pred` holds its predictions here).
print('Accuracy:', accuracy_score(y_val, y_val_pred))
# The report is printed on its own lines: as a second argument to print() its header
# row was pushed right by the label and no longer lined up with the table columns.
print('Classification Report:')
print(classification_report(y_val, y_val_pred))

Accuracy: 0.987783595113438
Classification Report:               precision    recall  f1-score   support

           0       0.99      0.99      0.99       884
           1       0.97      0.98      0.97       262

    accuracy                           0.99      1146
   macro avg       0.98      0.98      0.98      1146
weighted avg       0.99      0.99      0.99      1146



Using GridSearchCV for hyper-parameter tuning:  
1. Random Forest Classifier

In [32]:
# Search space for the Random Forest: 2 * 3 * 2 * 2 = 24 candidate settings.
param_grid_rf = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
# No `scoring` is passed, so the estimator's own `score` method ranks the candidates.
grid_search_rf = GridSearchCV(rf, param_grid_rf, cv=5, verbose=1, n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

print("Best parameters for Random Forest:", grid_search_rf.best_params_)
print("Best score for Random Forest:", grid_search_rf.best_score_)

Fitting 5 folds for each of 24 candidates, totalling 120 fits
Best parameters for Random Forest: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best score for Random Forest: 0.9745561190656318


2. Decision Tree Classifier:

In [33]:
# Search space for the Decision Tree: 2 * 4 * 3 * 3 = 72 candidate settings.
param_grid_dt = dict(
    criterion=['gini', 'entropy'],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
# No `scoring` is passed, so the estimator's own `score` method ranks the candidates.
grid_search_dt = GridSearchCV(dt, param_grid_dt, cv=5, verbose=1, n_jobs=-1)
grid_search_dt.fit(X_train, y_train)

print("Best parameters for Decision Tree:", grid_search_dt.best_params_)
print("Best score for Decision Tree:", grid_search_dt.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best score for Decision Tree: 0.9501109896918128


3. XGBoost:

In [34]:
# Search space for XGBoost: 2 * 3 * 3 * 2 * 2 = 72 candidate settings.
param_grid_xgb = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1, 0.2],
    max_depth=[3, 6, 9],
    subsample=[0.8, 1],
    colsample_bytree=[0.8, 1],
)

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
# No `scoring` is passed, so the estimator's own `score` method ranks the candidates.
grid_search_xgb = GridSearchCV(xgb_clf, param_grid_xgb, cv=5, verbose=1, n_jobs=-1)
grid_search_xgb.fit(X_train, y_train)

print("Best parameters for XGBoost:", grid_search_xgb.best_params_)
print("Best score for XGBoost:", grid_search_xgb.best_score_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best parameters for XGBoost: {'colsample_bytree': 1, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 200, 'subsample': 1}
Best score for XGBoost: 0.9815433326795372


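XGBoost achieves the best cross-validation score of the three tuned models (0.982, versus 0.975 for Random Forest and 0.950 for Decision Tree), so it is selected as the final model.  
Evaluating the tuned XGBoost model on the held-out test data: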

In [35]:
# `best_estimator_` is the XGBoost model that GridSearchCV refitted with the best
# parameter combination found above (`refit` is left at its default).
best_xgb = grid_search_xgb.best_estimator_
# Predict on the test split, which was not passed to any `fit` call or to the grid search.
y_test_pred_xgb = best_xgb.predict(X_test)

In [36]:
# Test-set metrics for the tuned XGBoost model.
accuracy_xgb = accuracy_score(y_test, y_test_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_test_pred_xgb)
report_xgb = classification_report(y_test, y_test_pred_xgb)

print(f"XGBoost Test Accuracy: {accuracy_xgb:.2f}")
# sep="" stops print() from inserting a space after the "\n"; that space shifted the
# first confusion-matrix row and the report header one column to the right.
print("XGBoost Confusion Matrix:\n", conf_matrix_xgb, sep="")
print("XGBoost Classification Report:\n", report_xgb, sep="")

XGBoost Test Accuracy: 0.98
XGBoost Confusion Matrix:
 [[430   6]
 [  5 132]]
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.99      0.99       436
           1       0.96      0.96      0.96       137

    accuracy                           0.98       573
   macro avg       0.97      0.97      0.97       573
weighted avg       0.98      0.98      0.98       573

