In [4]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.model_selection import train_test_split

In [5]:
df = pd.read_csv("emails.csv").dropna()
df

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [6]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def preprocess(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    tokens = [t for t in tokens if t not in stop_words and len(t) > 1]
    return " ".join(tokens)
df['clean_text'] = df['text'].apply(preprocess)
unique_tokens = set()
for txt in df['clean_text']:
    unique_tokens.update(txt.split())
print(len(unique_tokens))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


33572


In [21]:
vectorizer = TfidfVectorizer(max_features= 4000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['spam']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

Cloning into 'rapidsai-csp-utils'...
remote: Enumerating objects: 603, done.[K
remote: Counting objects: 100% (169/169), done.[K
remote: Compressing objects: 100% (87/87), done.[K
remote: Total 603 (delta 131), reused 82 (delta 82), pack-reused 434 (from 3)[K
Receiving objects: 100% (603/603), 199.38 KiB | 7.67 MiB/s, done.
Resolving deltas: 100% (305/305), done.
Installing RAPIDS remaining 25.08 libraries
Using Python 3.12.11 environment at: /usr
Resolved 177 packages in 2.87s
Prepared 54 packages in 55.72s
Uninstalled 32 packages in 1.21s
Installed 54 packages in 677ms
 + arrow==1.3.0
 - bokeh==3.7.3
 + bokeh==3.6.3
 + cucim-cu12==25.8.0
 + cuda-bindings==12.9.2
 + cuda-pathfinder==1.1.0
 - cuda-python==12.6.2.post1
 + cuda-python==12.9.2
 - cudf-cu12==25.6.0 (from https://pypi.nvidia.com/cudf-cu12/cudf_cu12-25.6.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl)
 + cudf-cu12==25.8.0
 + cugraph-cu12==25.8.0
 - cuml-cu12==25.6.0
 + cuml-cu12==25.8.0
 - cuvs-cu12==25.6.

In [18]:
from sklearn.base import BaseEstimator, ClassifierMixin
from cuml.linear_model import LogisticRegression
from cuml.svm import SVC
from sklearn.model_selection import GridSearchCV

In [14]:

def calculate_sample_weight(y_true, y_pred, sample_weight):
    errors = (y_true != y_pred).astype(int)
    epsilon = np.sum(sample_weight * errors)
    alpha = 0.5 * np.log((1 - epsilon) / (epsilon + 1e-10))

    # Update Weights
    sample_weight = sample_weight * np.exp(alpha * errors)
    sample_weight /= np.sum(sample_weight)
    return sample_weight, alpha


class MyCustomBoostingClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, reg_max_iter=100, svm_c_1=1, svm_c_2=1):
        self.reg_max_iter = reg_max_iter
        self.svm_c_1 = svm_c_1
        self.svm_c_2 = svm_c_2
        self.alphas = []

        self.clf_1 = LogisticRegression(max_iter=reg_max_iter)
        self.clf_2 = SVC(C=svm_c_1, probability=True)
        self.clf_3 = SVC(C=svm_c_2, probability=True)

    def fit(self, X, y, sample_weight=None):
        n = len(y)
        if sample_weight is None:
            sample_weight = np.ones(n) / n

        self.alphas = []
        self.models = []
        self.clf_1.fit(X, y, sample_weight=sample_weight)
        y_pred = self.clf_1.predict(X)
        sample_weight, alpha = calculate_sample_weight(y, y_pred, sample_weight)
        self.models.append(self.clf_1)
        self.alphas.append(alpha)

        self.clf_2.fit(X, y, sample_weight=sample_weight)
        y_pred = self.clf_2.predict(X)
        sample_weight, alpha = calculate_sample_weight(y, y_pred, sample_weight)
        self.models.append(self.clf_2)
        self.alphas.append(alpha)

        self.clf_3.fit(X, y, sample_weight=sample_weight)
        y_pred = self.clf_3.predict(X)
        sample_weight, alpha = calculate_sample_weight(y, y_pred, sample_weight)
        self.models.append(self.clf_3)
        self.alphas.append(alpha)

    def predict(self, X):
        preds = np.zeros((X.shape[0], len(self.models)))
        for i, clf in enumerate(self.models):
            preds[:, i] = clf.predict(X) * self.alphas[i]

        y_final = (np.sum(preds, axis=1) >= 0).astype(int)
        return y_final


In [22]:
param_grid = {
    'reg_max_iter': [100, 200],
    'svm_c_1': [0.5, 1, 2],
    'svm_c_2': [0.5, 1, 2]
}

grid = GridSearchCV(MyCustomBoostingClassifier(), param_grid, scoring='accuracy', cv=3, n_jobs=-1, verbose=2)

grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best score:", grid.best_score_)


Fitting 3 folds for each of 18 candidates, totalling 54 fits
Best params: {'reg_max_iter': 100, 'svm_c_1': 0.5, 'svm_c_2': 0.5}
Best score: 0.23876291328512603
