In [136]:
import numpy as np
import pandas as pd
from numba import jit, njit
from scipy.special import expit, logit

# Read the space-delimited, header-less table from disk using the pyarrow parser.
df = pd.read_csv('spam.data', header=None, sep=' ', engine='pyarrow')

# Column-wise split: everything before the final column becomes the feature
# matrix, and the final column is converted to int8 labels.
X = df.iloc[:, :df.shape[1] - 1].to_numpy()
Y = df.iloc[:, df.shape[1] - 1].to_numpy(dtype=np.int8)

In [137]:
from sklearn.model_selection import KFold


def run_classifier(classifier, X: np.ndarray, Y: np.ndarray, n_splits: int=5):
    """Measure a classifier's accuracy with K-fold cross-validation.

    For every fold, ``classifier.fit`` is called on the training rows and
    ``classifier.predict`` on the held-out rows; the fold's score is the
    fraction of predictions that equal the true labels. The same
    ``classifier`` object is reused (and therefore refitted) for each fold.

    Parameters
    ----------
    classifier
        Object exposing ``fit(X, Y)`` and ``predict(X)``.
    X : np.ndarray
        Feature matrix, indexed as ``X[rows, :]``.
    Y : np.ndarray
        Labels, indexed by the same row positions as ``X``.
    n_splits : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    np.ndarray
        Array of length ``n_splits`` holding the accuracy of each fold.
    """
    # Use the caller's fold count. Hard-coding 5 here made any other
    # ``n_splits`` disagree with the size of ``accuracy`` below.
    # No shuffle argument is passed, so fold membership follows KFold's defaults.
    kf = KFold(n_splits=n_splits)

    accuracy = np.empty(n_splits)
    for i, (train_index, test_index) in enumerate(kf.split(X)):
        classifier.fit(X[train_index, :], Y[train_index])
        Y_pred = classifier.predict(X[test_index, :])
        # Mean of the element-wise equality mask == fraction classified correctly.
        accuracy[i] = np.mean(Y_pred == Y[test_index])

    return accuracy

In [138]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Baseline: linear discriminant analysis with default settings, evaluated on
# the full feature matrix through the shared cross-validation helper.
classifier_lda = LDA()
accuracy_lda = run_classifier(classifier=classifier_lda, X=X, Y=Y)

# Per-fold accuracies on the first line, their average on the second.
print(accuracy_lda, accuracy_lda.mean(), sep='\n')

[0.62757872 0.70652174 0.94130435 0.93913043 0.86630435]
0.8161679176698297


In [139]:
# from sklearn.linear_model import LogisticRegression as LR

# classifier_lr = LR(max_iter=2500, random_state=0)
# accuracy_lr = run_classifier(classifier_lr, X, Y)

# print(accuracy_lr)
# print(accuracy_lr.mean())

In [140]:
from sklearn.linear_model import LogisticRegression as LR

# Logistic regression fitted with the liblinear solver on the unscaled,
# full feature matrix.
classifier_lr3 = LR(solver='liblinear')
accuracy_lr3 = run_classifier(classifier=classifier_lr3, X=X, Y=Y)

# Per-fold accuracies on the first line, their average on the second.
print(accuracy_lr3, accuracy_lr3.mean(), sep='\n')

[0.79913138 0.83478261 0.94456522 0.89891304 0.81847826]
0.8591741018741443


In [141]:
from sklearn.linear_model import LogisticRegression as LR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardise the features and then fit logistic regression. Because the
# scaler is a pipeline step, it is fitted together with the model on whatever
# rows are passed to ``fit`` -- i.e. on the training rows of each fold.
pipe = make_pipeline(StandardScaler(), LR())

# Reuse the shared cross-validation helper instead of a hand-copied loop with
# the fold count hard-coded in two places. The helper scores each fold as the
# fraction of correct predictions, the same quantity the previous
# ``pipe.score`` call reported for a classifier.
accuracy_lr2 = run_classifier(pipe, X, Y)

print(accuracy_lr2)
print(accuracy_lr2.mean())

[0.80564604 0.82282609 0.94347826 0.89891304 0.82065217]
0.8583031204267574


In [142]:
from scipy.stats import ttest_ind

# Split the samples by label value (0 -> X_mail, 1 -> X_spam).
X_mail = X[Y == 0, :]
X_spam = X[Y == 1, :]
num_features = 10  # number of features to keep

print(X_mail.shape)
print(X_spam.shape)

m = X.shape[1]

print(m)

# Two-sample t-test for every feature column at once: with axis=0 the test is
# computed independently per column, which replaces the explicit Python loop.
# Reading ``.pvalue`` avoids binding the throwaway name ``_``, which IPython
# uses for the previous cell output.
p_values = np.asarray(ttest_ind(X_mail, X_spam, axis=0).pvalue)

# argpartition puts the indices of the ``num_features`` smallest p-values in
# the first ``num_features`` slots (in no guaranteed order among themselves).
selected_features = np.argpartition(p_values, num_features)[:num_features]
X_reduced = X[:, selected_features]

(2788, 57)
(1813, 57)
57


In [143]:
# Cross-validate the existing LDA classifier again, this time on the reduced
# feature matrix.
accuracy_lda_reduced = run_classifier(classifier=classifier_lda, X=X_reduced, Y=Y)

# Per-fold accuracies on the first line, their average on the second.
print(accuracy_lda_reduced, accuracy_lda_reduced.mean(), sep='\n')

[0.57546145 0.61086957 0.92173913 0.93152174 0.86195652]
0.7803096822924043


In [144]:
from sklearn.linear_model import LogisticRegression as LR

# Logistic regression with the default solver and an iteration cap of 800,
# cross-validated on the reduced feature matrix.
classifier_lr_reduced = LR(max_iter=800)
accuracy_lr_reduced = run_classifier(classifier=classifier_lr_reduced, X=X_reduced, Y=Y)

# Per-fold accuracies on the first line, their average on the second.
print(accuracy_lr_reduced, accuracy_lr_reduced.mean(), sep='\n')

[0.67318132 0.7076087  0.92391304 0.91630435 0.86847826]
0.8178971344946419


In [145]:
from sklearn.linear_model import LogisticRegression as LR

# Same reduced-feature experiment, now with the liblinear solver. This rebinds
# both ``classifier_lr_reduced`` and ``accuracy_lr_reduced``.
classifier_lr_reduced = LR(solver='liblinear')
accuracy_lr_reduced = run_classifier(classifier=classifier_lr_reduced, X=X_reduced, Y=Y)

# Per-fold accuracies on the first line, their average on the second.
print(accuracy_lr_reduced, accuracy_lr_reduced.mean(), sep='\n')

[0.67643865 0.7076087  0.92282609 0.91521739 0.87173913]
0.8187659915970353


In [146]:
# Clear the interactive namespace; -f skips the confirmation prompt.
%reset -f