# Dataset

1. load csv file (panda, numpy)
2. split dataset. Example code:()
   ```
   random.shuffle(data) # change if you are using pandas dataframe
   training = data[:int(len(data)*0.8)]
   test = data[int(len(data)*0.8):]

   fold5 = KFold(5) # https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
   for train_idx, val_idx in fold5.split(training):
      sub_val = training[val_idx]
      sub_train = training[train_idx]
      clf = model(sub_train, sub_val, ...) # training the model, and evaluate it on validation dataset
      performance(clf, test) # test the model on test dataset
   ```

#Naive bayes

1. model learning:

   Note:

   features: remove attributes that is not related to word (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negtive

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset -> performance(model, val)):
   
   for each new sample, $\prod{P(a|c)}P(c)$ if word is in the email(freq_word > 0); and find the maximum class
   

   

In [6]:
import os
print(os.getcwd())
from google.colab import drive
#drive.mount('/content/drive')
!ls
%cd /content/drive/MyDrive/ColabNotebooks/
!ls


/content
Mounted at /content/drive
drive  sample_data
/content/drive/MyDrive/ColabNotebooks
'Final Project (dont do show spam).ipynb'  'Final Project.ipynb'   spambase.csv


In [25]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold

data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/spambase.csv")

data = data.iloc[:, :-4]

data = data.sample(frac=1).reset_index(drop=True)

X = data.iloc[:, :-1]
y = data.iloc[:, -1]

split_index = int(len(data) * 0.8)
X_train, X_test = X[:split_index], X[split_index:]
y_train, y_test = y[:split_index], y[split_index:]

def count_p_c(labels):
    return (labels == 1).sum() / len(labels), (labels == 0).sum() / len(labels)

def naive_bayes_train(training_data):
    p_spam, p_not_spam = count_p_c(training_data.iloc[:, -1])

    prob_spam = []
    prob_not_spam = []
    for i in range(len(training_data.columns) - 1):
        spam_count = (training_data[training_data.iloc[:, -1] == 1].iloc[:, i] > 0).sum()
        not_spam_count = (training_data[training_data.iloc[:, -1] == 0].iloc[:, i] > 0).sum()

        if spam_count == 0 or not_spam_count == 0:
            prob_spam.append(0)
            prob_not_spam.append(0)
        else:
            prob_spam.append(spam_count / len(training_data[training_data.iloc[:, -1] == 1]))
            prob_not_spam.append(not_spam_count / len(training_data[training_data.iloc[:, -1] == 0]))

    return p_spam, p_not_spam, prob_spam, prob_not_spam

def classify_email(email, p_spam, p_not_spam, prob_spam, prob_not_spam):
    prob_spam_given_email = p_spam
    prob_not_spam_given_email = p_not_spam
    for i in range(len(email)):
        if email[i] > 0:
            prob_spam_given_email *= prob_spam[i]
            prob_not_spam_given_email *= prob_not_spam[i]
    return 1 if prob_spam_given_email > prob_not_spam_given_email else 0

def evaluate_model(test_data, p_spam, p_not_spam, prob_spam, prob_not_spam):
    correct = 0
    for i in range(len(test_data)):
        prediction = classify_email(test_data.iloc[i, :-1], p_spam, p_not_spam, prob_spam, prob_not_spam)
        if prediction == test_data.iloc[i, -1]:
            correct += 1
    return correct / len(test_data)

fold5 = 5
kf = KFold(n_splits=fold5)

train_accuracies = []
for train_idx, val_idx in kf.split(X_train):
    sub_val_X, sub_val_y = X_train.iloc[val_idx], y_train.iloc[val_idx]
    sub_train_X, sub_train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]

    p_spam, p_not_spam, prob_spam, prob_not_spam = naive_bayes_train(pd.concat([sub_train_X, sub_train_y], axis=1))
    accuracy = evaluate_model(pd.concat([sub_val_X, sub_val_y], axis=1), p_spam, p_not_spam, prob_spam, prob_not_spam)

    train_accuracies.append(accuracy)

p_spam, p_not_spam, prob_spam, prob_not_spam = naive_bayes_train(pd.concat([X_train, y_train], axis=1))
test_accuracy = evaluate_model(pd.concat([X_test, y_test], axis=1), p_spam, p_not_spam, prob_spam, prob_not_spam)

print("Test Accuracy:", test_accuracy)
print("Average Train Accuracy:", np.mean(train_accuracies))

Test Accuracy: 0.8610206297502715
Average Train Accuracy: 0.8309782608695653


# KNN
1. model learning: None

2. model evaluation(on val dataset): You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.

   ```
   Note:
   parallel programing
   numpy.cos() to calcuate the similarity
   ```

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def cosine_similarity(x1, x2):
    return np.dot(x1, x2) / (np.linalg.norm(x1) * np.linalg.norm(x2))

def knn_predict(train_data, train_labels, x_val, k):
    similarities = []
    for i in range(len(train_data)):
        sim = cosine_similarity(x_val, train_data[i])
        similarities.append((sim, train_labels[i]))

    similarities = sorted(similarities, reverse=True)[:k]
    labels = [label for _, label in similarities]

    return max(set(labels), key=labels.count)

data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/spambase.csv")
data = data.sample(frac=1).reset_index(drop=True)

data_array = data.to_numpy()

train_size = int(0.8 * len(data_array))
train_data, test_data = data_array[:train_size, :-1], data_array[train_size:, :-1]
train_labels, test_labels = data_array[:train_size, -1], data_array[train_size:, -1]

kf = KFold(n_splits=5)
avg_train_accuracy = 0
avg_val_accuracy = 0

for train_idx, val_idx in kf.split(train_data):
    sub_train_data, sub_val_data = train_data[train_idx], train_data[val_idx]
    sub_train_labels, sub_val_labels = train_labels[train_idx], train_labels[val_idx]

    k = 5
    y_val_pred = []
    for x_val in sub_val_data:
        pred_label = knn_predict(sub_train_data, sub_train_labels, x_val, k)
        y_val_pred.append(pred_label)

    val_accuracy = np.mean(np.array(y_val_pred) == sub_val_labels)
    avg_val_accuracy += val_accuracy

    spam_proportion = y_val_pred.count(1) / len(y_val_pred)

avg_val_accuracy /= kf.get_n_splits()

y_test_pred = []
for x_test in test_data:
    pred_label = knn_predict(train_data, train_labels, x_test, k)
    y_test_pred.append(pred_label)

test_accuracy = np.mean(np.array(y_test_pred) == test_labels)

print("Validation Dataset Accuracy:", avg_val_accuracy)
print("Test Dataset Accuracy:", test_accuracy)

Validation Dataset Accuracy: 0.8345108695652174
Test Dataset Accuracy: 0.8175895765472313


# LR

1. model learning: You could use each row(exclude the last column) as the feature of the email. You do not have to recalcuate the freqency.
    
    $y = sigmoid(MX)$

step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the TRAINING dataset; $cdot$ is dot production ):

1. $pred_y = sigmoid(M\cdot X')$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=X'\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   best_performace = 0
   for i in range(epoch):
     pred_y = ...
     gm = ...
     _p = performace(model, val)
     if _p > best_performance:
        best_model = M
        best_performance = _p
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [None]:
##THIS IS JUST TO CHECK THE CONTENTS OF THE DATABASE
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/spambase.csv")
data = data.sample(frac=1).reset_index(drop=True)
print(data.columns)
print(len(data))

Index(['word_freq_make', 'word_freq_address', 'word_freq_all', 'word_freq_3d',
       'word_freq_our', 'word_freq_over', 'word_freq_remove',
       'word_freq_internet', 'word_freq_order', 'word_freq_mail',
       'word_freq_receive', 'word_freq_will', 'word_freq_people',
       'word_freq_report', 'word_freq_addresses', 'word_freq_free',
       'word_freq_business', 'word_freq_email', 'word_freq_you',
       'word_freq_credit', 'word_freq_your', 'word_freq_font', 'word_freq_000',
       'word_freq_money', 'word_freq_hp', 'word_freq_hpl', 'word_freq_george',
       'word_freq_650', 'word_freq_lab', 'word_freq_labs', 'word_freq_telnet',
       'word_freq_857', 'word_freq_data', 'word_freq_415', 'word_freq_85',
       'word_freq_technology', 'word_freq_1999', 'word_freq_parts',
       'word_freq_pm', 'word_freq_direct', 'word_freq_cs', 'word_freq_meeting',
       'word_freq_original', 'word_freq_project', 'word_freq_re',
       'word_freq_edu', 'word_freq_table', 'word_freq_conference',


In [23]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

def sigmoid(z):
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

def add_intercept(X):
    return np.c_[np.ones((len(X), 1)), X]

def compute_loss_gradient(X, y, M):
    pred_y = sigmoid(np.dot(X, M))
    pred_y = np.clip(pred_y, 1e-15, 1 - 1e-15)
    loss = -np.sum(y * np.log(pred_y) + (1 - y) * np.log(1 - pred_y)) / len(X)
    gradient = np.dot(X.T, (pred_y - y)) * 2 / len(X)
    return loss, gradient

def evaluate_model(X, y, M):
    pred_y = sigmoid(np.dot(X, M))
    pred_y_binary = (pred_y > 0.5).astype(int)
    accuracy = np.mean(pred_y_binary.squeeze() == y.squeeze())
    return accuracy

def train_model(X_train, y_train, X_val, y_val, epoch, learning_rate):
    M = np.random.randn(X_train.shape[1], 1)
    best_model = M
    best_performance = 0

    for i in range(epoch):
        loss, gradient = compute_loss_gradient(X_train, y_train, M)
        accuracy = evaluate_model(X_val, y_val, M)

        if accuracy > best_performance:
            best_model = M
            best_performance = accuracy

        M = M - learning_rate * gradient

    return best_model, best_performance

data = pd.read_csv("/content/drive/MyDrive/ColabNotebooks/spambase.csv")

data = data.sample(frac=1).reset_index(drop=True)

train_size = int(0.8 * len(data))
train_data, test_data = data[:train_size], data[train_size:]

fold5 = KFold(n_splits=5)

test_accuracies = []
val_accuracies = []

for train_idx, val_idx in fold5.split(train_data):
    sub_train = train_data.iloc[train_idx]
    sub_val = train_data.iloc[val_idx]
    X_train = sub_train.iloc[:, :-1].values
    y_train = sub_train.iloc[:, -1].values.reshape(-1, 1)
    X_val = sub_val.iloc[:, :-1].values
    y_val = sub_val.iloc[:, -1].values.reshape(-1, 1)
    X_train_prime = add_intercept(X_train)
    X_val_prime = add_intercept(X_val)
    best_model, val_accuracy = train_model(X_train_prime, y_train, X_val_prime, y_val, epoch=3000, learning_rate=0.01)

    X_test = test_data.iloc[:, :-1].values
    y_test = test_data.iloc[:, -1].values.reshape(-1, 1)
    X_test_prime = add_intercept(X_test)
    test_accuracy = evaluate_model(X_test_prime, y_test, best_model)
    test_accuracies.append(test_accuracy)
    val_accuracies.append(val_accuracy)

print("Test accuracies:", test_accuracies)
print("Validation accuracies:", val_accuracies)


Test accuracies: [0.7937024972855592, 0.7785016286644951, 0.747014115092291, 0.7676438653637351, 0.7806731813246471]
Validation accuracies: [0.8097826086956522, 0.8070652173913043, 0.7717391304347826, 0.7785326086956522, 0.8029891304347826]


# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

def performance(model, x, y):
    predictions = model.predict(x)
    accuracy = accuracy_score(y, predictions)
    print('Accuracy:', accuracy)
    return accuracy

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
performance(model, X_test, y_test)


Accuracy: 1.0


1.0