In [20]:
from math import sqrt  # For KNN Euclidean distance

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB


def load_data():
  """Load the spam dataset and separate the features from the labels.

  Reads ``../content/spambase.csv`` and treats the ``spam`` column as the
  label; every other column is returned as a feature.

  Returns:
    A ``(X, y)`` tuple where ``X`` is the DataFrame without the ``spam``
    column and ``y`` is the ``spam`` column.
  """
  data = pd.read_csv("../content/spambase.csv")
  X = data.drop("spam", axis=1)  # Features
  y = data["spam"]  # Labels
  # Only the full X and y are returned; splitting into train/test sets is
  # left to the caller.
  return X, y

# Naive Bayes

1. model learning:

   Note:

   features: remove the attributes that are not related to words (the last four attributes)

   labels: the last column

   count P(c) -> how many samples are positive, and how many are negative

   if freq_word>0, then this word exists. You could use this to calculate P(a|c) -> for each class, what is the prob of each word

   remember to use laplace smoothing.

2. model evaluation (on val dataset):
   
   for each new sample, $\prod{P(a|c)}P(c)$; and find the maximum class
   

   

In [21]:
def naive_bayes(X_train, y_train, X_test, y_test=None):
  """Train a Bernoulli Naive Bayes classifier and score it on the test set.

  Args:
    X_train: Training features.
    y_train: Training labels.
    X_test: Features of the samples to classify.
    y_test: True labels for ``X_test``. Needed to compute the metrics, so
      it must be supplied even though it is declared last with a default
      (the default only keeps the three-argument call syntactically valid).

  Returns:
    A tuple ``(accuracy, precision, recall, f1)``.

  Raises:
    ValueError: If ``y_test`` is not provided.
  """
  if y_test is None:
    raise ValueError("y_test is required to evaluate the predictions")

  # With its defaults BernoulliNB treats every feature > 0 as "word present"
  # (binarize=0.0) and applies Laplace smoothing (alpha=1.0).
  model = BernoulliNB()
  model.fit(X_train, y_train)
  predicted_labels = model.predict(X_test)

  # Computed one after another: F1 needs precision and recall to exist first.
  accuracy = calculate_accuracy(y_test, predicted_labels)
  precision = calculate_precision(y_test, predicted_labels)
  recall = calculate_recall(y_test, predicted_labels)
  f1 = calculate_f1_score(precision, recall)

  return accuracy, precision, recall, f1

# KNN
1. model learning: None

2. model evaluation (on val dataset): You could use each row (excluding the last column) as the features of the email. You do not have to recalculate the frequency.

   ```
   Note:
   parallel programming
   numpy.cos() to calculate the similarity
   ```

In [22]:
def knn(X_train, y_train, X_test, k):
  """Classify each test sample by majority vote of its k nearest neighbours.

  Distances are Euclidean and use every feature column; the neighbour
  labels come from ``y_train``.

  Args:
    X_train: Training features (DataFrame, array or nested list), one row
      per sample.
    y_train: Training labels, in the same row order as ``X_train``.
    X_test: Features of the samples to classify, same columns as ``X_train``.
    k: Number of neighbours that vote; must be at least 1.

  Returns:
    A list with one predicted label per row of ``X_test``.

  Raises:
    ValueError: If ``k`` is smaller than 1 or the number of labels does not
      match the number of training rows.
  """
  if k < 1:
    raise ValueError("k must be at least 1")

  train_features = np.asarray(X_train, dtype=float)
  train_labels = np.asarray(y_train)
  test_features = np.asarray(X_test, dtype=float)

  if len(train_features) != len(train_labels):
    raise ValueError("X_train and y_train must have the same number of rows")

  predicted_labels = []
  for test_instance in test_features:
    # One vectorised pass gives the distance to every training row.
    distances = np.sqrt(((train_features - test_instance) ** 2).sum(axis=1))
    # A stable sort keeps equally distant rows in training order.
    nearest = np.argsort(distances, kind="stable")[:k]
    neighbor_labels = train_labels[nearest].tolist()
    # Iterating the list (nearest first) makes vote ties resolve in favour
    # of the label whose neighbour is closest.
    most_frequent_label = max(neighbor_labels, key=neighbor_labels.count)
    predicted_labels.append(most_frequent_label)

  return predicted_labels

# LR

1. model learning: You could use each row (excluding the last column) as the features of the email. You do not have to recalculate the frequency.
    
step 1: add one more column (all value is 1) in X -> X' = np.c_[np.ones((len(X), 1)), X]

step 2:vector M = np.random.randn(len(X[0])+1, 1);

key formula for step 3 (Note: n is the size of the dataset):

1. $pred_y = sigmoid(X'M)$

2. $loss = -\sum(y\cdot log(pred_y)+(1-y)\cdot log(1-pred_y))/n$

3. $gm=x\cdot (pred_y - y)*2/n$

Step 3 example code:
   ```
   #Step 3: performing gradient descent on whole dataset:
   best_model = M
   small_loss = 999999
   for i in range(epoch):
     pred_y = ...
     gm = ...
     loss = ...
     if loss < small_loss:
        best_model = M
        small_loss = loss
     M = M - learning_rate*gm
   ```

2. model evaluation(on val dataset):
  
   calculate pred_y, if more than 0.5, then the predicted label is 1.

In [23]:
def logistic_regression(learning_rate, epochs, X=None, y=None, random_state=42):
  """Fit logistic-regression weights with batch gradient descent.

  The data is split 80/20 and only the training part is used for fitting.

  Args:
    learning_rate: Step size of each gradient-descent update.
    epochs: Number of full passes over the training set.
    X: Feature matrix. When ``X`` or ``y`` is omitted the dataset is read
      with ``load_data()``.
    y: Binary labels (0/1) matching the rows of ``X``.
    random_state: Seed for both the train/test split and the random weight
      initialisation, so repeated runs give the same weights.

  Returns:
    The final weight vector as an array of shape ``(n_features + 1, 1)``;
    row 0 is the bias term.
  """
  if X is None or y is None:
    X, y = load_data()

  X_train, _, y_train, _ = train_test_split(
      X, y, test_size=0.2, random_state=random_state)

  # Prepend a column of ones (X') so the bias is learned as weight 0.
  features = np.c_[np.ones((len(X_train), 1)), np.asarray(X_train, dtype=float)]
  # Column vector, so it lines up with the (n, 1) predictions below.
  targets = np.asarray(y_train, dtype=float).reshape(-1, 1)
  n_samples = len(targets)

  # One weight per column of X' (the bias column is already included).
  rng = np.random.default_rng(random_state)
  m = rng.standard_normal((features.shape[1], 1))

  for _ in range(epochs):
    # Predicted probabilities for the current weights.
    pred_y = sigmoid(features.dot(m))

    # Gradient as given in the notebook formula (gm = x . (pred_y - y) * 2 / n).
    # The cross-entropy loss itself is not needed for the update, so it is
    # not evaluated here.
    gm = features.T.dot(pred_y - targets) * 2 / n_samples

    # Gradient-descent step.
    m -= learning_rate * gm

  return m


def sigmoid(x):
  """Numerically stable logistic function ``1 / (1 + exp(-x))``.

  Args:
    x: A number or array-like of numbers.

  Returns:
    The sigmoid of ``x``: a NumPy scalar for scalar input, otherwise an
    array with the same shape as ``x``.
  """
  x = np.asarray(x, dtype=float)
  # exp(-|x|) always lies in (0, 1], so it cannot overflow however large
  # |x| is (exp(-x) alone overflows for strongly negative x).
  decay = np.exp(-np.abs(x))
  result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))
  # Indexing with () turns a 0-d array back into a scalar and leaves
  # higher-dimensional arrays unchanged.
  return result[()]

def predict(X_test, model_weights):
  """Label samples as spam (1) or not spam (0) with trained weights.

  A column of ones is prepended to ``X_test`` for the bias weight, the
  sigmoid of the weighted sum gives a probability, and probabilities
  strictly above 0.5 are labelled 1.
  """
  bias_column = np.ones((len(X_test), 1))
  design_matrix = np.c_[bias_column, X_test]
  spam_probability = sigmoid(design_matrix.dot(model_weights))
  is_spam = spam_probability > 0.5  # Threshold at 0.5 for spam/not spam
  return is_spam.astype(int)



# Model Evaluation

https://scikit-learn.org/stable/modules/model_evaluation.html

In [24]:
def calculate_accuracy(true_labels, predicted_labels):
  """Compute the fraction of predictions that match the true labels.

  Labels are compared position by position, so plain lists, arrays and
  pandas Series (whatever their index) are all accepted.

  Args:
    true_labels: Ground-truth labels.
    predicted_labels: Predicted labels, in the same order.

  Returns:
    The accuracy, or 0.0 when there are no labels. The value is also
    printed.

  Raises:
    ValueError: If the two sequences have different lengths.
  """
  total = len(true_labels)
  if len(predicted_labels) != total:
    raise ValueError("true_labels and predicted_labels must have the same length")

  if total == 0:
    accuracy = 0.0  # Nothing to score; avoid dividing by zero.
  else:
    correct_predictions = sum(
        1 for label, pred in zip(true_labels, predicted_labels) if label == pred
    )
    accuracy = correct_predictions / total
  print(accuracy)
  return accuracy


def calculate_precision(true_labels, predicted_labels):
  """Compute precision: true positives / predicted positives.

  Args:
    true_labels: Ground-truth labels (1 marks the positive class).
    predicted_labels: Predicted labels, in the same order.

  Returns:
    The precision; 0.0 when nothing was predicted positive. The value is
    also printed.
  """
  predicted_positives = 0
  true_positives = 0
  for pred, label in zip(predicted_labels, true_labels):
    if pred == 1:
      predicted_positives += 1
      if label == 1:
        true_positives += 1
  # The denominator is the number of *predicted* positives; dividing by the
  # number of actual positives would give recall instead.
  precision = true_positives / (predicted_positives + 1e-10)  # Avoid division by zero
  print(precision)
  return precision


def calculate_recall(true_labels, predicted_labels):
  """Compute recall: true positives / actual positives.

  The result is printed and returned. A tiny constant in the denominator
  keeps the division defined when there are no positive labels.
  """
  actual_positive_count = sum(true_labels)

  hit_count = 0
  for label, pred in zip(true_labels, predicted_labels):
    # A hit is a sample that is predicted positive and really is positive.
    if pred == 1 and label == 1:
      hit_count += 1

  recall = hit_count / (actual_positive_count + 1e-10)
  print(recall)
  return recall


def calculate_f1_score(precision, recall):
  """Compute the F1 score, the harmonic mean of precision and recall.

  The result is printed and returned. A tiny constant in the denominator
  keeps the division defined when both inputs are zero.
  """
  product = precision * recall
  total = precision + recall + 1e-10
  f1 = 2 * product / total
  print(f1)
  return f1


def calculate_auc(true_labels, predicted_probabilities):
  """Compute the area under the ROC curve (AUC).

  The samples are swept from the highest to the lowest score. Each distinct
  score adds one point (FPR, TPR) to the ROC curve, and the area between
  consecutive points is accumulated with the trapezoidal rule. Samples that
  share a score are processed together, so ties contribute a diagonal
  segment and their input order does not matter.

  Args:
    true_labels: Ground-truth labels; 1 is the positive class and anything
      else is counted as negative.
    predicted_probabilities: Scores for the positive class, in the same
      order as ``true_labels``.

  Returns:
    The AUC as a float between 0 and 1. The value is also printed.

  Raises:
    ValueError: If the labels do not contain both a positive and a negative
      sample (the ROC curve is undefined in that case).
  """
  # Highest score first; sort on the score only so labels are never compared.
  scored = sorted(
      zip(predicted_probabilities, true_labels),
      key=lambda pair: pair[0],
      reverse=True,
  )

  total_positives = sum(1 for _, label in scored if label == 1)
  total_negatives = len(scored) - total_positives
  if total_positives == 0 or total_negatives == 0:
    raise ValueError("AUC needs at least one positive and one negative label")

  auc = 0.0
  true_positives = 0
  false_positives = 0
  last_fpr = 0.0  # The ROC curve starts at (0, 0).
  last_tpr = 0.0

  i = 0
  n = len(scored)
  while i < n:
    # Consume every sample that shares the current score before adding a
    # point, so tied samples form a single ROC segment.
    threshold = scored[i][0]
    while i < n and scored[i][0] == threshold:
      if scored[i][1] == 1:
        true_positives += 1
      else:
        false_positives += 1
      i += 1

    tpr = true_positives / total_positives
    fpr = false_positives / total_negatives

    # Trapezoid between the previous ROC point and this one.
    auc += (fpr - last_fpr) * (last_tpr + tpr) / 2
    last_fpr = fpr
    last_tpr = tpr

  print(auc)
  return auc

