<a href="https://colab.research.google.com/github/Vite-Noiz/Machine-Learning-Course/blob/main/My%20HW/MiniP2/MiniP2_Q1_partC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import csv
import math
import re
from collections import defaultdict

# Read spam.csv into `data` as a list of (label, text) tuples.
data = []
# newline='' hands line-ending handling to the csv module, which is what the
# csv documentation requires so that quoted fields containing embedded
# newlines are parsed correctly.
with open('spam.csv', 'r', encoding='Windows-1252', newline='') as f:
    reader = csv.reader(f)
    # Skip the first row (presumably the column header); the default keeps an
    # empty file from raising StopIteration.
    next(reader, None)
    for row in reader:
        # Blank or truncated rows have no usable (label, text) pair, so skip
        # them instead of failing with IndexError on row[1].
        if len(row) >= 2:
            label = row[0]
            text = row[1]
            data.append((label, text))

# Text Cleaning
def clean_text(text):
    """Return `text` with punctuation removed and letters lowercased.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is dropped first; the remainder is then
    lowercased. Whitespace is left untouched.
    """
    return re.sub(r'[^\w\s]', '', text).lower()

# Build the cleaned dataset: labels are kept as-is, message text is normalized.
cleaned_data = [(lbl, clean_text(raw_text)) for lbl, raw_text in data]

In [None]:
# Splitting data: the first 80% of rows (in file order, without shuffling)
# become the training set and the remaining rows become the test set.
split_ratio = 0.8
split_idx = int(split_ratio * len(cleaned_data))
train_data, test_data = cleaned_data[:split_idx], cleaned_data[split_idx:]

In [None]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes text classifier with add-one smoothing.

    Documents are whitespace-tokenized strings. `train` accumulates per-class
    document and word counts and derives smoothed probabilities from them;
    `predict` returns the label with the highest log-probability score.
    """

    def __init__(self):
        self.class_counts = defaultdict(int)  # number of documents seen per class
        self.word_counts = defaultdict(lambda: defaultdict(int))  # occurrences of each word per class
        self.class_probs = defaultdict(float)  # smoothed prior probability of each class
        self.word_probs = defaultdict(lambda: defaultdict(float))  # smoothed probability of each word given the class

    def train(self, train_data):
        """Update counts from `train_data` and recompute all probabilities.

        Args:
            train_data: Iterable of (label, text) pairs; `text` is split on
                whitespace to obtain words.

        Calling `train` more than once is supported: counts accumulate across
        calls and every probability is recomputed from the accumulated totals,
        so priors and per-class word distributions always sum to 1.
        """
        for label, text in train_data:
            self.class_counts[label] += 1
            label_counts = self.word_counts[label]
            for word in text.split():
                label_counts[word] += 1

        # Totals come from the accumulated state rather than from this call's
        # batch, so they stay consistent with class_counts / word_counts.
        total_docs = sum(self.class_counts.values())
        num_classes = len(self.class_counts)
        vocabulary = set()
        for label_counts in self.word_counts.values():
            vocabulary.update(label_counts)
        vocab_size = len(vocabulary)

        # Class priors with add-one smoothing.
        for label, doc_count in self.class_counts.items():
            self.class_probs[label] = (doc_count + 1) / (total_docs + num_classes)

        # Per-class word probabilities with add-one smoothing.
        for label in self.class_counts:
            label_counts = self.word_counts[label]
            denominator = sum(label_counts.values()) + vocab_size
            label_probs = self.word_probs[label]
            for word in vocabulary:
                # .get avoids inserting zero-count entries into word_counts.
                label_probs[word] = (label_counts.get(word, 0) + 1) / denominator

    def predict(self, text):
        """Return the most probable label for `text`.

        Words that were never seen during training are ignored. Returns None
        when the model has not been trained. Ties keep the label encountered
        first.
        """
        words = text.split()
        best_score = -math.inf
        best_label = None

        for label, prior in self.class_probs.items():
            label_probs = self.word_probs[label]
            # Sum logs instead of multiplying probabilities to avoid underflow.
            score = math.log(prior)
            for word in words:
                word_prob = label_probs.get(word)
                if word_prob is not None:
                    score += math.log(word_prob)
            if score > best_score:
                best_score = score
                best_label = label
        return best_label

# Training the model: create a fresh classifier and fit it on the training split.
model = MultinomialNaiveBayes()
model.train(train_data)

In [None]:
# Evaluate Model
import numpy as np
def evaluate_model(test_data, model):
    """Score `model` on labelled examples, treating 'spam' as the positive class.

    Args:
        test_data: Iterable of (true_label, text) pairs.
        model: Object with a `predict(text)` method returning a label.

    Returns:
        Tuple `(confusion_matrix, accuracy, precision, recall, f1)`.
        `confusion_matrix` is a dict with keys 'TP', 'TN', 'FP' and 'FN'; the
        metrics are fractions. Any metric whose denominator is zero (for
        example every metric on an empty test set) is reported as 0 instead
        of raising ZeroDivisionError.
    """
    def _ratio(numerator, denominator):
        # Zero denominators yield 0, matching the precision/recall/F1 convention.
        return numerator / denominator if denominator else 0

    confusion_matrix = {'TP': 0, 'TN': 0, 'FP': 0, 'FN': 0}

    for true_label, text in test_data:
        pred_label = model.predict(text)
        if true_label == 'spam':
            # Positive example: anything other than a 'spam' prediction is a miss.
            outcome = 'TP' if pred_label == 'spam' else 'FN'
        else:
            # Negative example: anything other than a 'ham' prediction is a false alarm.
            outcome = 'TN' if pred_label == 'ham' else 'FP'
        confusion_matrix[outcome] += 1

    tp = confusion_matrix['TP']
    tn = confusion_matrix['TN']
    fp = confusion_matrix['FP']
    fn = confusion_matrix['FN']
    # Every example lands in exactly one cell, so the cell total is the number
    # of examples; this also works for iterables that do not support len().
    total = tp + tn + fp + fn

    accuracy = _ratio(tp + tn, total)
    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    f1 = _ratio(2 * (precision * recall), precision + recall)

    return confusion_matrix, accuracy, precision, recall, f1

# Score the trained model on the test split; acc/prec/rec/f1 are fractions, not percentages.
conf_matrix, acc, prec, rec, f1 = evaluate_model(test_data, model)

In [None]:
# Accuracy
import numpy as np
# Recompute accuracy directly by comparing each prediction with its true label.
correct = 0
total = len(test_data)
for label, text in test_data:
    prediction = model.predict(text)
    if prediction == label:
        correct += 1

# An empty test set has no meaningful accuracy; report 0 instead of dividing by zero.
accuracy = (correct / total) * 100 if total else 0.0
print(f'Accuracy of Model is: {accuracy:.2f}%')

# Report the metrics computed by evaluate_model, scaled to percentages.
print("- Confusion Matrix:")
print(f"  TP: {conf_matrix['TP']} | FP: {conf_matrix['FP']}")
print(f"  FN: {conf_matrix['FN']} | TN: {conf_matrix['TN']}")
print(f"- Accuracy: {acc * 100:.2f}%")
print(f"- Precision: {prec * 100:.2f}%")
print(f"- Recall: {rec * 100:.2f}%")
print(f"- F1 score: {f1 * 100:.2f}%")

Accuracy of Model is: 98.21%
- Confusion Matrix:
  TP: 131 | FP: 6
  FN: 14 | TN: 964
- Accuracy: 98.21%
- Precision: 95.62%
- Recall: 90.34%
- F1 score: 92.91%
