In [1]:
import os
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load training data
df = pd.read_csv('spam.csv', encoding='latin-1')
df = df[['v1', 'v2']] 
df.columns = ['label','message']
df['label'] = df['label'].map({'spam': 1, 'ham': 0})

In [3]:
test = df.tail(200)
test

Unnamed: 0,label,message
5346,0,"My Parents, My Kidz, My Friends n My Colleague..."
5347,0,No sir. That's why i had an 8-hr trip on the b...
5348,0,Do I? I thought I put it back in the box
5349,0,I'm home...
5350,0,No one interested. May be some business plan.
...,...,...
5541,0,U still havent got urself a jacket ah?
5542,0,"I'm taking derek &amp; taylor to walmart, if I..."
5543,0,Hi its in durban are you still on this number
5544,0,Ic. There are a lotta childporn cars then.


In [73]:
test['mail_id'] = np.zeros(200)
for i in range(5346,5546): 
    test['mail_id'][i] = f'test{i-5346}.txt'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test['mail_id'] = np.zeros(200)
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returnin

In [75]:
test

Unnamed: 0,label,message,mail_id
5346,0,"My Parents, My Kidz, My Friends n My Colleague...",test0.txt
5347,0,No sir. That's why i had an 8-hr trip on the b...,test1.txt
5348,0,Do I? I thought I put it back in the box,test2.txt
5349,0,I'm home...,test3.txt
5350,0,No one interested. May be some business plan.,test4.txt
...,...,...,...
5541,0,U still havent got urself a jacket ah?,test195.txt
5542,0,"I'm taking derek &amp; taylor to walmart, if I...",test196.txt
5543,0,Hi its in durban are you still on this number,test197.txt
5544,0,Ic. There are a lotta childporn cars then.,test198.txt


In [4]:
df.shape

(5546, 2)

In [5]:
train = df.head(5346)

In [6]:
train

Unnamed: 0,label,message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5341,0,And of course you should make a stink!
5342,1,u r subscribed 2 TEXTCOMP 250 wkly comp. 1st w...
5343,0,No go. No openings for that room 'til after th...
5344,0,When you guys planning on coming over?


In [47]:
t=test['message'].iloc[0]
# data for data in books

In [52]:
for i in range(len(test)):
    with open(f'testing_mail/test{i}.txt', 'w', encoding='utf-8') as file: 
        
        books = [test['message'].iloc[i]] 
        
        file.writelines("% s\n" % books[0]) 

In [9]:
# Custom CountVectorizer class
class SimpleCountVectorizer:
    def __init__(self):
        self.vocabulary_ = {}
    
    def fit(self, documents):
        vocabulary = set()
        for doc in documents:
            tokens = self._tokenize(doc)
            vocabulary.update(tokens)
        
        self.vocabulary_ = {word: idx for idx, word in enumerate(sorted(vocabulary))}
    
    def transform(self, documents):
        rows = []
        for doc in documents:
            count_vector = [0] * len(self.vocabulary_)
            tokens = self._tokenize(doc)
            for token in tokens:
                if token in self.vocabulary_:
                    index = self.vocabulary_[token]
                    count_vector[index] += 1
            rows.append(count_vector)
        return np.array(rows)
    
    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)
    
    def _tokenize(self, text):
        text = re.sub(r'\W+', ' ', text.lower())
        text = re.sub(r'\s+', ' ', text.lower())
        return text.split()

In [10]:
# Building Naive Bayes from Scratch
class NaiveBayesClassifier:
    def __init__(self):
        self.word_probs = {}
        self.class_probs = {}

    def train(self, X, y):
        n_samples, n_features = X.shape
        n_spam = np.sum(y == 1)
        n_ham = np.sum(y == 0)
        
        self.class_probs[1] = n_spam / n_samples
        self.class_probs[0] = n_ham / n_samples

        # Calculate conditional probabilities
        spam_counts = X[y == 1].sum(axis=0) + 1  # Laplace smoothing
        ham_counts = X[y == 0].sum(axis=0) + 1  # Laplace smoothing
        total_spam = spam_counts.sum()
        total_ham = ham_counts.sum()

        self.word_probs[1] = np.log(spam_counts / total_spam)
        self.word_probs[0] = np.log(ham_counts / total_ham)

    def predict(self, X):
        predictions = []
        for x in X:
            spam_score = np.sum(x@(self.word_probs[1])) + np.log(self.class_probs[1])
            ham_score = np.sum(x@(self.word_probs[0])) + np.log(self.class_probs[0])
            predictions.append(1 if spam_score > ham_score else 0)
        return np.array(predictions)

In [55]:
# Function to classify emails in the 'test_mails' folder
def classify_emails(test_folder="testing_mail/"):
    # Load training data
    data = pd.read_csv('spam.csv', encoding='latin-1')
    data = data[['v1', 'v2']] 
    data.columns = ['label','message']
    data['label'] = data['label'].map({'spam': 1, 'ham': 0})
    

    # Preprocess and vectorize training data
    vectorizer = SimpleCountVectorizer()
    X_train = vectorizer.fit_transform(data['message'])
    y_train = data['label'].values

    # Train the Naive Bayes classifier
    nb_classifier = NaiveBayesClassifier()
    nb_classifier.train(X_train, y_train)

    # Read test emails from the folder and classify each one
    results = {}
    for filename in os.listdir(test_folder):
        if filename.endswith(".txt"):
            with open(os.path.join(test_folder, filename), 'r', encoding='utf-8') as file:
                email_content = file.read()
                
                # Preprocess and vectorize the email content
                X_test = vectorizer.transform([email_content])
                
                # Predict spam or non-spam
                prediction = nb_classifier.predict(X_test)[0]
                
                # Store the result
                results[filename] = "spam" if prediction == 1 else "ham"

    return results

In [56]:
results = classify_emails()

In [57]:
results

{'test0.txt': 'ham',
 'test1.txt': 'ham',
 'test10.txt': 'ham',
 'test100.txt': 'ham',
 'test101.txt': 'ham',
 'test102.txt': 'ham',
 'test103.txt': 'spam',
 'test104.txt': 'ham',
 'test105.txt': 'ham',
 'test106.txt': 'ham',
 'test107.txt': 'ham',
 'test108.txt': 'ham',
 'test109.txt': 'ham',
 'test11.txt': 'ham',
 'test110.txt': 'spam',
 'test111.txt': 'ham',
 'test112.txt': 'ham',
 'test113.txt': 'ham',
 'test114.txt': 'spam',
 'test115.txt': 'ham',
 'test116.txt': 'spam',
 'test117.txt': 'ham',
 'test118.txt': 'ham',
 'test119.txt': 'ham',
 'test12.txt': 'ham',
 'test120.txt': 'spam',
 'test121.txt': 'spam',
 'test122.txt': 'spam',
 'test123.txt': 'ham',
 'test124.txt': 'ham',
 'test125.txt': 'ham',
 'test126.txt': 'ham',
 'test127.txt': 'ham',
 'test128.txt': 'ham',
 'test129.txt': 'ham',
 'test13.txt': 'ham',
 'test130.txt': 'ham',
 'test131.txt': 'ham',
 'test132.txt': 'ham',
 'test133.txt': 'ham',
 'test134.txt': 'ham',
 'test135.txt': 'ham',
 'test136.txt': 'spam',
 'test137.t

In [76]:
test_mm = list(results.keys())

In [77]:
test_pp = list(results.values())

In [78]:
ree = pd.DataFrame({'mail_id': test_mm,'predict': test_pp})

In [79]:
ree

Unnamed: 0,mail_id,predict
0,test0.txt,ham
1,test1.txt,ham
2,test10.txt,ham
3,test100.txt,ham
4,test101.txt,ham
...,...,...
195,test95.txt,ham
196,test96.txt,ham
197,test97.txt,spam
198,test98.txt,ham


In [17]:
tesst = pd.read_csv('test.txt')
tesst.head()

Unnamed: 0,original,test_mail
0,ham,test1.txt
1,ham,test2.txt
2,ham,test3.txt
3,ham,test4.txt
4,ham,test5.txt


In [80]:
final = pd.merge(test,ree,on='mail_id')
final

Unnamed: 0,label,message,mail_id,predict
0,0,"My Parents, My Kidz, My Friends n My Colleague...",test0.txt,ham
1,0,No sir. That's why i had an 8-hr trip on the b...,test1.txt,ham
2,0,Do I? I thought I put it back in the box,test2.txt,ham
3,0,I'm home...,test3.txt,ham
4,0,No one interested. May be some business plan.,test4.txt,ham
...,...,...,...,...
195,0,U still havent got urself a jacket ah?,test195.txt,ham
196,0,"I'm taking derek &amp; taylor to walmart, if I...",test196.txt,ham
197,0,Hi its in durban are you still on this number,test197.txt,ham
198,0,Ic. There are a lotta childporn cars then.,test198.txt,ham


In [81]:
final['original'] = final['label'].map({0:'ham',1:'spam'})

In [86]:
final['predict'].value_counts()

predict
ham     177
spam     23
Name: count, dtype: int64

In [87]:
j = 0
for i in range(len(final)):
    if final['original'][i] == final['predict'][i]:
        j+=1

accu = j/len(final)*100
print(f'Accuracy on test folder\'s mail dataset is {accu}%')

Accuracy on test folder's mail dataset is 99.5%
