# Naive Bayes
#### Assumes that all features are conditionally independent of each other, given the class

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Toy corpus: eight short emails, listed per class and then combined so that
# the four spam messages come first and the four ham messages follow.
spam_texts = [
    'send money now',
    'urgent money prize',
    'you have won a prize',
    'call me for your prize',
]
ham_texts = [
    'hello how are you',
    'let us meet tomorrow',
    'lunch meeting tomorrow?',
    'congratulations you won',
]

data = {
    'text': spam_texts + ham_texts,
    'label': ['spam'] * len(spam_texts) + ['ham'] * len(ham_texts),
}

df = pd.DataFrame(data)

# Features (X) are the raw email strings; the target (y) is the class label.
X, y = df['text'], df['label']

print("Original Data:")
print(df)

Original Data:
                      text label
0           send money now  spam
1       urgent money prize  spam
2     you have won a prize  spam
3   call me for your prize  spam
4        hello how are you   ham
5     let us meet tomorrow   ham
6  lunch meeting tomorrow?   ham
7  congratulations you won   ham


Converting text to numbers
- each row represents an email
- each column represents a word from the vocabulary

In [2]:
# Bag-of-words encoder: one column per distinct token, values are token counts.
vectorizer = CountVectorizer()

# Learn the vocabulary from every email in X and encode them in one step.
# Note: this includes emails that may later be held out for testing.
X_vectorized = vectorizer.fit_transform(X)

# The learned vocabulary, i.e. the column names of the matrix below.
vocabulary = vectorizer.get_feature_names_out()
print("\nVocabulary (Features):", vocabulary, sep="\n")

# The result is a sparse matrix; densify it only for display.
dense_counts = X_vectorized.toarray()
print("\nVectorized Data (Document-Term Matrix):", dense_counts, sep="\n")


Vocabulary (Features):
['are' 'call' 'congratulations' 'for' 'have' 'hello' 'how' 'let' 'lunch'
 'me' 'meet' 'meeting' 'money' 'now' 'prize' 'send' 'tomorrow' 'urgent'
 'us' 'won' 'you' 'your']

Vectorized Data (Document-Term Matrix):
[[0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0]
 [0 1 0 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 1]
 [1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0]
 [0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 0 0 0]
 [0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 0 0]
 [0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0]]


In [3]:
# Split the raw emails first (test_size=0.2 of 8 emails holds out 2 for testing),
# so the held-out emails stay unseen by every fitted step, including the vectorizer.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-fit the vectorizer on the training emails only. Learning the vocabulary from
# the full corpus would leak test-set words into the feature space (and into the
# vocabulary size used by the Naive Bayes smoothing), biasing the evaluation.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train_text)

# Test emails are only transformed: words never seen during training are ignored,
# which is also what happens to brand-new emails at prediction time.
X_test = vectorizer.transform(X_test_text)

In [4]:
# Build a Multinomial Naive Bayes classifier and fit it on the training split;
# fit() returns the estimator itself, so construction and training can be chained.
model = MultinomialNB().fit(X_train, y_train)

print("\nModel trained successfully!")


Model trained successfully!


In [5]:
# Predict labels for the held-out emails.
y_pred = model.predict(X_test)

# Compute all evaluation artefacts up front, then print them together.
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
matrix = confusion_matrix(y_test, y_pred)

# Overall fraction of correctly classified test emails.
print(f"\nAccuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 and support.
print("\nClassification Report:", report, sep="\n")

# Rows are true labels, columns are predicted labels.
print("\nConfusion Matrix:", matrix, sep="\n")


Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00         1
        spam       1.00      1.00      1.00         1

    accuracy                           1.00         2
   macro avg       1.00      1.00      1.00         2
weighted avg       1.00      1.00      1.00         2


Confusion Matrix:
[[1 0]
 [0 1]]


In [6]:
# Unseen emails, with the label we expect the classifier to assign.
new_emails = [
    "congratulations you have won a cash prize",  # expected: spam
    "Can we meet for lunch tomorrow",             # expected: ham
]

# Encode with the already-fitted vectorizer (transform only, never re-fit here)
# so the columns line up with the features the model was trained on.
new_emails_vectorized = vectorizer.transform(new_emails)

# Classify the encoded emails.
predictions = model.predict(new_emails_vectorized)

# Report each email next to its upper-cased predicted label.
for text, label in zip(new_emails, predictions):
    shouted = label.upper()
    print(f"\nEmail: '{text}'\nPredicted Label: **{shouted}**")


Email: 'congratulations you have won a cash prize'
Predicted Label: **SPAM**

Email: 'Can we meet for lunch tomorrow'
Predicted Label: **HAM**
