In [1]:
# Task 1: Theory Questions 
'''
Ques.1 > What is the core assumption of Naive Bayes?
Ans.> Naive Bayes assumes that all features are conditionally independent of each other given
 the class label. This "naive" assumption simplifies computation and makes the model scalable, 
 even though it rarely holds true in practice.

Ques.2 > Differentiate between GaussianNB, MultinomialNB, and BernoulliNB.
Ans.> GaussianNB assumes features follow a normal (Gaussian) distribution and is used for continuous data.

MultinomialNB is designed for discrete features like word counts and is commonly used in text classification.

BernoulliNB works with binary/boolean features, focusing on the presence or absence of terms.

Ques.3 > Why is Naive Bayes considered suitable for high-dimensional data?
Ans.> Naive Bayes is efficient and performs well with high-dimensional data because it
assumes feature independence: each feature's likelihood is estimated separately, so the number of
parameters grows only linearly with the number of features. This simplicity also makes
it less prone to overfitting even with a large number of features.
'''

'\nQues.1 > What is the core assumption of Naive Bayes?\nAns.> Naive Bayes assumes that all features are conditionally independent of each other given\n the class label. This "naive" assumption simplifies computation and makes the model scalable, \n even though it rarely holds true in practice.\n\nQues.2 > Differentiate between GaussianNB, MultinomialNB, and BernoulliNB.\nAns.> GaussianNB assumes features follow a normal (Gaussian) distribution and is used for continuous data.\n\nMultinomialNB is designed for discrete features like word counts and is commonly used in text classification.\n\nBernoulliNB works with binary/boolean features, focusing on the presence or absence of terms.\n\nQues.3 > Why is Naive Bayes considered suitable for high-dimensional data?\nAns.> Naive Bayes is efficient and performs well with high-dimensional data because it\nassumes feature independence, reducing the complexity of parameter estimation. Its simplicity makes\nit less prone to overfitting even with a

In [2]:
# Task 2: Spam Detection using MultinomialNB
import pandas as pd

# Read the SMS spam corpus straight from its public GitHub location.
# The file is tab-separated (read_table's default delimiter) and is read with
# header=None, so the two column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_table(url, header=None, names=['label', 'message'])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Encode the text labels as integers (ham -> 0, spam -> 1).
# A plain `.map(dict)` turns every value that is not a key of the dict into
# NaN, so re-running this cell (when the column already holds 0/1) would wipe
# out all labels. Falling back to the current value keeps the cell idempotent.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda value: label_codes.get(value, value))

# Hold out 20% of the messages for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['message'], df['label'], test_size=0.2, random_state=42
)

# Build the vocabulary from the training messages only, then apply that same
# vocabulary to the test messages.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-win_amd64.whl (11.2 MB)
Collecting joblib>=1.2.0
  Downloading joblib-1.5.1-py3-none-any.whl (307 kB)
Collecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.13.1-cp39-cp39-win_amd64.whl (46.2 MB)
Installing collected packages: threadpoolctl, scipy, joblib, scikit-learn
Successfully installed joblib-1.5.1 scikit-learn-1.6.1 scipy-1.13.1 threadpoolctl-3.6.0
Note: you may need to restart the kernel to use updated packages.


In [6]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the training count vectors
# (fit() hands back the estimator itself), then label the held-out messages.
model = MultinomialNB().fit(X_train_vec, y_train)
y_pred = model.predict(X_test_vec)

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# Score the spam predictions against the true test labels
conf_matrix = confusion_matrix(y_test, y_pred)
recall = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Report the scalar scores to four decimals, followed by the raw confusion matrix
for report_line in (
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
):
    print(report_line)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.9919
Precision: 1.0000
Recall: 0.9396
Confusion Matrix:
 [[966   0]
 [  9 140]]


In [9]:
# Task 3: GaussianNB with Iris Dataset
from sklearn.datasets import load_iris

# Fetch the Iris dataset and separate the feature matrix from the class labels
iris = load_iris()
X = iris.data
y = iris.target

# 80/20 split with a fixed seed.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which previously held
# the SMS spam split from Task 2.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [10]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Fit Gaussian Naive Bayes on the Iris training split
gnb = GaussianNB().fit(X_train, y_train)

# Predict the held-out samples and measure the share classified correctly
y_pred_gnb = gnb.predict(X_test)
acc_gnb = accuracy_score(y_test, y_pred_gnb)

print(f"GaussianNB Accuracy: {acc_gnb:.4f}")

GaussianNB Accuracy: 1.0000


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

# Logistic Regression baseline; the solver is allowed up to 200 iterations
lr = LogisticRegression(max_iter=200)
lr.fit(X_train, y_train)
acc_lr = accuracy_score(y_test, lr.predict(X_test))

# Decision Tree baseline. Tree construction involves randomness, so the seed is
# pinned (same value as the train/test splits) to make the fitted tree and its
# reported accuracy reproducible from run to run.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
acc_dt = accuracy_score(y_test, dt.predict(X_test))

print(f"Logistic Regression Accuracy: {acc_lr:.4f}")
print(f"Decision Tree Accuracy: {acc_dt:.4f}")

Logistic Regression Accuracy: 1.0000
Decision Tree Accuracy: 1.0000


In [12]:
# Summarise the three Iris test accuracies as one aligned block of text
comparison_rows = [
    "Model Comparison on Iris Dataset:",
    f"GaussianNB         : {acc_gnb:.4f}",
    f"Logistic Regression: {acc_lr:.4f}",
    f"Decision Tree      : {acc_dt:.4f}",
]
print("\n".join(comparison_rows))

Model Comparison on Iris Dataset:
GaussianNB         : 1.0000
Logistic Regression: 1.0000
Decision Tree      : 1.0000
