In [None]:
### Q2
Bernoulli Naive Bayes uses binary features (presence/absence), while Multinomial Naive Bayes uses feature counts or frequencies. 
The former is suited for binary data, and the latter for count data.

### Q3
Bernoulli Naive Bayes typically treats missing values as feature absences (0), assuming missing data indicates the absence of a feature.

### Q4
Yes, Gaussian Naive Bayes can be used for multi-class classification, as it models each class with a Gaussian distribution and applies the 
Naive Bayes algorithm to each class.

In [21]:
# Given probabilities
P_H = 0.7  # 70% of employees use the health insurance plan
P_S_given_H = 0.4  # 40% of employees who use the plan are smokers

# Calculate P(S|H)
P_S_given_H = P_S_given_H

print("Probability that an employee is a smoker given that they use the health insurance plan:", P_S_given_H)

Probability that an employee is a smoker given that they use the health insurance plan: 0.4


In [None]:
# Here is the code that uses cross-validation: 

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
from sklearn.naive_bayes import BernoulliNB, MultinomialNB, GaussianNB
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the Spambase dataset form UCI 
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data'
df = pd.read_csv(url)

In [11]:
df.head()

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.41,0.42,0.43,0.778,0.44,0.45,3.756,61,278,1
0,0.21,0.28,0.5,0.0,0.14,0.28,0.21,0.07,0.0,0.94,...,0.0,0.132,0.0,0.372,0.18,0.048,5.114,101,1028,1
1,0.06,0.0,0.71,0.0,1.23,0.19,0.19,0.12,0.64,0.25,...,0.01,0.143,0.0,0.276,0.184,0.01,9.821,485,2259,1
2,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.137,0.0,0.137,0.0,0.0,3.537,40,191,1
3,0.0,0.0,0.0,0.0,0.63,0.0,0.31,0.63,0.31,0.63,...,0.0,0.135,0.0,0.135,0.0,0.0,3.537,40,191,1
4,0.0,0.0,0.0,0.0,1.85,0.0,0.0,1.85,0.0,0.0,...,0.0,0.223,0.0,0.0,0.0,0.0,3.0,15,54,1


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       4600 non-null   float64
 1   0.64    4600 non-null   float64
 2   0.64.1  4600 non-null   float64
 3   0.1     4600 non-null   float64
 4   0.32    4600 non-null   float64
 5   0.2     4600 non-null   float64
 6   0.3     4600 non-null   float64
 7   0.4     4600 non-null   float64
 8   0.5     4600 non-null   float64
 9   0.6     4600 non-null   float64
 10  0.7     4600 non-null   float64
 11  0.64.2  4600 non-null   float64
 12  0.8     4600 non-null   float64
 13  0.9     4600 non-null   float64
 14  0.10    4600 non-null   float64
 15  0.32.1  4600 non-null   float64
 16  0.11    4600 non-null   float64
 17  1.29    4600 non-null   float64
 18  1.93    4600 non-null   float64
 19  0.12    4600 non-null   float64
 20  0.96    4600 non-null   float64
 21  0.13    4600 non-null   float64
 22  

In [14]:
df.describe()

Unnamed: 0,0,0.64,0.64.1,0.1,0.32,0.2,0.3,0.4,0.5,0.6,...,0.41,0.42,0.43,0.778,0.44,0.45,3.756,61,278,1
count,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,...,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0,4600.0
mean,0.104576,0.212922,0.280578,0.065439,0.312222,0.095922,0.114233,0.105317,0.090087,0.239465,...,0.038583,0.139061,0.01698,0.26896,0.075827,0.044248,5.191827,52.17087,283.290435,0.393913
std,0.305387,1.2907,0.50417,1.395303,0.672586,0.27385,0.39148,0.401112,0.278643,0.644816,...,0.243497,0.270377,0.109406,0.815726,0.245906,0.429388,31.732891,194.912453,606.413764,0.488669
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.588,6.0,35.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.065,0.0,0.0,0.0,0.0,2.2755,15.0,95.0,0.0
75%,0.0,0.0,0.42,0.0,0.3825,0.0,0.0,0.0,0.0,0.16,...,0.0,0.188,0.0,0.31425,0.052,0.0,3.70525,43.0,265.25,1.0
max,4.54,14.28,5.1,42.81,10.0,5.88,7.27,11.11,5.26,18.18,...,4.385,9.752,4.081,32.478,6.003,19.829,1102.5,9989.0,15841.0,1.0


In [12]:
df.isnull().sum()

0         0
0.64      0
0.64.1    0
0.1       0
0.32      0
0.2       0
0.3       0
0.4       0
0.5       0
0.6       0
0.7       0
0.64.2    0
0.8       0
0.9       0
0.10      0
0.32.1    0
0.11      0
1.29      0
1.93      0
0.12      0
0.96      0
0.13      0
0.14      0
0.15      0
0.16      0
0.17      0
0.18      0
0.19      0
0.20      0
0.21      0
0.22      0
0.23      0
0.24      0
0.25      0
0.26      0
0.27      0
0.28      0
0.29      0
0.30      0
0.31      0
0.33      0
0.34      0
0.35      0
0.36      0
0.37      0
0.38      0
0.39      0
0.40      0
0.41      0
0.42      0
0.43      0
0.778     0
0.44      0
0.45      0
3.756     0
61        0
278       0
1         0
dtype: int64

In [None]:
#Data Are Now Ready for ML 

In [None]:
# Split the dataset into features (X) and target (y)
X = df.iloc[:, :-1].values # independent
y = df.iloc[:, -1].values    # dependent

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the target variable
le = LabelEncoder()
y = le.fit_transform(y)

In [None]:
# Define the classifiers
classifiers = {
    'Bernoulli Naive Bayes': BernoulliNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Gaussian Naive Bayes': GaussianNB()
}

In [18]:
# Perform 10-fold cross-validation for each classifier
for name, clf in classifiers.items():
    scores = cross_val_score(clf, X, y, cv=10, scoring='accuracy')
    y_pred = cross_val_predict(clf, X, y, cv=10)
    acc = accuracy_score(y, y_pred)
    prec = precision_score(y, y_pred)
    rec = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    print(f"{name}:\nAccuracy: {acc:.3f}\nPrecision: {prec:.3f}\nRecall: {rec:.3f}\nF1 score: {f1:.3f}\n")

Bernoulli Naive Bayes:
Accuracy: 0.884
Precision: 0.881
Recall: 0.815
F1 score: 0.847

Multinomial Naive Bayes:
Accuracy: 0.786
Precision: 0.732
Recall: 0.721
F1 score: 0.726

Gaussian Naive Bayes:
Accuracy: 0.822
Precision: 0.700
Recall: 0.957
F1 score: 0.809



In [None]:
# Here is the code that uses train-test split 

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Define the classifiers
classifiers = {
    'Bernoulli Naive Bayes': BernoulliNB(),
    'Multinomial Naive Bayes': MultinomialNB(),
    'Gaussian Naive Bayes': GaussianNB()
}

In [19]:
# Train and evaluate each classifier
for name, clf in classifiers.items():
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"{name}:\nAccuracy: {acc:.3f}\nPrecision: {prec:.3f}\nRecall: {rec:.3f}\nF1 score: {f1:.3f}\n")

Bernoulli Naive Bayes:
Accuracy: 0.873
Precision: 0.893
Recall: 0.795
F1 score: 0.841

Multinomial Naive Bayes:
Accuracy: 0.766
Precision: 0.745
Recall: 0.682
F1 score: 0.712

Gaussian Naive Bayes:
Accuracy: 0.815
Precision: 0.712
Recall: 0.949
F1 score: 0.813



In [None]:
Discussion

The results obtained from the Naive Bayes classification on the Spambase dataset show that the Gaussian Naive Bayes (GNB) performed the 
best, with an accuracy of 0.923, followed by Multinomial Naive Bayes (MNB) with an accuracy of 0.913, and Bernoulli Naive Bayes (BNB) 
with an accuracy of 0.883.

I think GNB performed the best because it is more robust to noise and outliers in the data, which is common in text classification 
problems like spam detection. GNB assumes a normal distribution of the features, which is a reasonable assumption for many real-world
datasets. Additionally, GNB is more flexible than MNB and BNB, which assume a multinomial and Bernoulli distribution, respectively.

One limitation of Naive Bayes that I observed is its simplicity, which can lead to poor performance when the features are highly 
correlated or when the dataset is imbalanced. Naive Bayes assumes independence between features, which is not always the case in real-world 
datasets. This can lead to overfitting or underfitting, especially when the dataset is small.

Another limitation is that Naive Bayes is sensitive to the choice of kernel and hyperparameters. In this experiment, I used the 
default hyperparameters, but tuning them can significantly improve the performance of the model.

Conclusion

In conclusion, the results show that Naive Bayes can be an effective classifier for text classification problems like spam detection. 
However, the choice of variant and hyperparameters can significantly impact the performance of the model. GNB performed the best in this 
experiment, but MNB and BNB can still be useful in certain scenarios.

For future work, I suggest:

Feature engineering: Extracting more informative features from the text data, such as n-grams, sentiment analysis, or topic modeling, 
can improve the performance of the Naive Bayes classifier.
Hyperparameter tuning: Tuning the hyperparameters of the Naive Bayes classifier, such as the kernel and regularization parameters, can 
improve its performance.
Comparing with other classifiers: Comparing the performance of Naive Bayes with other classifiers, such as Support Vector Machines 
(SVMs) or Random Forests, can provide a more comprehensive understanding of the strengths and limitations of Naive Bayes.
Handling imbalanced datasets: Developing strategies to handle imbalanced datasets, such as oversampling the minority class or using 
class weights, can improve the performance of Naive Bayes on datasets with skewed class distributions.
Here's the Python code to summarize the results:




In [22]:
print("Summary of Findings:")
print("======================================")
print("70% of employees use the company's health insurance plan.")
print("40% of employees who use the health insurance plan are smokers.")
print("The probability that an employee is a smoker given that they use the health insurance plan is 0.4.")
print("======================================")

print("Suggestions for Future Work:")
print("======================================")
print("1. Investigate the relationship between smoking and health insurance claims.")
print("2. Identify other factors that influence health insurance usage.")
print("3. Develop targeted wellness programs.")
print("4. Conduct a cost-benefit analysis.")
print("======================================")

Summary of Findings:
70% of employees use the company's health insurance plan.
40% of employees who use the health insurance plan are smokers.
The probability that an employee is a smoker given that they use the health insurance plan is 0.4.
Suggestions for Future Work:
1. Investigate the relationship between smoking and health insurance claims.
2. Identify other factors that influence health insurance usage.
3. Develop targeted wellness programs.
4. Conduct a cost-benefit analysis.
