<a href="https://colab.research.google.com/github/madhumitha781/data_science/blob/main/Naive%20bayes%20.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Play Tennis Dataset (Classic for Weather Prediction)

In [None]:
from collections import defaultdict

# Training rows: four weather attributes followed by the class label.
data = [
    ["Sunny", "Hot", "High", "Weak", "No"],
    ["Sunny", "Hot", "High", "Strong", "No"],
    ["Overcast", "Hot", "High", "Weak", "Yes"],
    ["Rain", "Mild", "High", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Cool", "Normal", "Strong", "No"],
    ["Overcast", "Cool", "Normal", "Strong", "Yes"],
    ["Sunny", "Mild", "High", "Weak", "No"],
    ["Sunny", "Cool", "Normal", "Weak", "Yes"],
    ["Rain", "Mild", "Normal", "Weak", "Yes"],
]

# Attribute names, in the same order as the leading columns of each row.
features = ["Outlook", "Temperature", "Humidity", "Wind"]

# Unlabelled observation to classify.
test_sample = ["Sunny", "Cool", "High", "Strong"]

# label_counts[label]                    -> number of rows with that label
# feature_counts[label, feature][value]  -> rows with that label and value
label_counts = defaultdict(int)
feature_counts = defaultdict(lambda: defaultdict(int))

for *values, outcome in data:
    label_counts[outcome] += 1
    for name, value in zip(features, values):
        feature_counts[outcome, name][value] += 1


def predict(sample):
    """Classify ``sample`` with a categorical Naive Bayes model.

    Reads the module-level training tallies: ``label_counts`` (rows per
    class), ``feature_counts`` (value counts keyed by ``(label, feature)``)
    and ``features`` (feature names, positionally aligned with ``sample``).
    None of them is modified.

    Every conditional probability uses add-one (Laplace) smoothing:
    ``(count(value, label) + 1) / (count(label) + k)``, where ``k`` is the
    number of distinct values of that feature seen anywhere in the training
    data. Using the same ``k`` for every class keeps the smoothed
    probabilities comparable between classes; a value never seen for a
    class simply contributes a count of 0.

    Args:
        sample: Feature values, in the same order as ``features``.

    Returns:
        A ``(predicted_label, scores)`` tuple. ``scores`` maps each label to
        its unnormalised posterior (class prior times the likelihoods).

    Raises:
        ValueError: If ``label_counts`` is empty (nothing to choose from).
    """
    total = sum(label_counts.values())

    # Distinct values per feature, pooled over all classes. Entries with a
    # zero count are skipped: a plain lookup on the nested defaultdict
    # creates such entries, and they are not real observations.
    categories = {}
    for (_, feature), counts in feature_counts.items():
        seen = categories.setdefault(feature, set())
        seen.update(value for value, n in counts.items() if n > 0)

    results = {}
    for label, n_label in label_counts.items():
        prob = n_label / total  # class prior
        for i, val in enumerate(sample):
            feature = features[i]
            # .get() instead of [] so that scoring never inserts keys into
            # the defaultdict-based training counts.
            counts = feature_counts.get((label, feature), {})
            n_categories = len(categories.get(feature, ()))
            prob *= (counts.get(val, 0) + 1) / (n_label + n_categories)
        results[label] = prob

    return max(results, key=results.get), results


# Classify the held-out sample and show the score of every class.
prediction, probabilities = predict(test_sample)
sample_by_feature = dict(zip(features, test_sample))
print("Test Sample:", sample_by_feature)
print("Predicted Class:", prediction)
print("Probabilities:", probabilities)


Test Sample: {'Outlook': 'Sunny', 'Temperature': 'Cool', 'Humidity': 'High', 'Wind': 'Strong'}
Predicted Class: No
Probabilities: {'No': 0.025396825396825393, 'Yes': 0.005555555555555555}


2. SMS Spam Collection Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score


# Read the SMS corpus; only columns v1 (class) and v2 (text) are used.
df = pd.read_csv("/content/spam.csv", encoding='latin-1')
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Encode the class as an integer: ham -> 0, spam -> 1.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)

# Hold out 20% of the messages for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    df['message'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# The TF-IDF vectorizer is fitted on the training messages only; the test
# messages are transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Multinomial Naive Bayes on the TF-IDF features; fit() returns the estimator.
model = MultinomialNB().fit(X_train_vec, y_train)

y_pred = model.predict(X_test_vec)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)


Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       965
           1       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



3. Iris Dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd


# Load the Iris measurements, the integer class ids and the class names.
iris = load_iris()
X, y = iris.data, iris.target
target_names = iris.target_names

# Hold out 30% of the samples for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Gaussian Naive Bayes on the raw features; fit() returns the estimator.
model = GaussianNB().fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
# target_names makes the report show species names instead of class ids.
report = classification_report(y_test, y_pred, target_names=target_names)
print("\nClassification Report:\n", report)


Accuracy: 0.9777777777777777

Classification Report:
               precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        19
  versicolor       1.00      0.92      0.96        13
   virginica       0.93      1.00      0.96        13

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.97        45
weighted avg       0.98      0.98      0.98        45



4. Titanic Dataset

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the Titanic training data and keep only the columns used for modelling.
df = pd.read_csv('/content/train.csv')

# .copy() makes the column subset an independent frame, so the column
# assignments below unambiguously modify this frame.
df = df[['Pclass', 'Sex', 'Age', 'Fare', 'Survived']].copy()

# Replace missing ages with the mean age. The result is assigned back to the
# column instead of calling fillna(..., inplace=True) on df['Age']: that
# chained in-place form acts on an intermediate object, triggers a pandas
# FutureWarning, and no longer updates df from pandas 3.0 on, which would
# leave NaN values in 'Age'.
# NOTE: the mean is taken over the whole dataset, before the train/test split.
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Encode the 'Sex' strings as integers.
le = LabelEncoder()
df['Sex'] = le.fit_transform(df['Sex'])

X = df[['Pclass', 'Sex', 'Age', 'Fare']]
y = df['Survived']

# Hold out 20% of the passengers for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = GaussianNB()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.7597765363128491

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.79      0.79       105
           1       0.71      0.72      0.71        74

    accuracy                           0.76       179
   macro avg       0.75      0.75      0.75       179
weighted avg       0.76      0.76      0.76       179



The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)


5. Weather Dataset (Simplified)

In [None]:
from collections import defaultdict

# Training rows: three weather attributes followed by the class label.
data = [
    ["Sunny", "Hot", False, "No"],
    ["Overcast", "Cool", True, "Yes"],
    ["Rainy", "Mild", False, "Yes"],
]

# Attribute names, in the same order as the leading columns of each row.
features = ["Outlook", "Temp", "Windy"]
# Unlabelled observation to classify.
test_sample = ["Rainy", "Mild", False]

# label_counts[label]                    -> number of rows with that label
# feature_counts[label, feature][value]  -> rows with that label and value
label_counts = defaultdict(int)
feature_counts = defaultdict(lambda: defaultdict(int))

for *values, outcome in data:
    label_counts[outcome] += 1
    for name, value in zip(features, values):
        feature_counts[outcome, name][value] += 1


def predict(sample):
    """Classify ``sample`` with a categorical Naive Bayes model.

    Reads the module-level training tallies: ``label_counts`` (rows per
    class), ``feature_counts`` (value counts keyed by ``(label, feature)``)
    and ``features`` (feature names, positionally aligned with ``sample``).
    None of them is modified.

    Every conditional probability uses add-one (Laplace) smoothing:
    ``(count(value, label) + 1) / (count(label) + k)``, where ``k`` is the
    number of distinct values of that feature seen anywhere in the training
    data. Using the same ``k`` for every class keeps the smoothed
    probabilities comparable between classes; a value never seen for a
    class simply contributes a count of 0.

    Args:
        sample: Feature values, in the same order as ``features``.

    Returns:
        A ``(predicted_label, scores)`` tuple. ``scores`` maps each label to
        its unnormalised posterior (class prior times the likelihoods).

    Raises:
        ValueError: If ``label_counts`` is empty (nothing to choose from).
    """
    total = sum(label_counts.values())

    # Distinct values per feature, pooled over all classes. Entries with a
    # zero count are skipped: a plain lookup on the nested defaultdict
    # creates such entries, and they are not real observations.
    categories = {}
    for (_, feature), counts in feature_counts.items():
        seen = categories.setdefault(feature, set())
        seen.update(value for value, n in counts.items() if n > 0)

    results = {}
    for label, n_label in label_counts.items():
        prob = n_label / total  # class prior
        for i, val in enumerate(sample):
            feature = features[i]
            # .get() instead of [] so that scoring never inserts keys into
            # the defaultdict-based training counts.
            counts = feature_counts.get((label, feature), {})
            n_categories = len(categories.get(feature, ()))
            prob *= (counts.get(val, 0) + 1) / (n_label + n_categories)
        results[label] = prob

    return max(results, key=results.get), results


# Score the unlabelled sample and report the result for each class.
prediction, probabilities = predict(test_sample)
sample_by_feature = dict(zip(features, test_sample))
print("Test Sample:", sample_by_feature)
print("Predicted Class:", prediction)
print("Probabilities:", probabilities)


Test Sample: {'Outlook': 'Rainy', 'Temp': 'Mild', 'Windy': False}
Predicted Class: Yes
Probabilities: {'No': 0.037037037037037035, 'Yes': 0.08333333333333333}
