<a href="https://colab.research.google.com/github/nikhildhavale/pythonLearning/blob/main/Naive_BayesCategoricalNB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# CategoricalNB ---> Golf Dataset:

# -----------------------------------------------
# Naive Bayes on Weather / Play Golf Dataset
# - Categorical features (Outlook, Temperature, Humidity, Windy)
# - Target: Play Golf (Yes / No)  -> Binary classification
# - Uses CategoricalNB + OrdinalEncoder
# - Includes a helper function to predict for new inputs
#   e.g. ("Rainy", "Hot", "High", False)
# -----------------------------------------------

import numpy as np
import pandas as pd

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.naive_bayes import CategoricalNB

# 1. BUILD THE DATASET
# --------------------
# The 14 observations are written row by row (one tuple per day) and then
# transposed into the column-oriented dict that the DataFrame is built from.
_COLUMNS = ("Outlook", "Temperature", "Humidity", "Windy", "PlayGolf")
_ROWS = (
    ("Rainy",    "Hot",  "High",   False, "Yes"),
    ("Rainy",    "Hot",  "High",   True,  "No"),
    ("Overcast", "Hot",  "High",   False, "Yes"),
    ("Sunny",    "Mild", "High",   False, "Yes"),
    ("Sunny",    "Cool", "Normal", False, "Yes"),
    ("Sunny",    "Cool", "Normal", True,  "No"),
    ("Overcast", "Cool", "Normal", True,  "Yes"),
    ("Rainy",    "Mild", "High",   False, "No"),
    ("Rainy",    "Cool", "Normal", False, "Yes"),
    ("Sunny",    "Mild", "Normal", False, "Yes"),
    ("Rainy",    "Mild", "Normal", True,  "Yes"),
    ("Overcast", "Mild", "High",   True,  "Yes"),
    ("Overcast", "Hot",  "Normal", False, "Yes"),
    ("Sunny",    "Mild", "High",   True,  "No"),
)

# zip(*_ROWS) turns the rows into one tuple per column, in _COLUMNS order.
data = {name: list(column) for name, column in zip(_COLUMNS, zip(*_ROWS))}

df = pd.DataFrame(data)
print("Full dataset:\n")
print(df.to_string(index=False))


# 2. SEPARATE FEATURES AND TARGET
# -------------------------------
feature_columns = ["Outlook", "Temperature", "Humidity", "Windy"]
X_raw = df.loc[:, feature_columns].copy()
y_raw = df.loc[:, "PlayGolf"].copy()

# Windy is the only boolean column; cast it to the strings "True"/"False"
# so that every feature is a plain categorical string.
X_raw = X_raw.astype({"Windy": str})

print("\nUnique values per feature:")
for col, values in X_raw.items():
    print(f"{col}: {values.unique()}")


# 3. ENCODING (CATEGORICAL -> INTEGERS)
# -------------------------------------
# CategoricalNB works on integer category codes, so each feature column is
# mapped independently to the integers 0..n_categories-1 by OrdinalEncoder.
feature_encoder = OrdinalEncoder()
feature_encoder.fit(X_raw)
X_encoded = feature_encoder.transform(X_raw)

# The target strings ("Yes"/"No") are mapped to integer codes as well; the
# string <-> code mapping is learned here and kept in label_encoder.classes_.
label_encoder = LabelEncoder()
label_encoder.fit(y_raw)
y_encoded = label_encoder.transform(y_raw)

print("\nLabel classes:", label_encoder.classes_)
print("Encoded labels (first 10):", y_encoded[:10])


# 4. TRAIN CATEGORICAL NAIVE BAYES
# --------------------------------
# fit() returns the estimator itself, so construction and training can be
# chained. All 14 rows are used for training; there is no hold-out split.
clf = CategoricalNB().fit(X_encoded, y_encoded)
print("\nCategoricalNB model trained on full dataset.")


# 5. HELPER FUNCTION FOR PREDICTION
# ---------------------------------
def predict_play_golf(outlook, temperature, humidity, windy):
    """
    Predict PlayGolf for one set of weather conditions and print the result.

    The inputs are encoded with the module-level ``feature_encoder``, scored
    with the trained ``clf`` and the winning class is decoded back to its
    string label with ``label_encoder``.

    outlook: str, e.g. "Rainy", "Sunny", "Overcast"
    temperature: str, e.g. "Hot", "Mild", "Cool"
    humidity: str, e.g. "High", "Normal"
    windy: bool or str, e.g. False / True or "False" / "True"

    Returns:
        (pred_label, proba_dict) where pred_label is the decoded class label
        and proba_dict maps each class label to its predicted probability.

    Raises:
        ValueError: if the encoder rejects an input (for example a value it
            was not fitted on); the message lists the allowed values.
    """
    # str() turns both the bool False and the string "False" into "False",
    # which is the form the Windy column was trained on.
    windy_str = str(windy)

    # Create a single-row DataFrame with same columns as training features
    new_sample = pd.DataFrame(
        [[outlook, temperature, humidity, windy_str]],
        columns=["Outlook", "Temperature", "Humidity", "Windy"]
    )

    # Encode using the same feature encoder. The encoder's own error only
    # names the offending value, so re-raise with the accepted values.
    try:
        new_sample_encoded = feature_encoder.transform(new_sample)
    except ValueError as err:
        allowed = {
            name: list(categories)
            for name, categories in zip(new_sample.columns, feature_encoder.categories_)
        }
        raise ValueError(
            f"Could not encode input {new_sample.iloc[0].tolist()}; "
            f"allowed values per feature are {allowed}"
        ) from err

    # Predict class and probability
    pred_encoded = clf.predict(new_sample_encoded)[0]
    proba = clf.predict_proba(new_sample_encoded)[0]

    # Decode predicted label back to "Yes"/"No"
    pred_label = label_encoder.inverse_transform([pred_encoded])[0]

    # predict_proba columns follow clf.classes_, so decode exactly those
    # codes to get the class name that belongs to each probability.
    class_names = label_encoder.inverse_transform(clf.classes_)
    proba_dict = {cls_name: float(p) for cls_name, p in zip(class_names, proba)}

    print("\nInput conditions:")
    print(new_sample.to_string(index=False))

    print("\nPrediction:")
    print(f"PlayGolf = {pred_label}")
    print("\nClass probabilities:")
    for cls_name in class_names:
        print(f"P(PlayGolf = {cls_name} | features) = {proba_dict[cls_name]:.4f}")

    # Also return in case you want to use programmatically
    return pred_label, proba_dict


# 6. EXAMPLE PREDICTION (like your example: Rainy, Hot, High, False)
# -------------------------------------------------------------------
# Keyword arguments make it explicit which value feeds which feature.
example_label, example_proba = predict_play_golf(
    outlook="Rainy",
    temperature="Hot",
    humidity="High",
    windy="False",
)


# 7. INTERACTIVE PREDICTION (RUNS WHEN THE FILE IS EXECUTED DIRECTLY)
# --------------------------------------------------------------------
if __name__ == "__main__":
    print("\n=== Interactive prediction ===")
    # The training categories are single capitalised words ("Sunny", "Hot",
    # "High"), so strip + capitalize lets inputs such as " sunny" or "HOT"
    # match them, in the same spirit as the Windy normalisation below.
    outlook_in = input("Outlook (Sunny/Overcast/Rainy): ").strip().capitalize()
    temp_in = input("Temperature (Hot/Mild/Cool): ").strip().capitalize()
    humidity_in = input("Humidity (High/Normal): ").strip().capitalize()
    windy_in = input("Windy (True/False): ")
    # Anything other than "true" (ignoring case and surrounding whitespace)
    # is treated as not windy.
    windy_bool = windy_in.strip().lower() == "true"

    predict_play_golf(outlook_in, temp_in, humidity_in, windy_bool)

Full dataset:

 Outlook Temperature Humidity  Windy PlayGolf
   Rainy         Hot     High  False      Yes
   Rainy         Hot     High   True       No
Overcast         Hot     High  False      Yes
   Sunny        Mild     High  False      Yes
   Sunny        Cool   Normal  False      Yes
   Sunny        Cool   Normal   True       No
Overcast        Cool   Normal   True      Yes
   Rainy        Mild     High  False       No
   Rainy        Cool   Normal  False      Yes
   Sunny        Mild   Normal  False      Yes
   Rainy        Mild   Normal   True      Yes
Overcast        Mild     High   True      Yes
Overcast         Hot   Normal  False      Yes
   Sunny        Mild     High   True       No

Unique values per feature:
Outlook: ['Rainy' 'Overcast' 'Sunny']
Temperature: ['Hot' 'Mild' 'Cool']
Humidity: ['High' 'Normal']
Windy: ['False' 'True']

Label classes: ['No' 'Yes']
Encoded labels (first 10): [1 0 1 1 1 0 1 0 1 1]

CategoricalNB model trained on full dataset.

Input conditions: