In [2]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Simple dataset: Height (cm) and Weight (kg) to predict Gender
# Format: [Height, Weight]
X = np.array([
    [160, 55],  # Female
    [165, 60],  # Female
    [170, 65],  # Male
    [175, 70],  # Male
    [180, 75],  # Male
    [155, 50],  # Female
    [162, 58],  # Female
    [172, 68],  # Male
])

# Labels: 0 for Female, 1 for Male
y = np.array([0, 0, 1, 1, 1, 0, 0, 1])

# Create and train the Naive Bayes classifier
model = GaussianNB()
model.fit(X, y)

# Function to predict gender and explain the process
def predict_and_explain(height, weight):
    # Make prediction
    features = np.array([[height, weight]])
    prediction = model.predict(features)[0]
    gender = "Male" if prediction == 1 else "Female"
    
    # Explain the process
    print(f"Predicting gender for Height: {height} cm, Weight: {weight} kg")
    print(f"Prediction: {gender}")
    print("\nExplanation:")
    
    # Calculate and show probabilities for each class
    class_probs = model.predict_proba(features)[0]
    print(f"Probability of being Female: {class_probs[0]:.2f}")
    print(f"Probability of being Male: {class_probs[1]:.2f}")
    
    # Show mean and variance for each feature in each class
    for i, class_name in enumerate(["Female", "Male"]):
        print(f"\n{class_name} class statistics:")
        for j, feature in enumerate(["Height", "Weight"]):
            mean = model.theta_[i][j]
            var = model.var_[i][j]
            print(f"  {feature} - Mean: {mean:.2f}, Variance: {var:.2f}")

# Test the model with explanations
predict_and_explain(163, 57)  # Should predict Female
print("\n" + "="*50 + "\n")
predict_and_explain(178, 72)  # Should predict Male

Predicting gender for Height: 163 cm, Weight: 57 kg
Prediction: Female

Explanation:
Probability of being Female: 1.00
Probability of being Male: 0.00

Female class statistics:
  Height - Mean: 160.50, Variance: 13.25
  Weight - Mean: 55.75, Variance: 14.19

Male class statistics:
  Height - Mean: 174.25, Variance: 14.19
  Weight - Mean: 69.50, Variance: 13.25


Predicting gender for Height: 178 cm, Weight: 72 kg
Prediction: Male

Explanation:
Probability of being Female: 0.00
Probability of being Male: 1.00

Female class statistics:
  Height - Mean: 160.50, Variance: 13.25
  Weight - Mean: 55.75, Variance: 14.19

Male class statistics:
  Height - Mean: 174.25, Variance: 14.19
  Weight - Mean: 69.50, Variance: 13.25


In [46]:
import numpy as np
from scipy.stats import norm

# Simplified dataset (using the same data as before)
# Format: [Height, Weight]
X = np.array([
    [160, 55],  # Female
    [165, 60],  # Female
    [170, 65],  # Male
    [175, 70],  # Male
    [180, 75],  # Male
    [155, 50],  # Female
    [162, 58],  # Female
    [172, 68],  # Male
])

# Labels: 0 for Female, 1 for Male
y = np.array([0, 0, 1, 1, 1, 0, 0, 1])

# Calculate mean and variance for each feature in each class
female_data = X[y == 0]
male_data = X[y == 1]

female_mean = np.mean(female_data, axis=0)
female_var = np.var(female_data, axis=0)
male_mean = np.mean(male_data, axis=0)
male_var = np.var(male_data, axis=0)

# Prior probabilities
p_female = np.sum(y == 0) / len(y)
p_male = np.sum(y == 1) / len(y)

def predict_probability(height, weight):
    # Calculate likelihood for female
    p_height_female = norm.pdf(height, female_mean[0], np.sqrt(female_var[0]))
    p_weight_female = norm.pdf(weight, female_mean[1], np.sqrt(female_var[1]))
    print("p_height_female : ",p_height_female, "p_weight_female: ",p_weight_female)
    likelihood_female = p_height_female * p_weight_female * p_female

    # Calculate likelihood for male
    p_height_male = norm.pdf(height, male_mean[0], np.sqrt(male_var[0]))
    p_weight_male = norm.pdf(weight, male_mean[1], np.sqrt(male_var[1]))
    print("p_height_male: ",p_height_male, "p_weight_male: ",p_weight_male)
    likelihood_male = p_height_male * p_weight_male * p_male

    # Normalize probabilities
    total = likelihood_female + likelihood_male
    prob_female = likelihood_female / total
    prob_male = likelihood_male / total
    print("prob_female: ",f"{prob_female:.2f}","%")
    print("prob_male: ",f"{prob_male:.2f}","%")
    return prob_female, prob_male

# Example prediction
height, weight = 163, 57
female_prob, male_prob = predict_probability(height, weight)

print(f"Prediction for Height: {height} cm, Weight: {weight} kg")
print(f"Probability of being Female: {female_prob:.4f}")
print(f"Probability of being Male: {male_prob:.4f}")
print(f"Predicted class: {'Female' if female_prob > male_prob else 'Male'}")

# Show calculation steps
print("\nCalculation Steps:")
print(f"1. Female mean - Height: {female_mean[0]:.2f}, Weight: {female_mean[1]:.2f}")
print(f"   Female variance - Height: {female_var[0]:.2f}, Weight: {female_var[1]:.2f}")
print(f"2. Male mean - Height: {male_mean[0]:.2f}, Weight: {male_mean[1]:.2f}")
print(f"   Male variance - Height: {male_var[0]:.2f}, Weight: {male_var[1]:.2f}")
print(f"3. P(Female) = {p_female:.2f}, P(Male) = {p_male:.2f}")
print(f"4. P(Height|Female) = {norm.pdf(height, female_mean[0], np.sqrt(female_var[0])):.4f}")
print(f"   P(Weight|Female) = {norm.pdf(weight, female_mean[1], np.sqrt(female_var[1])):.4f}")
print(f"5. P(Height|Male) = {norm.pdf(height, male_mean[0], np.sqrt(male_var[0])):.4f}")
print(f"   P(Weight|Male) = {norm.pdf(weight, male_mean[1], np.sqrt(male_var[1])):.4f}")

p_height_female :  0.08657134857458065 p_weight_female:  0.10024027147772196
p_height_male:  0.001224195232747188 p_weight_male:  0.00030137243740354123
prob_female:  1.00 %
prob_male:  0.00 %
Prediction for Height: 163 cm, Weight: 57 kg
Probability of being Female: 1.0000
Probability of being Male: 0.0000
Predicted class: Female

Calculation Steps:
1. Female mean - Height: 160.50, Weight: 55.75
   Female variance - Height: 13.25, Weight: 14.19
2. Male mean - Height: 174.25, Weight: 69.50
   Male variance - Height: 14.19, Weight: 13.25
3. P(Female) = 0.50, P(Male) = 0.50
4. P(Height|Female) = 0.0866
   P(Weight|Female) = 0.1002
5. P(Height|Male) = 0.0012
   P(Weight|Male) = 0.0003


In [86]:
X_encoded

Unnamed: 0,age,income,student,credit_rating
0,2,1,1,1
1,2,1,1,0
2,0,0,0,1
3,1,2,0,1
4,1,1,0,1
5,1,1,0,0
6,0,0,0,0
7,2,2,0,1
8,2,1,0,1
9,1,2,0,1


In [88]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Create a sample dataset
data = {
    'age': ['young', 'young', 'middle', 'senior', 'senior', 'senior', 'middle', 'young', 'young', 'senior'],
    'income': ['low', 'low', 'high', 'medium', 'low', 'low', 'high', 'medium', 'low', 'medium'],
    'student': ['yes', 'yes', 'no', 'no', 'no', 'no', 'no', 'no', 'no', 'no'],
    'credit_rating': ['fair', 'excellent', 'fair', 'fair', 'fair', 'excellent', 'excellent', 'fair', 'fair', 'fair'],
    'buys_car': ['no', 'no', 'yes', 'yes', 'yes', 'no', 'yes', 'no', 'yes', 'yes']
}

# Convert the data to a pandas DataFrame
df = pd.DataFrame(data)

# Separate features and target
X = df.drop('buys_car', axis=1)
y = df['buys_car']

# Initialize LabelEncoder for features
le_features = {}
X_encoded = X.copy()

# Fit and transform each feature column
for column in X.columns:
    le_features[column] = LabelEncoder()
    X_encoded[column] = le_features[column].fit_transform(X[column])

# Initialize LabelEncoder for target and fit
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y_encoded, test_size=0.3, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_classifier = CategoricalNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

# Example of predicting a new instance
new_instance = pd.DataFrame([['young', 'high', 'yes', 'fair']], columns=X.columns)
new_instance_encoded = new_instance.copy()

# Transform the new instance (without fitting)
for column in new_instance.columns:
    new_instance_encoded[column] = le_features[column].transform(new_instance[column])

prediction = nb_classifier.predict(new_instance_encoded)
probabilities = nb_classifier.predict_proba(new_instance_encoded)

print("\nNew instance:", new_instance.values[0])
print(f"Encoded new instance: {new_instance_encoded.values[0]}")
print(f"Prediction for new instance: {le_target.inverse_transform(prediction)[0]}")
print("\nProbabilities for new instance:")
for class_name, prob in zip(le_target.classes_, probabilities[0]):
    print(f"{class_name}: {prob:.4f}")

# Print encodings
print("\nEncodings:")
for column in X.columns:
    print(f"\n{column}:")
    for category, code in zip(le_features[column].classes_, le_features[column].transform(le_features[column].classes_)):
        print(f"  {category}: {code}")

Accuracy: 0.33

Classification Report:
              precision    recall  f1-score   support

          no       0.50      0.50      0.50         2
         yes       0.00      0.00      0.00         1

    accuracy                           0.33         3
   macro avg       0.25      0.25      0.25         3
weighted avg       0.33      0.33      0.33         3


New instance: ['young' 'high' 'yes' 'fair']
Encoded new instance: [2 0 1 1]
Prediction for new instance: no

Probabilities for new instance:
no: 0.7901
yes: 0.2099

Encodings:

age:
  middle: 0
  senior: 1
  young: 2

income:
  high: 0
  low: 1
  medium: 2

student:
  no: 0
  yes: 1

credit_rating:
  excellent: 0
  fair: 1


In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score, classification_report

# Create the dataset based on the frequency tables
data = []

# Sunny data
data.extend([['Sunny', 'Yes', 'Summer', 'Yes']] * 3)
data.extend([['Sunny', 'No', 'Summer', 'No']] * 4)
data.extend([['Sunny', 'Yes', 'Winter', 'Yes']] * 6)
data.extend([['Sunny', 'No', 'Winter', 'No']] * 1)

# Windy data
data.extend([['Windy', 'Yes', 'Monsoon', 'Yes']] * 6)
data.extend([['Windy', 'No', 'Monsoon', 'No']] * 2)
data.extend([['Windy', 'Yes', 'Winter', 'Yes']] * 3)
data.extend([['Windy', 'No', 'Winter', 'No']] * 3)

# Season data (for remaining entries)
data.extend([['Sunny', 'Yes', 'Monsoon', 'Yes']] * 4)
data.extend([['Windy', 'Yes', 'Summer', 'Yes']] * 2)
data.extend([['Windy', 'No', 'Summer', 'No']] * 2)

# Convert to DataFrame
df = pd.DataFrame(data, columns=['Weather', 'Humidity', 'Season', 'Play'])

# Display the first few rows and data info
print(df.head())
print(df.info())

# Separate features and target
X = df[['Weather', 'Humidity', 'Season']]
y = df['Play']

# Encode categorical features
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize and train the Naive Bayes classifier
nb_classifier = CategoricalNB()
nb_classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"\nAccuracy: {accuracy:.2f}")

# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Function to predict new instances
def predict_play(weather, humidity, season):
    new_instance = encoder.transform([[weather, humidity, season]])
    prediction = nb_classifier.predict(new_instance)
    probability = nb_classifier.predict_proba(new_instance)
    return prediction[0], probability[0]

# Example predictions
print("\nExample Predictions:")
examples = [
    ('Sunny', 'Yes', 'Summer'),
    ('Windy', 'No', 'Winter'),
    ('Sunny', 'No', 'Monsoon')
]

for weather, humidity, season in examples:
    prediction, probability = predict_play(weather, humidity, season)
    print(f"Weather: {weather}, Humidity: {humidity}, Season: {season}")
    print(f"Prediction: {prediction}")
    print(f"Probability (No, Yes): {probability}")
    print()

# Print the encoding for each feature
print("Feature Encodings:")
for i, feature in enumerate(X.columns):
    print(f"\n{feature}:")
    for category, code in enumerate(encoder.categories_[i]):
        print(f"  {category}: {code}")

  Weather Humidity  Season Play
0   Sunny      Yes  Summer  Yes
1   Sunny      Yes  Summer  Yes
2   Sunny      Yes  Summer  Yes
3   Sunny       No  Summer   No
4   Sunny       No  Summer   No
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36 entries, 0 to 35
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Weather   36 non-null     object
 1   Humidity  36 non-null     object
 2   Season    36 non-null     object
 3   Play      36 non-null     object
dtypes: object(4)
memory usage: 1.2+ KB
None

Accuracy: 1.00

Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00         4
         Yes       1.00      1.00      1.00         4

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8


Example Predictions:
Weather: Sunny, Humidity: Yes, Season: Summe



In [24]:
import pandas as pd
import numpy as np

# Recreate the dataset
data = []
data.extend([['Sunny', 'Yes', 'Summer', 'Yes']] * 3)
data.extend([['Sunny', 'No', 'Summer', 'No']] * 4)
data.extend([['Sunny', 'Yes', 'Winter', 'Yes']] * 6)
data.extend([['Sunny', 'No', 'Winter', 'No']] * 1)
data.extend([['Windy', 'Yes', 'Monsoon', 'Yes']] * 6)
data.extend([['Windy', 'No', 'Monsoon', 'No']] * 2)
data.extend([['Windy', 'Yes', 'Winter', 'Yes']] * 3)
data.extend([['Windy', 'No', 'Winter', 'No']] * 3)
data.extend([['Sunny', 'Yes', 'Monsoon', 'Yes']] * 4)
data.extend([['Windy', 'Yes', 'Summer', 'Yes']] * 2)
data.extend([['Windy', 'No', 'Summer', 'No']] * 2)

df = pd.DataFrame(data, columns=['Weather', 'Humidity', 'Season', 'Play'])

# Function to create frequency table
def create_frequency_table(df, feature):
    return pd.crosstab(df[feature], df['Play'])

# Function to create likelihood table
def create_likelihood_table(freq_table):
    return freq_table.div(freq_table.sum(axis=1), axis=0)

# Create and display frequency tables
features = ['Weather', 'Humidity', 'Season']
freq_tables = {feature: create_frequency_table(df, feature) for feature in features}

print("Frequency Tables:")
for feature, table in freq_tables.items():
    print(f"\n{feature}:")
    print(table)

# Create and display likelihood tables
likelihood_tables = {feature: create_likelihood_table(table) for feature, table in freq_tables.items()}

print("\nLikelihood Tables:")
for feature, table in likelihood_tables.items():
    print(f"\n{feature}:")
    print(table)

# Calculate prior probabilities
prior_prob = df['Play'].value_counts(normalize=True)
print("\nPrior Probabilities:")
print(prior_prob)

# Example of using these tables for prediction
def naive_bayes_predict(weather, humidity, season):
    p_yes = prior_prob['Yes']
    p_no = prior_prob['No']
    
    p_yes *= likelihood_tables['Weather'].loc[weather, 'Yes']
    p_yes *= likelihood_tables['Humidity'].loc[humidity, 'Yes']
    p_yes *= likelihood_tables['Season'].loc[season, 'Yes']
    
    p_no *= likelihood_tables['Weather'].loc[weather, 'No']
    p_no *= likelihood_tables['Humidity'].loc[humidity, 'No']
    p_no *= likelihood_tables['Season'].loc[season, 'No']
    
    total = p_yes + p_no
    return {'Yes': p_yes / total, 'No': p_no / total}

# Example prediction
example = ('Sunny', 'Yes', 'Summer')
prediction = naive_bayes_predict(*example)
print(f"\nPrediction for {example}:")
print(f"Probability of Yes: {prediction['Yes']:.4f}")
print(f"Probability of No: {prediction['No']:.4f}")

Frequency Tables:

Weather:
Play     No  Yes
Weather         
Sunny     5   13
Windy     7   11

Humidity:
Play      No  Yes
Humidity         
No        12    0
Yes        0   24

Season:
Play     No  Yes
Season          
Monsoon   2   10
Summer    6    5
Winter    4    9

Likelihood Tables:

Weather:
Play           No       Yes
Weather                    
Sunny    0.277778  0.722222
Windy    0.388889  0.611111

Humidity:
Play       No  Yes
Humidity          
No        1.0  0.0
Yes       0.0  1.0

Season:
Play           No       Yes
Season                     
Monsoon  0.166667  0.833333
Summer   0.545455  0.454545
Winter   0.307692  0.692308

Prior Probabilities:
Play
Yes    0.666667
No     0.333333
Name: proportion, dtype: float64

Prediction for ('Sunny', 'Yes', 'Summer'):
Probability of Yes: 1.0000
Probability of No: 0.0000
