In [10]:
import pandas as pd
from sklearn.preprocessing import OrdinalEncoder
from sklearn.naive_bayes import CategoricalNB
import numpy as np

# Read the play dataset from disk
file_path = "play.csv"
df = pd.read_csv(file_path)

# Class priors: share of rows carrying each label in the "play" column
total_samples = df.shape[0]
class_counts = df["play"].value_counts()
prior_yes, prior_no = (class_counts[label] / total_samples for label in ("Yes", "No"))

# Function to compute likelihoods
def compute_likelihood(feature, value, target_class, data=None, alpha=0.0):
    """Estimate P(feature == value | play == target_class) from a table.

    Args:
        feature: Name of the attribute column to inspect.
        value: Attribute value whose conditional probability is wanted.
        target_class: Label in the "play" column to condition on.
        data: DataFrame holding ``feature`` and "play" columns. When omitted,
            the module-level ``df`` is used, as in the original version.
        alpha: Additive (Laplace) smoothing strength. The default of 0.0
            gives the plain relative frequency. With ``alpha=1.0`` the
            estimate is ``(count + 1) / (class_size + n_categories)``, the
            form scikit-learn's ``CategoricalNB`` uses by default, so manual
            results become comparable with that model.

    Returns:
        The (optionally smoothed) conditional probability as a float.

    Raises:
        ValueError: If ``alpha`` is negative.
        ZeroDivisionError: If no row has ``play == target_class`` and
            ``alpha`` is 0, because the estimate is then undefined.
    """
    if alpha < 0:
        raise ValueError("alpha must be non-negative")

    frame = df if data is None else data
    in_class = frame.loc[frame["play"] == target_class, feature]

    # int() keeps the result a plain Python float rather than a numpy scalar
    matches = int((in_class == value).sum())
    # Distinct values of the attribute over the whole table (not just this class)
    n_categories = frame[feature].nunique()

    denominator = len(in_class) + alpha * n_categories
    if denominator == 0:
        raise ZeroDivisionError(
            f"no rows with play == {target_class!r}; "
            f"P({feature} = {value!r} | {target_class!r}) is undefined"
        )
    return (matches + alpha) / denominator

# Given test sample: <Rain, Cool, High, Strong>
test_sample = {"outlook": "Rain", "temp": "Cool", "humidity": "High", "wind": "Strong"}

# Single source of truth for the feature order, so the encoder columns and the
# encoded test row can never drift out of alignment with the sample dict.
feature_cols = list(test_sample)


def _joint_score(prior, likelihoods):
    """Return the prior multiplied by every per-feature likelihood.

    This is the unnormalized naive Bayes posterior. Iterating over the dict
    (instead of naming each key) keeps it correct if features are added to
    or removed from the test sample.
    """
    score = prior
    for likelihood in likelihoods.values():
        score = score * likelihood
    return score


# Per-feature likelihoods P(x_i | Yes) and P(x_i | No)
likelihoods_yes = {feature: compute_likelihood(feature, value, "Yes") for feature, value in test_sample.items()}
likelihoods_no = {feature: compute_likelihood(feature, value, "No") for feature, value in test_sample.items()}

# Unnormalized posteriors P(Yes) * prod P(x_i | Yes) and P(No) * prod P(x_i | No)
prob_yes_given_x = _joint_score(prior_yes, likelihoods_yes)
prob_no_given_x = _joint_score(prior_no, likelihoods_no)

# Normalize so the two posteriors P(Yes | X) and P(No | X) sum to 1
total_prob = prob_yes_given_x + prob_no_given_x
prob_yes_given_x /= total_prob
prob_no_given_x /= total_prob

# Print results
print("Class Prior Probabilities:")
print(f"P(Yes) = {prior_yes:.4f}, P(No) = {prior_no:.4f}\n")

print("Likelihoods for <Rain, Cool, High, Strong>:")
print("Given Yes:", likelihoods_yes)
print("Given No:", likelihoods_no, "\n")

# The printed heading is kept as-is; the values shown are posteriors P(class | X)
print("Class Conditional Probabilities:")
print(f"P(Yes | X) = {prob_yes_given_x:.4f}, P(No | X) = {prob_no_given_x:.4f}\n")
print("Prediction:", "Yes" if prob_yes_given_x > prob_no_given_x else "No")

# Validate using sklearn CategoricalNB
# NOTE(review): CategoricalNB applies additive smoothing (alpha=1.0 by default),
# whereas the manual figures above are plain relative frequencies, so the two
# sets of probabilities are expected to differ slightly in value.
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(df[feature_cols])
y_encoded = np.where(df["play"] == "Yes", 1, 0)

nb_model = CategoricalNB()
nb_model.fit(X_encoded, y_encoded)

# Encode the sample as a one-row DataFrame with the training column names;
# a bare list would make the DataFrame-fitted encoder warn about feature names.
test_sample_encoded = encoder.transform(pd.DataFrame([test_sample], columns=feature_cols))
predicted_class = nb_model.predict(test_sample_encoded)
predicted_probs = nb_model.predict_proba(test_sample_encoded)

print("\nValidation using sklearn CategoricalNB:")
print(f"Predicted Class: {'Yes' if predicted_class[0] == 1 else 'No'}")
print(f"Predicted Probabilities: P(Yes | X) = {predicted_probs[0][1]:.4f}, P(No | X) = {predicted_probs[0][0]:.4f}")


Class Prior Probabilities:
P(Yes) = 0.6429, P(No) = 0.3571

Likelihoods for <Rain, Cool, High, Strong>:
Given Yes: {'outlook': 0.3333333333333333, 'temp': 0.3333333333333333, 'humidity': 0.3333333333333333, 'wind': 0.3333333333333333}
Given No: {'outlook': 0.4, 'temp': 0.2, 'humidity': 0.8, 'wind': 0.6} 

Class Conditional Probabilities:
P(Yes | X) = 0.3666, P(No | X) = 0.6334

Prediction: No

Validation using sklearn CategoricalNB:
Predicted Class: No
Predicted Probabilities: P(Yes | X) = 0.4087, P(No | X) = 0.5913




In [11]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import OrdinalEncoder
from graphviz import Source
from IPython.display import SVG
from scipy.stats import entropy

# Load dataset: read the CSV named by file_path into the DataFrame df
file_path = "play.csv"
df = pd.read_csv(file_path)

# Function to calculate entropy
def calculate_entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    The distinct values are tallied and the tallies are handed to
    ``scipy.stats.entropy``, which normalizes them into probabilities.
    """
    frequencies = np.unique(column, return_counts=True)[1]
    return entropy(frequencies, base=2)

# Compute information gain for each attribute
def information_gain(df, feature, target):
    """Return the information gain from splitting ``df`` on ``feature``.

    Computed as the entropy of ``target`` over the whole frame minus the
    size-weighted entropy of ``target`` within each distinct value of
    ``feature``.
    """
    baseline = calculate_entropy(df[target])
    levels, level_counts = np.unique(df[feature], return_counts=True)
    n_rows = sum(level_counts)

    conditional = 0
    for level, count in zip(levels, level_counts):
        # Target labels restricted to the rows where the feature takes this value
        branch_labels = df[df[feature] == level][target]
        conditional += (count / n_rows) * calculate_entropy(branch_labels)

    return baseline - conditional

# `display` is needed to render the tree from the middle of the cell
from IPython.display import display

# Candidate attributes, in the column order used for encoding and plotting
feature_cols = ["outlook", "temp", "humidity", "wind"]

# Compute information gain for all attributes
info_gains = {feature: information_gain(df, feature, "play") for feature in feature_cols}

# Find the root node (attribute with max information gain)
root_node = max(info_gains, key=info_gains.get)

# Print information gain values
print("Information Gain for each attribute:")
for feature, gain in info_gains.items():
    print(f"{feature}: {gain:.4f}")
print(f"\nRoot node based on Information Gain: {root_node}\n")

# Encode categorical features for DecisionTreeClassifier
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(df[feature_cols])
y_encoded = np.where(df["play"] == "Yes", 1, 0)

# Train Decision Tree Classifier.
# random_state is fixed because scikit-learn permutes features at each split,
# so equally good splits could otherwise be chosen differently between runs.
id3_model = DecisionTreeClassifier(criterion="entropy", max_depth=2, random_state=42)
id3_model.fit(X_encoded, y_encoded)

# Classify test sample <Rain, Cool, High, Weak>.
# A one-row DataFrame with the training column names avoids the feature-name
# warning a bare list would trigger on the DataFrame-fitted encoder.
test_sample = encoder.transform(pd.DataFrame([["Rain", "Cool", "High", "Weak"]], columns=feature_cols))
predicted_class = id3_model.predict(test_sample)
print("Predicted Class for <Rain, Cool, High, Weak>:", "Yes" if predicted_class[0] == 1 else "No")

# Visualize Decision Tree.
# The SVG is not the last expression of the cell, so it must be passed to
# display() explicitly; otherwise nothing is rendered.
graph = Source(export_graphviz(id3_model, feature_names=feature_cols, class_names=["No", "Yes"], filled=True))
display(SVG(graph.pipe(format='svg')))

# Check if the root node matches.
# tree_.feature holds a negative sentinel for leaf nodes; guard against it so a
# root that is a leaf is not silently mapped to a feature via negative indexing.
root_feature_index = id3_model.tree_.feature[0]
scikit_root_node = feature_cols[root_feature_index] if root_feature_index >= 0 else None
print(f"Root node from Scikit DecisionTreeClassifier: {scikit_root_node}")
print("Match with computed root node:", root_node == scikit_root_node)


Information Gain for each attribute:
outlook: 0.2467
temp: 0.0292
humidity: 0.1518
wind: 0.0481

Root node based on Information Gain: outlook

Predicted Class for <Rain, Cool, High, Weak>: No
Root node from Scikit DecisionTreeClassifier: outlook
Match with computed root node: True




In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

def _fit_predict(model, X_train, y_train, X_test):
    """Fit ``model`` on the training split and return its predictions for ``X_test``."""
    model.fit(X_train, y_train)
    return model.predict(X_test)


def _print_metrics(accuracy_label, y_true, y_pred, **report_kwargs):
    """Print the accuracy after ``accuracy_label``, then the classification report."""
    print(accuracy_label, accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, **report_kwargs))


# Input files
diabetes_file = "diabetes.csv"
iris_file = "Iris.csv"

diabetes_df = pd.read_csv(diabetes_file)
iris_df = pd.read_csv(iris_file)

### Diabetes Prediction ###
# Features are every column but the last; the last column is the label
X_diabetes, y_diabetes = diabetes_df.iloc[:, :-1], diabetes_df.iloc[:, -1]
X_train_d, X_test_d, y_train_d, y_test_d = train_test_split(
    X_diabetes, y_diabetes, test_size=0.2, random_state=42
)

# Scale using statistics learned from the training split only
scaler = StandardScaler()
X_train_d = scaler.fit_transform(X_train_d)
X_test_d = scaler.transform(X_test_d)

# Gaussian Naïve Bayes and an entropy-based decision tree on the same split
nb_diabetes = GaussianNB()
y_pred_nb_d = _fit_predict(nb_diabetes, X_train_d, y_train_d, X_test_d)

dt_diabetes = DecisionTreeClassifier(criterion='entropy', max_depth=5, random_state=42)
y_pred_dt_d = _fit_predict(dt_diabetes, X_train_d, y_train_d, X_test_d)

print("Diabetes Prediction:")
_print_metrics("Naïve Bayes Accuracy:", y_test_d, y_pred_nb_d)
_print_metrics("Decision Tree Accuracy:", y_test_d, y_pred_dt_d)

### Iris Classification ###
# Features skip the first column (presumably a row id; confirm against the
# CSV) and the last column, which holds the species label
X_iris = iris_df.iloc[:, 1:-1]
le = LabelEncoder()
y_iris = le.fit_transform(iris_df.iloc[:, -1])  # species names -> integer labels
X_train_i, X_test_i, y_train_i, y_test_i = train_test_split(
    X_iris, y_iris, test_size=0.2, random_state=42
)

# Same two model families, trained on the unscaled iris features
nb_iris = GaussianNB()
y_pred_nb_i = _fit_predict(nb_iris, X_train_i, y_train_i, X_test_i)

dt_iris = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=42)
y_pred_dt_i = _fit_predict(dt_iris, X_train_i, y_train_i, X_test_i)

print("Iris Classification:")
_print_metrics("Naïve Bayes Accuracy:", y_test_i, y_pred_nb_i, target_names=le.classes_)
_print_metrics("Decision Tree Accuracy:", y_test_i, y_pred_dt_i, target_names=le.classes_)


Diabetes Prediction:
Naïve Bayes Accuracy: 0.7662337662337663
              precision    recall  f1-score   support

           0       0.83      0.80      0.81        99
           1       0.66      0.71      0.68        55

    accuracy                           0.77       154
   macro avg       0.75      0.75      0.75       154
weighted avg       0.77      0.77      0.77       154

Decision Tree Accuracy: 0.7857142857142857
              precision    recall  f1-score   support

           0       0.83      0.84      0.83        99
           1       0.70      0.69      0.70        55

    accuracy                           0.79       154
   macro avg       0.77      0.76      0.77       154
weighted avg       0.78      0.79      0.79       154

Iris Classification:
Naïve Bayes Accuracy: 1.0
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        10
Iris-versicolor       1.00      1.00      1.00         9
 Iris-virginica       