In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

##### Problem 1 - Predict the class label of X using Naïve Bayes classification

In [3]:
# Toy training set for the hand-worked Naive Bayes example, written one
# record per row so each observation can be read across.
toy_columns = ['ID', 'A1', 'A2', 'A3', 'Class']
toy_records = [
    (1, 'Medium', 'Mild', 'East', 'Y'),
    (2, 'Low', 'Mild', 'East', 'Y'),
    (3, 'High', 'Mild', 'East', 'N'),
    (4, 'Low', 'Mild', 'West', 'N'),
    (5, 'Low', 'Cool', 'East', 'Y'),
    (6, 'Medium', 'Hot', 'West', 'N'),
    (7, 'High', 'Hot', 'East', 'Y'),
    (8, 'Low', 'Cool', 'West', 'N'),
    (9, 'Medium', 'Hot', 'East', 'Y'),
    (10, 'High', 'Cool', 'East', 'Y'),
    (11, 'Medium', 'Mild', 'East', 'Y'),
    (12, 'Low', 'Cool', 'West', 'N'),
]

# Transpose the records into the column-oriented dict that seeds the frame.
data = {name: [record[position] for record in toy_records]
        for position, name in enumerate(toy_columns)}

nbdf = pd.DataFrame(data)
nbdf.head(12)

Unnamed: 0,ID,A1,A2,A3,Class
0,1,Medium,Mild,East,Y
1,2,Low,Mild,East,Y
2,3,High,Mild,East,N
3,4,Low,Mild,West,N
4,5,Low,Cool,East,Y
5,6,Medium,Hot,West,N
6,7,High,Hot,East,Y
7,8,Low,Cool,West,N
8,9,Medium,Hot,East,Y
9,10,High,Cool,East,Y


In [6]:
# Number of training records; the denominator for priors and split weights.
total_values = nbdf.shape[0]
print(total_values)

12


In [8]:
# Class-by-value contingency tables, one per attribute: rows are the class
# labels, columns are the attribute's values, missing combinations count as 0.
count_A1, count_A2, count_A3 = (
    nbdf.groupby(['Class', feature]).size().unstack(fill_value=0)
    for feature in ('A1', 'A2', 'A3')
)

In [10]:
# Show the three contingency tables one after another.
print(count_A1, count_A2, count_A3, sep="\n")

A1     High  Low  Medium
Class                   
N         1    3       1
Y         2    2       3
A2     Cool  Hot  Mild
Class                 
N         2    1     2
Y         2    2     3
A3     East  West
Class            
N         1     4
Y         7     0


In [12]:
# Class priors P(Ci): each label's share of the training records.
count_class = nbdf['Class'].value_counts()
prior_Y, prior_N = (count_class[label] / total_values for label in ('Y', 'N'))

print("P(C1) = P(Y) = ", prior_Y,
      "\nP(C2) = P(N) = ", prior_N)

P(C1) = P(Y) =  0.5833333333333334 
P(C2) = P(N) =  0.4166666666666667


In [14]:
# Per-attribute likelihoods P(x_k | Ci) for the attribute values looked up
# here (A1=Medium, A2=Cool, A3=East): the class/value co-occurrence count
# divided by the number of records in that class.
n_class_Y = count_class['Y']
n_class_N = count_class['N']

A1Med_Y = count_A1.at['Y', 'Medium'] / n_class_Y
A2Cool_Y = count_A2.at['Y', 'Cool'] / n_class_Y
A3East_Y = count_A3.at['Y', 'East'] / n_class_Y

A1Med_N = count_A1.at['N', 'Medium'] / n_class_N
A2Cool_N = count_A2.at['N', 'Cool'] / n_class_N
A3East_N = count_A3.at['N', 'East'] / n_class_N

In [16]:
# P(X | Ci): product of the three per-attribute likelihoods for each class
# (the naive conditional-independence assumption).
prob_X_Y, prob_X_N = (
    A1Med_Y * A2Cool_Y * A3East_Y,
    A1Med_N * A2Cool_N * A3East_N,
)

likelihood_report = ("P(X|class=Y) = ", prob_X_Y, "\nP(X|class=N) = ", prob_X_N)
print(*likelihood_report)

P(X|class=Y) =  0.12244897959183672 
P(X|class=N) =  0.016000000000000004


In [18]:
# Unnormalised posteriors P(X | Ci) * P(Ci); the larger one identifies the
# class Naive Bayes assigns to X.
final_prob_Y, final_prob_N = prob_X_Y * prior_Y, prob_X_N * prior_N

posterior_report = (
    "P(X|class=Y)*P(class=Y) = ", final_prob_Y,
    "\nP(X|class=N)*P(class=N) = ", final_prob_N,
)
print(*posterior_report)

P(X|class=Y)*P(class=Y) =  0.07142857142857142 
P(X|class=N)*P(class=N) =  0.006666666666666669


##### Problem 2 - Information Gain based on Entropy

In [21]:
# Entropy of the unsplit dataset, H = -sum(p * log2(p)) over the class priors.
# Zero-probability classes are skipped so log2(0) is never evaluated.
entropy_parent = 0
for class_prior in (prior_Y, prior_N):
    if class_prior > 0:
        entropy_parent -= class_prior * np.log2(class_prior)
print(f"Parent entropy: {entropy_parent:.4f}")

Parent entropy: 0.9799


In [23]:
# Size-weighted class entropy of the partitions produced by splitting on A2.
weighted_entropy_A2 = 0
for level in nbdf['A2'].unique():
    # Class labels of the records that take this A2 value.
    level_labels = nbdf.loc[nbdf['A2'] == level, 'Class']
    n_yes = int((level_labels == 'Y').sum())
    n_no = int((level_labels == 'N').sum())

    n_level = n_yes + n_no
    if n_level == 0:
        continue

    # Entropy of this partition; empty classes contribute nothing.
    level_entropy = 0
    for n_class in (n_yes, n_no):
        p_class = n_class / n_level
        if p_class > 0:
            level_entropy -= p_class * np.log2(p_class)

    # Weight the partition by its share of all records.
    weighted_entropy_A2 += (n_level / total_values) * level_entropy

    print(f"A2={level}: Y={n_yes}, N={n_no}, Entropy={level_entropy:.4f}, Weight={n_level}/{total_values}")

print(f"Weighted Entropy of A2 = {weighted_entropy_A2:.4f}")

A2=Mild: Y=3, N=2, Entropy=0.9710, Weight=5/12
A2=Cool: Y=2, N=2, Entropy=1.0000, Weight=4/12
A2=Hot: Y=2, N=1, Entropy=0.9183, Weight=3/12
Weighted Entropy of A2 = 0.9675


In [25]:
# Information gain of splitting on A2: the parent entropy minus the
# size-weighted entropy of the A2 partitions.
information_gain_a2 = entropy_parent - weighted_entropy_A2
print(f"Information Gain A2 = {information_gain_a2:.4f}")

Information Gain A2 = 0.0124


In [27]:
# Size-weighted class entropy of the partitions produced by splitting on A3.
weighted_entropy_A3 = 0
for level in nbdf['A3'].unique():
    # Class labels of the records that take this A3 value.
    level_labels = nbdf.loc[nbdf['A3'] == level, 'Class']
    n_yes = int((level_labels == 'Y').sum())
    n_no = int((level_labels == 'N').sum())

    n_level = n_yes + n_no
    if n_level == 0:
        continue

    # Entropy of this partition; empty classes contribute nothing.
    level_entropy = 0
    for n_class in (n_yes, n_no):
        p_class = n_class / n_level
        if p_class > 0:
            level_entropy -= p_class * np.log2(p_class)

    # Weight the partition by its share of all records.
    weighted_entropy_A3 += (n_level / total_values) * level_entropy

    print(f"A3={level}: Y={n_yes}, N={n_no}, Entropy={level_entropy:.4f}, Weight={n_level}/{total_values}")

print(f"Weighted Entropy of A3 = {weighted_entropy_A3:.4f}")

A3=East: Y=7, N=1, Entropy=0.5436, Weight=8/12
A3=West: Y=0, N=4, Entropy=0.0000, Weight=4/12
Weighted Entropy of A3 = 0.3624


In [29]:
# Information gain of splitting on A3: the parent entropy minus the
# size-weighted entropy of the A3 partitions.
information_gain_a3 = entropy_parent - weighted_entropy_A3
print(f"Information Gain A3 = {information_gain_a3:.4f}")

Information Gain A3 = 0.6175


##### Problem 3 - Naïve Bayes Model

In [32]:
# Load the adult autism screening data; "?" and empty fields are read as
# missing values (NaN) so they can be dropped later.
autism_raw = pd.read_csv("autism-adult.csv", na_values=["?", ""])
autism_raw.head()

Unnamed: 0,A1_Score,A2_Score,A3_Score,A4_Score,A5_Score,A6_Score,A7_Score,A8_Score,A9_Score,A10_Score,age,gender,ethnicity,jaundice,austim,country_of_res,used_app_before,relation,Class/ASD
0,1,1,1,1,0,0,1,1,0,0,26.0,f,White-European,no,no,'United States',no,Self,NO
1,1,1,0,1,0,0,0,1,0,1,24.0,m,Latino,no,yes,Brazil,no,Self,NO
2,1,1,0,1,1,0,1,1,1,1,27.0,m,Latino,yes,yes,Spain,no,Parent,YES
3,1,1,0,1,0,0,1,1,0,1,35.0,f,White-European,no,yes,'United States',no,Self,NO
4,1,0,0,0,0,0,0,1,0,0,40.0,f,,no,no,Egypt,no,,NO


##### [3.1] Generate training and holdout partitions on the data set. Use 1/3 of the data in the holdout.

In [35]:
# Cast the string-valued predictors to pandas' categorical dtype in a single
# pass. The integer codes are only extracted later, when the model features
# are prepared.
factor_cols = ['gender','ethnicity', 'jaundice', 'austim', 'country_of_res','used_app_before', 'relation']
autism_raw = autism_raw.astype({name: "category" for name in factor_cols})

# The target becomes a categorical with a fixed level order: NO first, YES second.
autism_raw["Class/ASD"] = pd.Categorical(autism_raw["Class/ASD"], categories=["NO", "YES"])

# Work only with fully observed rows.
autism = autism_raw.dropna()

In [37]:
from sklearn.model_selection import train_test_split

# Hold out one third of the rows. Stratifying on the target keeps the class
# proportions the same in both partitions; the seed makes the split repeatable.
train, test = train_test_split(
    autism,
    test_size=1/3,
    random_state=699,
    stratify=autism["Class/ASD"],
)

##### [3.2] Fit a Naïve Bayes model. Make predictions on the holdout data. Generate a confusion matrix and measure the model’s performance

In [40]:
from sklearn.naive_bayes import CategoricalNB

# Separate the predictors from the "Class/ASD" target in each partition.
X_train = train.drop(columns=["Class/ASD"])
y_train = train["Class/ASD"]
X_test = test.drop(columns=["Class/ASD"])
y_test = test["Class/ASD"]

# Replace every categorical predictor with its integer category code.
# Numeric columns are left untouched.
for col in X_train.columns:
    # isinstance on the dtype replaces the deprecated
    # pd.api.types.is_categorical_dtype check.
    if isinstance(X_train[col].dtype, pd.CategoricalDtype):
        # Align the holdout column to the training categories so a given code
        # means the same level in both partitions. set_categories returns a
        # new Series, so its result must be kept (the previous code discarded
        # it, making the alignment a no-op). A holdout value that is not among
        # the training categories becomes code -1.
        train_categories = X_train[col].cat.categories
        aligned_test = X_test[col].astype("category").cat.set_categories(train_categories)
        X_test[col] = aligned_test.cat.codes
        X_train[col] = X_train[col].cat.codes

  if pd.api.types.is_categorical_dtype(X_train[col]):


In [42]:
# Fit a categorical Naive Bayes classifier on the training partition;
# alpha=1 is passed as the model's smoothing parameter.
model_nb = CategoricalNB(alpha=1)
model_nb.fit(X_train, y_train)

In [44]:
# Predict the class label of every holdout row with the fitted model.
y_pred = model_nb.predict(X_test)

In [67]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# Pin the label order explicitly so the matrix rows/columns are guaranteed to
# match the names attached to them below, even if one class happens to be
# absent from the holdout labels or predictions.
class_labels = ["NO", "YES"]

# Confusion matrix: rows are the true classes, columns the predicted classes.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
print("Confusion Matrix:")
print(pd.DataFrame(cm, index=class_labels, columns=class_labels))

# Overall share of holdout rows classified correctly.
print(f"\nAccuracy Score: {accuracy_score(y_test, y_pred):.4f}")

# Per-class precision, recall and F1.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, digits=4))

Confusion Matrix:
      NO  YES
NO   140    3
YES    5   55

Accuracy Score: 0.9606

Classification Report:
              precision    recall  f1-score   support

          NO     0.9655    0.9790    0.9722       143
         YES     0.9483    0.9167    0.9322        60

    accuracy                         0.9606       203
   macro avg     0.9569    0.9478    0.9522       203
weighted avg     0.9604    0.9606    0.9604       203



##### Problem 4 - Full Tree and Pruned Tree

In [72]:
# Load the restaurant data set used for the regression-tree problem and
# preview the first rows.
res = pd.read_csv('restaurantdata-small.csv')
res.head()

Unnamed: 0,Location,Cuisine,Rating,Ambience.Score,Service.Quality.Score,Revenue
0,Rural,Japanese,4.0,1.3,7.0,638945.52
1,Downtown,Mexican,3.2,2.6,3.4,490207.83
2,Rural,Italian,4.7,5.3,6.7,541368.62
3,Rural,Italian,4.4,4.6,2.8,404556.8
4,Downtown,Japanese,4.9,8.6,2.1,1491046.35


##### [4.1] Generate training and holdout partitions on the data set. Use 1/3 of the data in the holdout.

In [74]:
# Predictors: every column except the Revenue target, with the categorical
# columns expanded to indicator columns (first level of each dropped).
predictors = res.drop(columns=['Revenue'])
X = pd.get_dummies(predictors, drop_first=True)

# Target: restaurant revenue.
y = res['Revenue']

X.head()

Unnamed: 0,Rating,Ambience.Score,Service.Quality.Score,Location_Rural,Location_Suburban,Cuisine_French,Cuisine_Indian,Cuisine_Italian,Cuisine_Japanese,Cuisine_Mexican
0,4.0,1.3,7.0,True,False,False,False,False,True,False
1,3.2,2.6,3.4,False,False,False,False,False,False,True
2,4.7,5.3,6.7,True,False,False,False,True,False,False
3,4.4,4.6,2.8,True,False,False,False,True,False,False
4,4.9,8.6,2.1,False,False,False,False,False,True,False


##### [4.2] Fit a regression tree model with Complexity Parameter of 0. Use the tree to make predictions on the holdout data. Measure the tree’s performance using one or more appropriate metrics of your choice.

In [77]:
# Hold out one third of the rows for evaluation; the seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=1/3,
    random_state=42,
)
print(f"Training size: {len(X_train)}, Holdout size: {len(X_test)}")

Training size: 5578, Holdout size: 2790


In [83]:
from sklearn.tree import DecisionTreeRegressor

# Fully grown regression tree: a complexity parameter of 0 (scikit-learn's
# default, spelled out here) means no cost-complexity pruning, and the split
# and leaf minimums place no extra limit on growth.
reg_tree = DecisionTreeRegressor(
    random_state=42,
    min_samples_split=2,
    min_samples_leaf=1,
    ccp_alpha=0.0,
)
reg_tree.fit(X_train, y_train)

In [85]:
# Predict revenue for the holdout rows with the unpruned tree.
y_pred_reg = reg_tree.predict(X_test)

In [109]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Holdout error of the full tree: MSE, MAE and R², in that order.
mse_reg, mae_reg, r2_reg = (
    metric(y_test, y_pred_reg)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print("Full Tree Performance:")
print(f"  MSE: {mse_reg:,.2f}")
print(f"  MAE: {mae_reg:,.2f}")
print(f"  R²: {r2_reg:.4f}")

Full Tree Performance:
  MSE: 13,009,204,841.60
  MAE: 90,720.66
  R²: 0.8171


##### [4.3] Prune the tree by implementing the minimum cross-validation error method. Use the pruned tree to make predictions on the holdout data. Measure the tree’s performance using one or more appropriate metrics of your choice.

In [89]:
from sklearn.model_selection import cross_val_score

In [91]:
# Candidate pruning strengths: the effective alphas along the full tree's
# cost-complexity pruning path.
path = reg_tree.cost_complexity_pruning_path(X_train, y_train)

# Floating-point round-off can leave tiny negative alphas on the path, and
# DecisionTreeRegressor rejects a negative ccp_alpha, so clamp them to 0.
ccp_alphas = np.clip(path.ccp_alphas, 0, None)

# Score each candidate alpha by 5-fold cross-validation on the training data.
cv_scores = []
for ccp_alpha in ccp_alphas:
    tree = DecisionTreeRegressor(random_state=42, ccp_alpha=ccp_alpha)

    # Scoring is negative MSE, so a higher score means a lower error.
    scores = cross_val_score(tree, X_train, y_train, cv=5, scoring='neg_mean_squared_error')

    cv_scores.append(np.mean(scores))

# Minimum cross-validation error rule: keep the alpha whose mean score is
# highest, i.e. whose mean MSE is lowest.
optimal_alpha = ccp_alphas[np.argmax(cv_scores)]

In [94]:
# Refit on the full training partition using the cross-validated alpha;
# this is the pruned tree.
pruned_tree = DecisionTreeRegressor(random_state=42, ccp_alpha=optimal_alpha)
pruned_tree.fit(X_train, y_train)

In [95]:
# Predict revenue for the holdout rows with the pruned tree.
y_pred_pruned = pruned_tree.predict(X_test)

In [111]:
# Holdout error of the pruned tree: MSE, MAE and R², in that order.
mse_pruned, mae_pruned, r2_pruned = (
    metric(y_test, y_pred_pruned)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Best ccp_alpha: {optimal_alpha:.6f}")

print("Pruned Tree Performance:")
print(f"  MSE: {mse_pruned:,.2f}")
print(f"  MAE: {mae_pruned:,.2f}")
print(f"  R²: {r2_pruned:.4f}")

Best ccp_alpha: 49749861.242212
Pruned Tree Performance:
  MSE: 6,211,468,956.55
  MAE: 63,954.57
  R²: 0.9127


##### [4.4] Variable importance for the pruned tree

In [113]:
# Pair each training column with the importance the pruned tree assigned to it.
importances = pruned_tree.feature_importances_
feature_names = X_train.columns

# Rank the features from most to least important.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .sort_values(by='Importance', ascending=False)
)

print(importance_df)

                 Feature  Importance
3         Location_Rural    0.320932
8       Cuisine_Japanese    0.254648
5         Cuisine_French    0.185342
4      Location_Suburban    0.129384
7        Cuisine_Italian    0.084609
9        Cuisine_Mexican    0.020081
6         Cuisine_Indian    0.005004
0                 Rating    0.000000
1         Ambience.Score    0.000000
2  Service.Quality.Score    0.000000
