<a href="https://colab.research.google.com/github/michaeledge27/CSCI290/blob/main/notebooks/partnerProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
from scipy.stats import norm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Data Exploration

In [3]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [4]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [5]:
penguins[['island']].value_counts()

Unnamed: 0_level_0,count
island,Unnamed: 1_level_1
Biscoe,168
Dream,124
Torgersen,52


In [6]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


# NaiveBayesClassifier

In [7]:
def calculate_priors(y):
    return y.value_counts( normalize=True ).to_dict()  #Calculate prior probabilities for each class

In [8]:
# Calcultate conditional probabilities for y(target) given X(features)
# P(B|A)
def calculate_likelihoods(X, y):
  likelihoods = {}              # Initialize dict
  for feature in X.columns:     # Loop through all features
    likelihoods[feature] = {}    # Initialize nested likelihood dict
    for clas in y.unique():     # Loop through all unique values of our target (Gentoo, Adelie, Chinstrap)
      subset = X[y == clas]       # Create a subset of X where y equals the current class
      value_counts = subset[feature].value_counts(normalize=True)  # Calculate value counts for the current feature in the subset and normalize them
      likelihoods[feature][clas] = value_counts.to_dict() #store in the nested likelihoods dict
  return likelihoods

In [9]:

def predict_naive_bayes(instance, priors, likelihoods, classes):
    posteriors = {}

    for clas in classes:
      # Start with log(prior) prob for each class
      posterior = np.log(priors[clas])
      # Loop through all features and corresponding values for the new instance
      for feature, value in instance.items():
        # Check if the feature's value exists in likelihoods for the current class
        if value in likelihoods.get(feature, {}).get(clas, {}):
          # If the value exists, add the log-likelihood to the posterior probability
            posterior += np.log(likelihoods[feature][clas].get(value))

      # Store the posterior probability for the current class
      posteriors[clas] = posterior

    # Return the class with the highest posterior probability
    return max(posteriors, key=posteriors.get)

In [10]:
def naive_bayes(X, y, new_instance):
  priors = calculate_priors(y)    # Calculate priors
  likelihoods = calculate_likelihoods(X, y)   # Calculate likelihoods
  classes = y.unique()    # get all classes
  return predict_naive_bayes(new_instance, priors, likelihoods, classes)

In [11]:
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']]
y = penguins['species']

In [12]:
new_penguin_instance = {
    'bill_length_mm': 45.0,
    'bill_depth_mm': 14.0,
    'flipper_length_mm': 210.0,
    'body_mass_g': 4500,
    'sex': 'Male'
}

In [13]:
predicted_species = naive_bayes(X, y, new_penguin_instance)
print(f"Predicted species: {predicted_species}")

Predicted species: Chinstrap


# SciKit Comparisons


In [16]:
np.random.seed(42)
test = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(3, size=200) } )
test["target"] = (test["x1"] > -2)*(test["x2"] > -7 )*(test["x3"]!=1)*1

In [17]:
test

Unnamed: 0,x1,x2,x3,target
0,0.496714,-4.284425,2,1
1,-0.138264,-3.878431,2,1
2,0.647689,-2.833898,0,1
3,1.523030,-2.892396,0,1
4,-0.234153,-7.755339,2,0
...,...,...,...,...
195,0.385317,-5.938351,0,1
196,-0.883857,-8.426269,1,0
197,0.153725,-2.292255,2,1
198,0.058209,-5.229080,0,1


# SciKit Learn Classifier