<a href="https://colab.research.google.com/github/michaeledge27/CSCI290/blob/main/notebooks/partnerProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [384]:
import pandas as pd
from scipy.stats import norm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Data Exploration

In [385]:
np.random.seed(17)

In [386]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [387]:
penguins.head(10)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,male,2007
6,Adelie,Torgersen,38.9,17.8,181.0,3625.0,female,2007
7,Adelie,Torgersen,39.2,19.6,195.0,4675.0,male,2007
8,Adelie,Torgersen,34.1,18.1,193.0,3475.0,,2007
9,Adelie,Torgersen,42.0,20.2,190.0,4250.0,,2007


In [388]:
penguins[['island']].value_counts()

Unnamed: 0_level_0,count
island,Unnamed: 1_level_1
Biscoe,168
Dream,124
Torgersen,52


In [389]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


# NaiveBayesClassifier

In [390]:
def calculate_priors(y):
    return y.value_counts( normalize=True ).to_dict()  #Calculate prior probabilities for each class

In [391]:
# Calcultate conditional probabilities for y(target) given X(features)
# P(B|A)
def calculate_likelihoods(X, y):
  likelihoods = {}              # Initialize dict
  for feature in X.columns:     # Loop through all features
    likelihoods[feature] = {}         # Initialize nested likelihood dict
    for cls in y.unique():
        value_counts = X[feature][y == cls].value_counts()
        total_count = value_counts.sum()
        likelihoods[feature][cls] = {}
        subset = X[y == cls]
        total_count = len(subset)
        value_counts = subset[feature].value_counts()

        # Apply Laplace smoothing
        for value in value_counts.index:
            likelihoods[feature][cls][value] = (value_counts[value] + 1) / (total_count + len(value_counts))

        # Handle unseen values by assigning a small probability (laplace corrections)
            likelihoods[feature][cls]['__UNSEEN__'] = 1 / (total_count + len(value_counts))
  return likelihoods

In [392]:

def predict_naive_bayes(instance, priors, likelihoods, classes):
    posteriors = {}

    for clas in classes:
      # Start with log(prior) prob for each class
      posterior = np.log(priors[clas])
      # Loop through all features and corresponding values for the new instance
      for feature, value in instance.items():
        # Check if the feature's value exists in likelihoods for the current class
        if feature in likelihoods and clas in likelihoods[feature] and value in likelihoods[feature][clas]:
          # If the value exists, add the log-likelihood to the posterior probability

            likelihood = likelihoods[feature][clas].get(value, 1e-6)  # Smoothing for unseen values
            posterior += np.log(likelihood)
      # Store the posterior probability for the current class
      posteriors[clas] = posterior

    # Return the class with the highest posterior probability
    return max(posteriors, key=posteriors.get)

In [393]:
def naive_bayes(X, y, new_instance):
  priors = calculate_priors(y)    # Calculate priors
  likelihoods = calculate_likelihoods(X, y)   # Calculate likelihoods
  classes = y.unique()    # get all classes
  return predict_naive_bayes(new_instance, priors, likelihoods, classes)

In [394]:
X = penguins[['island', 'sex', 'year']]
y = penguins['species']

In [395]:
new_penguin_instance = {
    'island': 'Dream',
    'sex': 'female',
    'year': 2009
}

In [396]:
predicted_species = naive_bayes(X, y, new_penguin_instance)
print(f"Predicted species: {predicted_species}")

Predicted species: Gentoo


# SciKit Comparisons


In [397]:
np.random.seed(17)
test = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(1, 3, size=200), "x4": np.random.randint(1, 4, size=200)} )
test["target"] = (test["x1"] > -2)*(test["x2"] > -7 )*(test["x3"]!=1)*1

In [398]:
test

Unnamed: 0,x1,x2,x3,x4,target
0,0.276266,-1.790080,2,3,1
1,-1.854628,-2.802276,2,3,1
2,0.623901,-8.206350,1,1,0
3,1.145311,-6.642125,1,1,0
4,1.037190,-3.882229,2,3,1
...,...,...,...,...,...
195,2.052304,-3.812932,1,1,0
196,1.145372,-5.475068,1,2,0
197,0.076480,-4.829170,1,2,0
198,-0.860655,-4.465796,1,3,0


In [399]:
test['target'].value_counts( normalize=True )

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
0,0.63
1,0.37


In [400]:
test.columns

Index(['x1', 'x2', 'x3', 'x4', 'target'], dtype='object')

In [401]:
for feature in test[['x1', 'x2']]:
  print(feature)

x1
x2


In [402]:
subset = test[['x1', 'x2']][test['target'] == 0]
subset

Unnamed: 0,x1,x2
2,0.623901,-8.206350
3,1.145311,-6.642125
5,1.886639,-1.362248
6,-0.111698,-8.061925
10,2.171257,-7.883595
...,...,...
195,2.052304,-3.812932
196,1.145372,-5.475068
197,0.076480,-4.829170
198,-0.860655,-4.465796


In [403]:
len(subset['x1'])

126

In [404]:
subset['x1'].nunique()

126

In [405]:
value_counts = subset['x1'].value_counts(normalize=True)
value_counts

Unnamed: 0_level_0,proportion
x1,Unnamed: 1_level_1
0.623901,0.007937
-0.314661,0.007937
-0.579169,0.007937
-0.661848,0.007937
1.612115,0.007937
...,...
-0.230373,0.007937
1.185404,0.007937
0.798402,0.007937
0.429564,0.007937


In [406]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB

In [407]:
gnb = GaussianNB().fit( np.array( test[["x1","x2"]] ), np.array( test["target"] ).reshape(-1,1))

  y = column_or_1d(y, warn=True)


In [408]:
gnb.predict( np.array( pd.DataFrame( {"x1": [-3,-1.1], "x2": [-9,-3] } ) ) )

array([0, 1])

In [409]:
test_new_instance = {
    'x1': -3,
    'x2': -9
}

In [410]:
naive_bayes(test[['x1', 'x2']], test['target'], test_new_instance)

0

In [421]:
testarray1 = [-3, -1.1, 10, -5, -100]
testarray2 = [-9, -3, 100, -15, -100]

In [423]:
for i in range(len(testarray1)):
    t = testarray1[i]
    t2 = testarray2[i]
    test_new_instance = {
        'x1': t,
        'x2': t2
    }
    result = naive_bayes(test[['x1', 'x2']], test['target'], test_new_instance)
    print(f"{test_new_instance} = Predicted class: {result}")

{'x1': -3, 'x2': -9} = Predicted class: 0
{'x1': -1.1, 'x2': -3} = Predicted class: 0
{'x1': 10, 'x2': 100} = Predicted class: 0
{'x1': -5, 'x2': -15} = Predicted class: 0
{'x1': -100, 'x2': -100} = Predicted class: 0


In [413]:
cnb = CategoricalNB().fit( np.array( test[["x3"]] ).reshape(-1,1), np.array( test["target"] ).reshape(-1,1) )

  y = column_or_1d(y, warn=True)


In [414]:
cnb.predict( np.array([0,1,2] ).reshape(-1,1) )

array([0, 0, 1])

In [415]:
test_new_instance2 = {
    'x3': 1,
    'x4': 3
}

In [416]:
X = test[['x3', 'x4']]
X

Unnamed: 0,x3,x4
0,2,3
1,2,3
2,1,1
3,1,1
4,2,3
...,...,...
195,1,1
196,1,2
197,1,2
198,1,3


In [417]:
y = test['target']

In [418]:
naive_bayes(X, y, test_new_instance2)

0

# SciKit Learn Classifier