<a href="https://colab.research.google.com/github/michaeledge27/CSCI290/blob/main/notebooks/partnerProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [208]:
import pandas as pd
from scipy.stats import norm
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

# Data Exploration

In [209]:
np.random.seed(17)

In [210]:
penguins = pd.read_csv("https://github.com/mbrudd/csci290/raw/refs/heads/main/data/penguins.csv")

In [211]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,male,2007
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,female,2007
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,female,2007
3,Adelie,Torgersen,,,,,,2007
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,female,2007


In [212]:
penguins[['island']].value_counts()

Unnamed: 0_level_0,count
island,Unnamed: 1_level_1
Biscoe,168
Dream,124
Torgersen,52


In [213]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
 7   year               344 non-null    int64  
dtypes: float64(4), int64(1), object(3)
memory usage: 21.6+ KB


# NaiveBayesClassifier

In [214]:
def calculate_priors(y):
    return y.value_counts( normalize=True ).to_dict()  #Calculate prior probabilities for each class

In [215]:
# Calcultate conditional probabilities for y(target) given X(features)
# P(B|A)
def calculate_likelihoods(X, y):
  likelihoods = {}              # Initialize dict
  for feature in X:     # Loop through all features
    likelihoods[feature] = {}    # Initialize nested likelihood dict
    for clas in y.unique():     # Loop through all unique values of our target (Gentoo, Adelie, Chinstrap)
      subset = X[y == clas]       # Create a subset of X where y equals the current class
      value_counts = subset[feature].value_counts(normalize=True)  # Calculate value counts for the current feature in the subset and normalize them
      likelihoods[feature][clas] = value_counts.to_dict() #store in the nested likelihoods dict
  return likelihoods

In [216]:

def predict_naive_bayes(instance, priors, likelihoods, classes):
    posteriors = {}

    for clas in classes:
      # Start with log(prior) prob for each class
      posterior = np.log(priors[clas])
      # Loop through all features and corresponding values for the new instance
      for feature, value in instance.items():
        # Check if the feature's value exists in likelihoods for the current class
        if value in likelihoods.get(feature, {}).get(clas, {}):
          # If the value exists, add the log-likelihood to the posterior probability
            posterior += np.log(likelihoods[feature][clas].get(value))

      # Store the posterior probability for the current class
      posteriors[clas] = posterior

    # Return the class with the highest posterior probability
    return max(posteriors, key=posteriors.get)

In [217]:
def naive_bayes(X, y, new_instance):
  priors = calculate_priors(y)    # Calculate priors
  likelihoods = calculate_likelihoods(X, y)   # Calculate likelihoods
  classes = y.unique()    # get all classes
  return predict_naive_bayes(new_instance, priors, likelihoods, classes)

In [218]:
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g', 'sex']]
y = penguins['species']

In [219]:
new_penguin_instance = {
    'bill_length_mm': 45.0,
    'bill_depth_mm': 14.0,
    'flipper_length_mm': 210.0,
    'body_mass_g': 4500,
    'sex': 'Male'
}

In [220]:
predicted_species = naive_bayes(X, y, new_penguin_instance)
print(f"Predicted species: {predicted_species}")

Predicted species: Chinstrap


# SciKit Comparisons


In [221]:
np.random.seed(17)
test = pd.DataFrame( {"x1": np.random.randn( 200 ), "x2": 2*np.random.randn( 200) - 5, "x3": np.random.randint(3, size=200) } )
test["target"] = (test["x1"] > -2)*(test["x2"] > -7 )*(test["x3"]!=1)*1

In [222]:
test

Unnamed: 0,x1,x2,x3,target
0,0.276266,-1.790080,2,1
1,-1.854628,-2.802276,2,1
2,0.623901,-8.206350,1,0
3,1.145311,-6.642125,0,1
4,1.037190,-3.882229,1,0
...,...,...,...,...
195,2.052304,-3.812932,2,1
196,1.145372,-5.475068,1,0
197,0.076480,-4.829170,0,1
198,-0.860655,-4.465796,1,0


In [223]:
test['target'].value_counts( normalize=True )

Unnamed: 0_level_0,proportion
target,Unnamed: 1_level_1
1,0.56
0,0.44


In [224]:
test.columns

Index(['x1', 'x2', 'x3', 'target'], dtype='object')

In [225]:
for feature in test[['x1', 'x2']]:
  print(feature)

x1
x2


In [243]:
subset = test[['x1', 'x2']][test['target'] == 0]
subset

Unnamed: 0,x1,x2
2,0.623901,-8.206350
4,1.037190,-3.882229
5,1.886639,-1.362248
6,-0.111698,-8.061925
10,2.171257,-7.883595
...,...,...
192,0.050414,-4.799354
194,-0.146796,-4.109879
196,1.145372,-5.475068
198,-0.860655,-4.465796


In [247]:
value_counts = subset[test['x1']].value_counts(normalize=True)
value_counts

KeyError: "None of [Index([ 0.27626589002131874,   -1.854628078806505,   0.6239011113263563,\n         1.1453112895720903,   1.0371904682278827,   1.8866389297816077,\n       -0.11169829012876616, -0.36210133839904846,  0.14867504504524393,\n        -0.4377831525146323,\n       ...\n         0.3382879238734603,   0.5102855550310551,  0.05041355903881306,\n        -0.8123132807691472,  -0.1467960824005453,   2.0523040848275786,\n          1.145372412072999,  0.07647991830718862,  -0.8606554654163185,\n       -0.38769897939626885],\n      dtype='float64', length=200)] are in the [columns]"

In [227]:
from sklearn.naive_bayes import CategoricalNB, GaussianNB

In [228]:
gnb = GaussianNB().fit( np.array( test[["x1","x2"]] ), np.array( test["target"] ).reshape(-1,1))

  y = column_or_1d(y, warn=True)


In [238]:
gnb.predict( np.array( pd.DataFrame( {"x1": [-3,-1.1], "x2": [-9,-3] } ) ) )

array([0, 1])

In [239]:
test_new_instance = {
    'x1': -3,
    'x2': -9
}

In [240]:
naive_bayes(test[['x1', 'x2']], test['target'], test_new_instance)

1

In [232]:
testarray1 = [0, 3, 5, 100]
testarray2 = [0, 8, 15, 100]

In [233]:
for t in testarray1:
    for t2 in testarray2:
        test_new_instance = {
            'x1': t,
            'x2': t2
        }
        print(naive_bayes(test[['x1', 'x2']], test['target'], test_new_instance))

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [234]:
cnb = CategoricalNB().fit( np.array( test[["x3"]] ).reshape(-1,1), np.array( test["target"] ).reshape(-1,1) )

  y = column_or_1d(y, warn=True)


In [235]:
cnb.predict( np.array([0,1,2] ).reshape(-1,1) )

array([1, 0, 1])

In [236]:
test_new_instance2 = {
    'x3': 1
}

In [237]:
naive_bayes(test[['x3']], test['target'], test_new_instance2)

1

# SciKit Learn Classifier