# Machine Learning Project Report

# 1. Environment & Dateset preparation

## 1.1 Import libraries

In [16]:
import os
import glob
import math
import random
import pathlib
import importlib
import numpy as np
import pandas as pd
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
import tensorflow as tf
import tensorflow.keras.layers as layers
import tensorflow.keras.activations as activations
import tensorflow.keras.optimizers as optimizers
import tensorflow.keras.losses as losses
import tensorflow.keras.metrics as metrics

In [17]:
os.environ["KAGGLE_KEY"] = '6c8ce03a58d29eae65ff234d38d43a8b'
os.environ["KAGGLE_USERNAME"] = 'orangethefish'

In [18]:
!kaggle datasets download -d uciml/iris
!unzip -qq -o "iris.zip" -d "dataset"
!echo "Unzip completed"

Dataset URL: https://www.kaggle.com/datasets/uciml/iris
License(s): CC0-1.0
iris.zip: Skipping, found more recently modified local copy (use --force to force download)
Unzip completed


In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

def preprocess_dataset(df, target_variable, test_size=0.2, random_state=None):
    """
    General preprocessing pipeline for a dataset to handle missing values, encoding categorical variables, 
    and scaling numerical features.

    Parameters:
    - df: Pandas DataFrame containing the dataset.
    - target_variable: The name of the column to be used as the target variable.
    - test_size: Proportion of the dataset to include in the test split.
    - random_state: Controls the shuffling applied to the data before applying the split.

    Returns:
    - X_train, X_test, y_train, y_test: preprocessed split data.
    """
    
    # Separate features and target variable
    X = df.drop(columns=[target_variable])
    y = df[target_variable]
    
    # Splitting the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    
    # Identifying numerical and categorical columns
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    
    # Creating transformers for numerical and categorical features
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])
    
    # Combining transformers into a ColumnTransformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)
        ])
    
    # Applying the ColumnTransformer to the training and testing sets
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)
    
    return X_train, X_test, y_train, y_test

# Example usage with the Iris dataset
from sklearn.datasets import load_iris
iris = load_iris()
df_iris = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

X_train, X_test, y_train, y_test = preprocess_dataset(df_iris, 'target')

In [25]:
import numpy as np

class GaussianNaiveBayes:
    def fit(self, X, y):
        # Separate the data by class
        self.classes = np.unique(y)
        self.parameters = {}
        
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'var': X_c.var(axis=0),
                'prior': X_c.shape[0] / X.shape[0]
            }
            
    def calculate_likelihood(self, class_idx, x):
        mean = self.parameters[class_idx]["mean"]
        var = self.parameters[class_idx]["var"]
        numerator = np.exp(-(x-mean)**2 / (2 * var))
        denominator = np.sqrt(2 * np.pi * var)
        return numerator / denominator
    
    def calculate_posterior(self, x):
        posteriors = []
        
        for c in self.classes:
            prior = np.log(self.parameters[c]["prior"])
            conditional_prob = np.sum(np.log(self.calculate_likelihood(c, x)))
            posteriors.append(prior + conditional_prob)
            
        return self.classes[np.argmax(posteriors)]
    
    def predict(self, X):
        return [self.calculate_posterior(x) for x in X]

# Assume `X_train`, `y_train` contains the training data and labels
nb = GaussianNaiveBayes()
nb.fit(X_train, y_train)

# To predict the class of a new sample:
predictions = nb.predict(X_test)
accuracy = np.mean(predictions == y_test)
print(f"Accuracy: {accuracy}")

Accuracy: 0.9333333333333333
