In [39]:
import math
import csv
from random import shuffle


In [40]:
class DataHandler:
    """Load a labelled CSV dataset and prepare it for training and evaluation.

    Rows are kept as plain lists of strings, exactly as ``csv.reader`` yields
    them, until ``separate_features_labels`` converts the feature columns.
    """

    def __init__(self, filepath):
        # Only remember the location; the file is opened by read_csv().
        self.filepath = filepath

    def read_csv(self):
        """Return every data row of the CSV file as a list of strings.

        The first row is treated as a header and discarded. Blank lines
        (which ``csv.reader`` yields as empty lists) are dropped so that later
        indexing into each row cannot fail on them. An empty file yields ``[]``.
        """
        # newline='' lets the csv module do its own newline handling, as its
        # documentation recommends for files passed to csv.reader.
        with open(self.filepath, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # Default of None avoids StopIteration when the file is empty.
            next(csv_reader, None)
            dataset = [row for row in csv_reader if row]
        return dataset

    def train_test_split(self, dataset, test_size=0.2, seed=None):
        """Randomly split ``dataset`` into ``(train, test)`` lists.

        Args:
            dataset: Sequence of rows. It is NOT modified; a shuffled copy is
                split instead.
            test_size: Fraction of rows placed in the test part, in [0, 1].
            seed: Optional seed for a reproducible split. When ``None`` the
                module-level ``shuffle`` (global random state) is used.

        Returns:
            Tuple ``(train_rows, test_rows)``.

        Raises:
            ValueError: If ``test_size`` is outside [0, 1].
        """
        if not 0 <= test_size <= 1:
            raise ValueError("test_size must be between 0 and 1")

        # Work on a copy so the caller's list keeps its original order.
        shuffled = list(dataset)
        if seed is None:
            shuffle(shuffled)
        else:
            from random import Random  # local import: only needed for seeded splits
            Random(seed).shuffle(shuffled)

        # Everything before the split index is training data, the rest is test data.
        split_index = int(len(shuffled) * (1 - test_size))
        return shuffled[:split_index], shuffled[split_index:]

    def separate_features_labels(self, dataset):
        """Split rows into numeric feature vectors and their labels.

        The first column of each row is skipped, the last column is returned
        as the label, and every column in between is converted to ``float``.

        Returns:
            Tuple ``(features, labels)`` of equal-length lists.
        """
        features = [list(map(float, data[1:-1])) for data in dataset]
        labels = [data[-1] for data in dataset]
        return features, labels

In [41]:
class NaiveBayesClassifier:
    """Gaussian Naive Bayes classifier implemented with the standard library.

    For every class the model stores the prior probability and, per feature,
    the mean and (population) standard deviation of the training values.
    Prediction picks the class with the highest posterior score.
    """

    # Lower bound applied to a standard deviation before it is used as a
    # divisor. A feature that is constant within a class has std == 0, which
    # would otherwise raise ZeroDivisionError.
    _MIN_STD = 1e-9

    def __init__(self):
        # Per-class feature means, per-class feature standard deviations,
        # and per-class prior probabilities; all keyed by label.
        self.means = {}
        self.stds = {}
        self.class_probabilities = {}

    def fit(self, X, y):
        """Estimate class priors and per-feature Gaussian parameters.

        Args:
            X: List of feature vectors (lists of numbers).
            y: List of labels, one per row of ``X``.

        Raises:
            ValueError: If ``X`` and ``y`` differ in length.
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same length")
        # Start from a clean slate so labels from an earlier fit cannot
        # linger in means/stds and break predict_single().
        self.means = {}
        self.stds = {}
        self.class_probabilities = {}
        self._calculate_class_probabilities(y)
        self._calculate_means_stds(X, y)

    def _calculate_class_probabilities(self, y):
        """Set ``class_probabilities`` to each label's relative frequency in ``y``."""
        # Single pass; labels keep first-seen order, which makes tie-breaking
        # in predict_single() independent of hash ordering.
        class_counts = {}
        for label in y:
            class_counts[label] = class_counts.get(label, 0) + 1
        total_count = len(y)
        self.class_probabilities = {
            label: count / total_count for label, count in class_counts.items()
        }

    def _calculate_means_stds(self, X, y):
        """Compute per-class, per-feature mean and population standard deviation."""
        for label in self.class_probabilities:
            # Rows belonging to the current class, transposed into feature columns.
            label_features = [row for row, row_label in zip(X, y) if row_label == label]
            columns = list(zip(*label_features))
            means = [sum(column) / len(column) for column in columns]
            self.means[label] = means
            self.stds[label] = [
                math.sqrt(sum((value - mean) ** 2 for value in column) / len(column))
                for mean, column in zip(means, columns)
            ]

    def predict_single(self, input_features):
        """Return the most probable label for one feature vector.

        Scores are accumulated as sums of logarithms rather than products of
        densities: with many features the product underflows to 0.0 for every
        class, which makes the chosen label arbitrary.

        Raises:
            ValueError: If the classifier has not been fitted.
        """
        if not self.means:
            raise ValueError("NaiveBayesClassifier must be fitted before predicting")
        log_scores = {}
        for label in self.means:
            # Start with the log prior, then add each feature's log-likelihood.
            score = math.log(self.class_probabilities[label])
            for i, feature in enumerate(input_features):
                score += self._calculate_log_probability(
                    feature, self.means[label][i], self.stds[label][i]
                )
            log_scores[label] = score
        return max(log_scores, key=log_scores.get)

    def _calculate_probability(self, x, mean, std):
        """Gaussian probability density of ``x`` for the given mean and std.

        ``std`` is clamped to ``_MIN_STD`` so a zero deviation cannot cause a
        division by zero.
        """
        std = max(std, self._MIN_STD)
        exponent = math.exp(-(math.pow(x - mean, 2) / (2 * math.pow(std, 2))))
        return (1 / (math.sqrt(2 * math.pi) * std)) * exponent

    def _calculate_log_probability(self, x, mean, std):
        """Natural log of the Gaussian density, computed without ``exp``."""
        std = max(std, self._MIN_STD)
        return -math.log(math.sqrt(2 * math.pi) * std) - (x - mean) ** 2 / (2 * std ** 2)

    def predict(self, X):
        """Return the predicted label for every feature vector in ``X``."""
        return [self.predict_single(features) for features in X]

    def classification_report(self, y_true, y_pred):
        """Compute one-vs-rest metrics for every label present in ``y_true``.

        Returns:
            Dict mapping each label to a dict with the keys ``'Precision'``,
            ``'Recall'``, ``'F1-score'`` and ``'Accuracy'``. Ratios whose
            denominator is zero are reported as 0.

        Raises:
            ValueError: If ``y_true`` and ``y_pred`` differ in length.
        """
        if len(y_true) != len(y_pred):
            raise ValueError("y_true and y_pred must have the same length")

        report = {}
        # dict.fromkeys keeps first-seen order, so the report order is stable
        # across runs instead of depending on set iteration order.
        for label in dict.fromkeys(y_true):
            tp = fp = fn = tn = 0
            for actual, predicted in zip(y_true, y_pred):
                if predicted == label:
                    if actual == label:
                        tp += 1
                    else:
                        fp += 1
                elif actual == label:
                    fn += 1
                else:
                    tn += 1

            precision = tp / (tp + fp) if tp + fp > 0 else 0
            recall = tp / (tp + fn) if tp + fn > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
            # One-vs-rest accuracy: correct "is label / is not label" decisions.
            accuracy = (tp + tn) / len(y_true)

            report[label] = {
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1,
                'Accuracy': accuracy
            }

        return report

In [42]:
def main(filepath='/content/banana_quality.csv'):
    """Train and evaluate the Naive Bayes classifier on a CSV dataset.

    Reads the file, splits it 80/20 into train and test rows, fits a
    ``NaiveBayesClassifier`` on the training part and prints per-class
    precision, recall, F1-score and accuracy for the test part.

    Args:
        filepath: Path to the CSV file. Defaults to the banana-quality CSV
            location used by this notebook.
    """
    # The data handler performs all file reading and row preparation.
    data_handler = DataHandler(filepath)

    # Rows come back as a list of lists, one sub-list per CSV row (header skipped).
    dataset = data_handler.read_csv()

    # Default split: 80% training, 20% testing.
    train_set, test_set = data_handler.train_test_split(dataset)

    # Turn both parts into numeric feature vectors plus their labels.
    train_features, train_labels = data_handler.separate_features_labels(train_set)
    test_features, test_labels = data_handler.separate_features_labels(test_set)

    # Fitting estimates the class priors and per-feature Gaussian parameters.
    classifier = NaiveBayesClassifier()
    classifier.fit(train_features, train_labels)

    # Predict labels for the held-out rows and compare them with the truth.
    predictions = classifier.predict(test_features)
    report = classifier.classification_report(test_labels, predictions)

    # Print the metrics of every class, two decimals each.
    print("Classification Report:")
    for label, metrics in report.items():
        print(f"Class {label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.2f}")
        print()

# Run the pipeline only when this code is executed directly (``__name__`` is
# "__main__"); importing it as a module defines main() without calling it.
if __name__ == "__main__":
    main()

Classification Report:
Class Good:
  Precision: 0.86
  Recall: 0.85
  F1-score: 0.86
  Accuracy: 0.86

Class Bad:
  Precision: 0.85
  Recall: 0.86
  F1-score: 0.85
  Accuracy: 0.86



In [43]:
import csv
from random import shuffle
from math import sqrt
from collections import Counter

In [44]:
class DataHandler:
    """Load a labelled CSV dataset and prepare it for training and evaluation.

    Rows are kept as plain lists of strings, exactly as ``csv.reader`` yields
    them, until ``separate_features_labels`` converts the feature columns.
    """

    def __init__(self, filepath):
        # Only remember the location; the file is opened by read_csv().
        self.filepath = filepath

    def read_csv(self):
        """Return every data row of the CSV file as a list of strings.

        The first row is treated as a header and discarded. Blank lines
        (which ``csv.reader`` yields as empty lists) are dropped so that later
        indexing into each row cannot fail on them. An empty file yields ``[]``.
        """
        # newline='' lets the csv module do its own newline handling, as its
        # documentation recommends for files passed to csv.reader.
        with open(self.filepath, 'r', newline='') as file:
            csv_reader = csv.reader(file)
            # Default of None avoids StopIteration when the file is empty.
            next(csv_reader, None)
            dataset = [row for row in csv_reader if row]
        return dataset

    def train_test_split(self, dataset, test_size=0.2, seed=None):
        """Randomly split ``dataset`` into ``(train, test)`` lists.

        Args:
            dataset: Sequence of rows. It is NOT modified; a shuffled copy is
                split instead.
            test_size: Fraction of rows placed in the test part, in [0, 1].
            seed: Optional seed for a reproducible split. When ``None`` the
                module-level ``shuffle`` (global random state) is used.

        Returns:
            Tuple ``(train_rows, test_rows)``.

        Raises:
            ValueError: If ``test_size`` is outside [0, 1].
        """
        if not 0 <= test_size <= 1:
            raise ValueError("test_size must be between 0 and 1")

        # Work on a copy so the caller's list keeps its original order.
        shuffled = list(dataset)
        if seed is None:
            shuffle(shuffled)
        else:
            from random import Random  # local import: only needed for seeded splits
            Random(seed).shuffle(shuffled)

        # Everything before the split index is training data, the rest is test data.
        split_index = int(len(shuffled) * (1 - test_size))
        return shuffled[:split_index], shuffled[split_index:]

    def separate_features_labels(self, dataset):
        """Split rows into numeric feature vectors and their labels.

        The first column of each row is skipped, the last column is returned
        as the label, and every column in between is converted to ``float``.

        Returns:
            Tuple ``(features, labels)`` of equal-length lists.
        """
        features = [list(map(float, data[1:-1])) for data in dataset]
        labels = [data[-1] for data in dataset]
        return features, labels

In [45]:
import numpy as np

class KNNClassifier:
    """k-nearest-neighbours classifier using Euclidean distance and majority vote."""

    def __init__(self, k=3):
        """Create a classifier that votes among the ``k`` closest training rows.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError("k must be at least 1")
        self.k = k
        # Filled in by fit(); None means "not fitted yet".
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store the training data; KNN is a lazy learner with no training step.

        Raises:
            ValueError: If ``X_train`` and ``y_train`` differ in length.
        """
        if len(X_train) != len(y_train):
            raise ValueError("X_train and y_train must have the same length")
        self.X_train = np.array(X_train)
        self.y_train = np.array(y_train)

    def predict_single(self, input_features):
        """Return the majority label among the ``k`` nearest training rows.

        Ties in the vote go to the label whose neighbour is closest, so the
        result no longer depends on set iteration order.

        Raises:
            ValueError: If the classifier has not been fitted with any samples.
        """
        if self.X_train is None or len(self.X_train) == 0:
            raise ValueError("KNNClassifier must be fitted with at least one sample before predicting")

        # Euclidean distance to every training row in one vectorized step;
        # the reshape keeps one row per training sample.
        deltas = (self.X_train - np.asarray(input_features)).reshape(len(self.X_train), -1)
        distances = np.sqrt(np.sum(deltas ** 2, axis=1))

        # Stable sort: equidistant rows keep their training order.
        k_indices = np.argsort(distances, kind="stable")[:self.k]
        k_labels = [self.y_train[i] for i in k_indices]

        # k_labels is ordered nearest-first and Counter orders equal counts by
        # first occurrence, so a tied vote resolves to the nearest label.
        return Counter(k_labels).most_common(1)[0][0]

    def predict(self, X):
        """Return the predicted label for every feature vector in ``X``."""
        return [self.predict_single(features) for features in X]

    def classification_report(self, y_true, y_pred):
        """Compute one-vs-rest metrics for every label present in ``y_true``.

        Returns:
            Dict mapping each label to a dict with the keys ``'Precision'``,
            ``'Recall'``, ``'F1-score'`` and ``'Accuracy'``. Ratios whose
            denominator is zero are reported as 0.

        Raises:
            ValueError: If ``y_true`` and ``y_pred`` differ in length.
        """
        if len(y_true) != len(y_pred):
            raise ValueError("y_true and y_pred must have the same length")

        report = {}
        # dict.fromkeys keeps first-seen order, so the report order is stable
        # across runs instead of depending on set iteration order.
        for label in dict.fromkeys(y_true):
            tp = fp = fn = tn = 0
            for actual, predicted in zip(y_true, y_pred):
                if predicted == label:
                    if actual == label:
                        tp += 1
                    else:
                        fp += 1
                elif actual == label:
                    fn += 1
                else:
                    tn += 1

            precision = tp / (tp + fp) if tp + fp > 0 else 0
            recall = tp / (tp + fn) if tp + fn > 0 else 0
            f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
            # One-vs-rest accuracy: correct "is label / is not label" decisions.
            accuracy = (tp + tn) / len(y_true)

            report[label] = {
                'Precision': precision,
                'Recall': recall,
                'F1-score': f1,
                'Accuracy': accuracy
            }
        return report


In [None]:
def main(filepath='/content/banana_quality.csv', k=5):
    """Train and evaluate the KNN classifier on a CSV dataset.

    Reads the file, splits it 80/20 into train and test rows, fits a
    ``KNNClassifier`` on the training part and prints per-class precision,
    recall, F1-score and accuracy for the test part.

    Args:
        filepath: Path to the CSV file. Defaults to the banana-quality CSV
            location used by this notebook.
        k: Number of neighbours that vote on each prediction.
    """
    # The data handler performs all file reading and row preparation.
    data_handler = DataHandler(filepath)

    # Rows come back as a list of lists, one sub-list per CSV row (header skipped).
    dataset = data_handler.read_csv()

    # Default split: 80% training, 20% testing.
    train_set, test_set = data_handler.train_test_split(dataset)

    # Turn both parts into numeric feature vectors plus their labels.
    train_features, train_labels = data_handler.separate_features_labels(train_set)
    test_features, test_labels = data_handler.separate_features_labels(test_set)

    # "Fitting" KNN only stores the training rows for later distance lookups.
    classifier = KNNClassifier(k=k)
    classifier.fit(train_features, train_labels)

    # Predict labels for the held-out rows and compare them with the truth.
    predictions = classifier.predict(test_features)
    report = classifier.classification_report(test_labels, predictions)

    # Print the metrics of every class, two decimals each.
    print("Classification Report:")
    for label, metrics in report.items():
        print(f"Class {label}:")
        for metric, value in metrics.items():
            print(f"  {metric}: {value:.2f}")
        print()

# Run the KNN pipeline only when this code is executed directly; importing it
# as a module defines main() without calling it.
if __name__ == "__main__":
    main()
