In [1]:
import pandas as pd
import numpy as np

In [2]:
def load_data(file_path):
    return pd.read_csv(file_path)

In [3]:
def split_features_target(dataset, target_column):
    X = dataset.drop(columns=[target_column])  # Features
    y = dataset[target_column]  # Target variable
    return X, y

In [4]:
def calculate_prior(y):
    class_counts = y.value_counts().to_dict()
    total_count = len(y)
    prior = {cls: count / total_count for cls, count in class_counts.items()}
    return prior

In [5]:
def calculate_likelihood(X, y):
    likelihood = {}
    for feature in X.columns:
        likelihood[feature] = {}
        for feature_value in X[feature].unique():
            likelihood[feature][feature_value] = {}
            for class_value in y.unique():
                feature_class_count = len(X[(X[feature] == feature_value) & (y == class_value)])
                class_count = len(y[y == class_value])
                likelihood[feature][feature_value][class_value] = (feature_class_count + 1) / (class_count + len(X[feature].unique()))
    return likelihood

In [6]:
def calculate_posterior(sample, prior, likelihood):
    posteriors = {}
    for class_value, class_prior in prior.items():
        posterior = class_prior
        for feature, feature_value in sample.items():
            if feature_value in likelihood[feature]:
                posterior *= likelihood[feature][feature_value].get(class_value, 1e-5)
        posteriors[class_value] = posterior
    return posteriors

In [7]:
def predict(sample, prior, likelihood):
    posteriors = calculate_posterior(sample, prior, likelihood)
    return max(posteriors, key=posteriors.get)

In [8]:
def predict_dataset(X, prior, likelihood):
    predictions = []
    for _, row in X.iterrows():
        predictions.append(predict(row, prior, likelihood))
    return predictions

In [12]:
from sklearn.model_selection import train_test_split
def naive_bayes(file_path, target_column):
    # Load and split data
    dataset = load_data(file_path)
    # dataset = dataset.iloc[:, :10]
    X, y = split_features_target(dataset, target_column)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    prior = calculate_prior(y_train)
    likelihood = calculate_likelihood(X_train, y_train)

    print(X_test, y_test)
    predictions = predict_dataset(X_test, prior, likelihood)
    accuracy = np.mean(predictions == y_test)

    print(f"Predictions: {predictions}")
    print(f"Accuracy on test data: {accuracy:.2f}")
    
    return predictions, accuracy

In [13]:
if __name__ == '__main__':
    # Path to your CSV file
    file_path = 'breast_cancer_data.csv'
    target_column = 'diagnosis'  # Name of the target column in your CSV
    
    naive_bayes(file_path, target_column)

           id  radius_mean  texture_mean  perimeter_mean  area_mean  \
204     87930        12.47         18.60           81.09      481.9   
70     859575        18.94         21.31          123.60     1130.0   
131      8670        15.46         19.48          101.70      748.9   
431    907915        12.40         17.68           81.47      467.8   
540    921385        11.54         14.44           74.65      402.9   
..        ...          ...           ...             ...        ...   
486    913102        14.64         16.85           94.21      666.0   
75    8610404        16.07         19.65          104.10      817.7   
249    884689        11.52         14.93           73.87      406.3   
238    883270        14.22         27.85           92.55      623.9   
265  88995002        20.73         31.12          135.70     1419.0   

     smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
204          0.09965           0.10580         0.08005              