In [1]:
pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [2]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer_wisconsin_diagnostic = fetch_ucirepo(id=17) 
  
# data (as pandas dataframes) 
X = breast_cancer_wisconsin_diagnostic.data.features 
y = breast_cancer_wisconsin_diagnostic.data.targets 
  
# metadata 
print(breast_cancer_wisconsin_diagnostic.metadata) 
  
# variable information 
print(breast_cancer_wisconsin_diagnostic.variables) 


{'uci_id': 17, 'name': 'Breast Cancer Wisconsin (Diagnostic)', 'repository_url': 'https://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic', 'data_url': 'https://archive.ics.uci.edu/static/public/17/data.csv', 'abstract': 'Diagnostic Wisconsin Breast Cancer Database.', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 569, 'num_features': 30, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['Diagnosis'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 1993, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C5DW2B', 'creators': ['William Wolberg', 'Olvi Mangasarian', 'Nick Street', 'W. Street'], 'intro_paper': {'title': 'Nuclear feature extraction for breast tumor diagnosis', 'authors': 'W. Street, W. Wolberg, O. Mangasarian', 'published_in': 'Electronic imaging', 'year': 1993, 'url': 'https://www.semanticscholar.org/paper/53

In [3]:
import pandas as pd
import numpy as np

# Load the dataset
data_url = 'https://archive.ics.uci.edu/static/public/17/data.csv'
df = pd.read_csv(data_url)

# Preprocessing
# Convert 'M' to 1 and 'B' to 0 in the Diagnosis column for binary classification
df['Diagnosis'] = df['Diagnosis'].map({'M': 1, 'B': 0})

# Drop the 'ID' column as it is not a feature
df = df.drop(columns=['ID'])

# Split the dataset into features and target
X = df.drop(columns=['Diagnosis']).values
y = df['Diagnosis'].values


In [4]:
# Assuming a 70-30 train-test split
np.random.seed(0)  # For reproducibility
msk = np.random.rand(len(df)) < 0.7
X_train = X[msk]
y_train = y[msk]
X_test = X[~msk]
y_test = y[~msk]


In [8]:
class GaussianNaiveBayes:
    def fit(self, X, y):
        self.classes = np.unique(y)
        self.parameters = {}
        for c in self.classes:
            X_c = X[y == c]
            self.parameters[c] = {
                'mean': X_c.mean(axis=0),
                'var': X_c.var(axis=0),
                'prior': X_c.shape[0] / X.shape[0]
            }
    
    def calculate_likelihood(self, X, mean, var):
        # Adding a small constant to the variance to prevent divide by zero
        small_constant = 1e-6
        adjusted_var = var + small_constant
        exponent = np.exp(-(X - mean) ** 2 / (2 * adjusted_var))
        return (1 / np.sqrt(2 * np.pi * adjusted_var)) * exponent
    
    def calculate_posterior(self, X):
        posteriors = []
        for c in self.classes:
            prior = np.log(self.parameters[c]['prior'])
            likelihood = self.calculate_likelihood(X, self.parameters[c]['mean'], self.parameters[c]['var'])
            # Replace zeros in likelihood with a very small number before taking log
            likelihood[likelihood == 0] = 1e-9
            posterior = np.sum(np.log(likelihood), axis=1) + prior
            posteriors.append(posterior)
        return self.classes[np.argmax(posteriors, axis=0)]
    
    def predict(self, X):
        return self.calculate_posterior(X)

# Assume X_train, y_train are prepared and are numeric arrays
model = GaussianNaiveBayes()

# Train the model
model.fit(X_train, y_train)

# Predictions
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)


In [9]:
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

# Calculate accuracies
train_accuracy = accuracy(y_train, y_train_pred)
test_accuracy = accuracy(y_test, y_test_pred)

print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)


Training Accuracy: 0.9318734793187348
Test Accuracy: 0.9177215189873418


In [13]:
# Feature scaling function
def standardize_data(X):
    return (X - np.mean(X, axis=0)) / np.std(X, axis=0)

X_train_scaled = standardize_data(X_train)
X_test_scaled = standardize_data(X_test)

# Stable sigmoid function
def sigmoid(z):
    z = np.clip(z, -500, 500)  # Clipping z to prevent overflow
    return 1 / (1 + np.exp(-z))

# Logistic Regression implementation
def logistic_regression(X, y, learning_rate=0.01, max_iter=1000):
    weights = np.zeros(X.shape[1])
    for _ in range(max_iter):
        z = np.dot(X, weights)
        predictions = sigmoid(z)
        errors = y - predictions
        gradient = np.dot(X.T, errors) / len(X)
        weights += learning_rate * gradient
    return weights

def predict_logistic(X, weights):
    probabilities = sigmoid(np.dot(X, weights))
    return (probabilities >= 0.5).astype(int)

# Train the logistic regression model
weights_lr = logistic_regression(X_train_scaled, y_train, learning_rate=0.001, max_iter=1000)

# Make predictions
y_train_pred_lr = predict_logistic(X_train_scaled, weights_lr)
y_test_pred_lr = predict_logistic(X_test_scaled, weights_lr)

# Calculate accuracies
def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)

train_accuracy_lr = accuracy(y_train, y_train_pred_lr)
test_accuracy_lr = accuracy(y_test, y_test_pred_lr)

print("Training Accuracy for Logistic Regression:", train_accuracy_lr)
print("Test Accuracy for Logistic Regression:", test_accuracy_lr)

Training Accuracy for Logistic Regression: 0.948905109489051
Test Accuracy for Logistic Regression: 0.9493670886075949
