In [22]:
# Import necessary libraries
import numpy as np
import pandas as pd
from scipy.optimize import minimize

# Load the dataset from the locally saved CSV file
df = pd.read_csv('breast_cancer_data.csv')

# Check the first few rows to verify loading
print(df.head())

   mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
0        17.99         10.38          122.80     1001.0          0.11840   
1        20.57         17.77          132.90     1326.0          0.08474   
2        19.69         21.25          130.00     1203.0          0.10960   
3        11.42         20.38           77.58      386.1          0.14250   
4        20.29         14.34          135.10     1297.0          0.10030   

   mean compactness  mean concavity  mean concave points  mean symmetry  \
0           0.27760          0.3001              0.14710         0.2419   
1           0.07864          0.0869              0.07017         0.1812   
2           0.15990          0.1974              0.12790         0.2069   
3           0.28390          0.2414              0.10520         0.2597   
4           0.13280          0.1980              0.10430         0.1809   

   mean fractal dimension  ...  worst texture  worst perimeter  worst area  \
0             

In [23]:
import random

# Custom function to split data manually into training and testing sets
def train_test_split_custom(X, y, test_size=0.2, random_state=None):
    if random_state:
        np.random.seed(random_state)

    # Combine features and labels to shuffle them together
    data = np.hstack((X, y.reshape(-1, 1)))

    # Shuffle the data
    np.random.shuffle(data)

    # Calculate the split index
    split_index = int((1 - test_size) * len(data))

    # Split the data into train and test
    train_data = data[:split_index, :]
    test_data = data[split_index:, :]

    # Split into X_train, X_test, y_train, y_test
    X_train = train_data[:, :-1]
    y_train = train_data[:, -1]
    X_test = test_data[:, :-1]
    y_test = test_data[:, -1]

    return X_train, X_test, y_train, y_test

# Sigmoid function
def sigmoid(z):
    # Clip z to avoid overflow in np.exp()
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))


# Cost function (negative log likelihood)
def cost_function(theta, X, y):
    m = len(y)
    h = sigmoid(X @ theta)
    epsilon = 1e-5
    cost = -(1/m) * (y.T @ np.log(h + epsilon) + (1 - y).T @ np.log(1 - h + epsilon))
    return cost

# Gradient function
def gradient(theta, X, y):
    m = len(y)
    h = sigmoid(X @ theta)
    grad = (1/m) * X.T @ (h - y)
    return grad

# Logistic regression training
def logistic_regression(X, y):
    # Add intercept term to X
    X = np.hstack((np.ones((X.shape[0], 1)), X))

    # Initialize parameters (theta)
    initial_theta = np.zeros(X.shape[1])

    # Minimize the cost function
    result = minimize(fun=cost_function, x0=initial_theta, args=(X, y), method='TNC', jac=gradient)

    return result.x

# Predict function
def predict(X, theta):
    X = np.hstack((np.ones((X.shape[0], 1)), X))  # Add intercept term
    probabilities = sigmoid(X @ theta)
    return probabilities >= 0.5

In [24]:
# Split the data manually
X = df.drop(columns=['target']).values
y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split_custom(X, y, test_size=0.2, random_state=42)

# Train the logistic regression model
theta = logistic_regression(X_train, y_train)

print("Trained model parameters:", theta)

Trained model parameters: [ 2.99271243e+01  1.99234033e+01 -1.03539889e-01  1.08520496e+00
 -2.20351679e-01 -7.08059707e+02  5.24406914e+02 -9.91309496e+01
 -5.00491141e+02  3.53381243e+02 -2.53526423e+00 -9.66264084e+00
 -5.07809775e+00 -5.86854276e+00 -9.37309409e-01 -1.00450669e+02
  8.17273743e+02  1.31844666e+01 -1.09480928e+02  3.18754737e+02
  5.26023759e+01  2.08893540e+00 -1.88051961e+00  6.06282540e-02
 -1.43791531e-01  5.24539923e+01 -1.26860721e+02 -3.20621316e+00
 -3.88481345e+02 -1.12919514e+02 -3.66151510e+02]


In [26]:
# Make predictions on the test set
predictions = predict(X_test, theta)

# Output predictions and actual labels side by side
comparison = pd.DataFrame({
    'Predicted': predictions.astype(int),
    'Actual': y_test.astype(int)
})

# Print the side-by-side comparison
print(comparison.head(20))

# Calculate accuracy
accuracy = np.mean(predictions == y_test)
print(f"\nAccuracy: {accuracy * 100:.2f}%")

    Predicted  Actual
0           0       0
1           0       0
2           1       1
3           0       0
4           0       0
5           1       1
6           1       1
7           1       1
8           0       0
9           1       1
10          0       0
11          0       0
12          0       0
13          1       0
14          1       1
15          1       1
16          0       0
17          1       1
18          1       1
19          0       0

Accuracy: 93.86%
