In [None]:
#import the needed libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:

# Load the Wisconsin Diagnostic Breast Cancer (WDBC) data from the UCI repository.
# The raw file has no header row, so column names are supplied explicitly.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

# Ten base measurements, each reported with three summary suffixes
# (mean, se, worst), listed suffix-by-suffix after the id and diagnosis columns.
base_features = ['radius', 'texture', 'perimeter', 'area', 'smoothness',
                 'compactness', 'concavity', 'concave points', 'symmetry',
                 'fractal_dimension']
stat_suffixes = ('mean', 'se', 'worst')

colnames = ['id', 'diagnosis'] + [
    f'{feature}_{suffix}'
    for suffix in stat_suffixes
    for feature in base_features
]
data = pd.read_csv(url, header=None, names=colnames)



In [None]:

# Preprocess the data
# Features are every column except the record id and the diagnosis label.
X = data.drop(columns=['id', 'diagnosis'])

# Encode the label as integers: benign -> 0, malignant -> 1.
# `.map` is used instead of `.replace` because replacing strings with ints on an
# object column relies on implicit downcasting, which pandas has deprecated
# (FutureWarning) and which would otherwise leave `y` as object dtype.
y = data['diagnosis'].map({'B': 0, 'M': 1})
# `.map` turns any label outside the mapping into NaN; fail loudly rather than
# training on a silently corrupted target.
assert y.notna().all(), "diagnosis column contains labels other than 'B' and 'M'"

# Split the data into training and test sets (70% / 30%, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Without Preprocessing
# Fit a logistic regression model on the raw, unscaled features.
# NOTE(review): on unscaled features lbfgs may hit the default iteration limit
# and emit a ConvergenceWarning — confirm whether that is an intended part of
# the comparison or whether max_iter should be raised.
clf = LogisticRegression(solver='lbfgs')
clf.fit(X_train, y_train)

# Predict on both the training and the test set
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

# Calculate the accuracy scores
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

# With Preprocessing
# Standardize the features. The scaler is fitted on the training split only and
# then applied to the test split, so no test-set statistics leak into training.
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


In [None]:

# Train a fresh logistic regression classifier on the standardized training features
clf = LogisticRegression(solver='lbfgs')
clf.fit(X=X_train_std, y=y_train)

In [None]:
# Predict labels for the standardized training and test splits (in that order)
y_pred_train_std, y_pred_test_std = (
    clf.predict(split) for split in (X_train_std, X_test_std)
)

In [None]:
# Accuracy of the standardized-feature model on the training and test splits
acc_train_std = accuracy_score(y_true=y_train, y_pred=y_pred_train_std)
acc_test_std = accuracy_score(y_true=y_test, y_pred=y_pred_test_std)

In [None]:
# Visualize the results: one bar per (model, split) accuracy
objects = ('Without Preprocessing - Train', 'Without Preprocessing - Test',
           'With Preprocessing - Train', 'With Preprocessing - Test')
y_pos = np.arange(len(objects))
performance = [acc_train, acc_test, acc_train_std, acc_test_std]

# Use an explicit, wider figure: the tick labels and title are long and
# overlap on the default-size canvas.
fig, ax = plt.subplots(figsize=(9, 5))
bars = ax.bar(y_pos, performance, align='center', alpha=0.5)

# Rotate and right-align the labels so neighbouring ticks do not collide.
ax.set_xticks(y_pos)
ax.set_xticklabels(objects, rotation=20, ha='right')

# Keep the axis anchored at 0 (no exaggerated differences) with a little
# headroom for the value annotations.
ax.set_ylim(0, 1.05)
ax.set_ylabel('Accuracy')
ax.set_title('Comparison of Logistic Regression Performance with and without Preprocessing')

# Print each accuracy above its bar; the bars alone are hard to tell apart
# when the scores are close to each other.
for bar, acc in zip(bars, performance):
    ax.text(bar.get_x() + bar.get_width() / 2, acc, f'{acc:.3f}',
            ha='center', va='bottom')

# Make room for the rotated labels and the long title.
fig.tight_layout()
plt.show()