## Creating a Logistic Regression Model
4/26/25

In [3]:
# Import necessary libraries
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [5]:
# Load dataset
# The path is relative, so it is resolved against the kernel's current working directory.
data = pd.read_csv("../../data/raw/dataset.csv")

# Verify the data is loaded correctly
print(data.head())  # Display the first few rows of the dataset
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called bare; wrapping it in print() appended a stray "None" line to the output.
data.info()

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  compactness_mean  concavity_mean  concave points_mean  \
0          0.11840           0.27760          0.3001              0.14710   
1          0.08474           0.07864          0.0869              0.07017   
2          0.10960           0.15990          0.1974              0.12790   
3          0.14250           0.28390          0.2414              0.10520   
4          0.10030           0.13280          0.1980              0.10430   

   ...  radius_worst  texture_worst  perimeter_worst  area_wor

In [6]:
# Preprocess data
# Encode the target column: 'B' -> 0, 'M' -> 1.
diagnosis_codes = {'B': 0, 'M': 1}
# Series.map turns every value that is not a key of the dict into NaN. Mapping a
# column that a previous run of this cell already encoded would therefore wipe
# the whole target, so the mapping is applied only while the column does not
# yet consist solely of the numeric codes.
if not data['diagnosis'].isin(list(diagnosis_codes.values())).all():
    data['diagnosis'] = data['diagnosis'].map(diagnosis_codes)
# Stop here with a clear message instead of carrying NaN targets into later cells.
assert data['diagnosis'].notna().all(), "diagnosis contains labels other than 'B'/'M'"

# Define features (X) and target (y)
# Drop the target and the id column from the feature matrix.
X = data.drop(['diagnosis', 'id'], axis=1)
y = data['diagnosis']


In [7]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=21,
)

# Report the dimensions of each piece as a quick sanity check.
for caption, subset in (
    ("X_train shape:", X_train),
    ("X_test shape:", X_test),
    ("y_train shape:", y_train),
    ("y_test shape:", y_test),
):
    print(caption, subset.shape)

X_train shape: (455, 30)
X_test shape: (114, 30)
y_train shape: (455,)
y_test shape: (114,)


In [11]:
# Show the held-out feature rows (pandas abbreviates long frames when printed).
print(X_test)

# Write the held-out labels to a CSV file in the current working directory.
y_test.to_csv(path_or_buf="./test.csv")

     radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  \
550       10.860         21.48           68.51      360.5          0.07431   
147       14.950         18.77           97.84      689.5          0.08138   
4         20.290         14.34          135.10     1297.0          0.10030   
77        18.050         16.15          120.20     1006.0          0.10650   
339       23.510         24.27          155.10     1747.0          0.10690   
..           ...           ...             ...        ...              ...   
212       28.110         18.47          188.50     2499.0          0.11420   
446       17.750         28.03          117.30      981.6          0.09997   
529       12.070         13.44           77.83      445.2          0.11000   
175        8.671         14.45           54.42      227.2          0.09138   
172       15.460         11.89          102.50      736.9          0.12570   

     compactness_mean  concavity_mean  concave points_mean  sym

In [5]:
# Standardize the features (zero mean, unit standard deviation per column).
# The scaler learns its statistics from the training rows only and then reuses
# them for the test rows; the tuple is evaluated left to right, so the fit
# happens before the test transform.
scaler = StandardScaler()
X_train_scaled, X_test_scaled = (
    scaler.fit_transform(X_train),
    scaler.transform(X_test),
)

In [6]:
# Set up the logistic regression classifier.
# max_iter is raised from scikit-learn's default of 100 because the solver was
# stopping at the default cap before converging.
# NOTE(review): the recorded output of the training cell still reports the
# iteration limit being reached at this setting.
logreg_options = {"max_iter": 1000}
model = LogisticRegression(**logreg_options)

In [7]:
# Fit the classifier on the training split.
# NOTE(review): this trains on the raw X_train; X_train_scaled from the scaling
# cell is never used, which is what the solver's "scale the data" warning in
# this cell's recorded output points at. Switching to the scaled matrix requires
# every predict call (on model and on loaded_model) to receive X_test_scaled too.
model.fit(X=X_train, y=y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [8]:
# Predict a class label (0 or 1, per the diagnosis encoding) for every held-out row.
y_pred = model.predict(X=X_test)

In [9]:
# Score the predictions against the true test labels with three metrics, in
# order: overall accuracy, confusion matrix, per-class text report.
accuracy, conf_matrix, class_report = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
)

In [10]:
# Display each metric under its heading; the multi-line results start on a new line.
for heading, result in (
    ("Accuracy:", accuracy),
    ("Confusion Matrix:\n", conf_matrix),
    ("Classification Report:\n", class_report),
):
    print(heading, result)

Accuracy: 0.9385964912280702
Confusion Matrix:
 [[73  2]
 [ 5 34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95        75
           1       0.94      0.87      0.91        39

    accuracy                           0.94       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



In [11]:
# Rank the features by the magnitude of their logistic-regression coefficient.
# NOTE(review): the model is fit on the unscaled X_train, so coefficient sizes
# also reflect each feature's units; treat this ranking with care.

# Column labels of the feature matrix, in the order the model saw them
feature_names = X.columns
# Weights of the single fitted decision function, shape (n_features,)
coefficients = model.coef_[0]

# One row per feature: name, signed weight, and absolute weight (used for sorting)
importance_columns = {
    'feature': feature_names,
    'coefficient': coefficients,
    'abs_coefficient': np.abs(coefficients),
}

# Strongest weights first
feature_importance = pd.DataFrame(importance_columns).sort_values(
    by='abs_coefficient',
    ascending=False,
)

print(feature_importance)

                    feature  coefficient  abs_coefficient
0               radius_mean    -2.125542         2.125542
26          concavity_worst     1.431230         1.431230
11               texture_se    -0.976911         0.976911
28           symmetry_worst     0.944582         0.944582
20             radius_worst    -0.906296         0.906296
25        compactness_worst     0.880785         0.880785
27     concave points_worst     0.778917         0.778917
6            concavity_mean     0.709375         0.709375
24         smoothness_worst     0.498740         0.498740
12             perimeter_se     0.488611         0.488611
7       concave points_mean     0.435150         0.435150
5          compactness_mean     0.411162         0.411162
21            texture_worst     0.365716         0.365716
4           smoothness_mean     0.279781         0.279781
8             symmetry_mean     0.277338         0.277338
15           compactness_se    -0.159921         0.159921
1             

## Saving the Trained Model
4/26/25


In [None]:
# Save the trained model to a file
model_path = Path('../../models/logistic_regression_model.pkl')
# Create the target folder when it is missing; without it open() raises
# FileNotFoundError (e.g. on a fresh checkout where models/ does not exist yet).
model_path.parent.mkdir(parents=True, exist_ok=True)
# Serialize the fitted estimator in binary mode.
with open(model_path, 'wb') as file:
    pickle.dump(model, file)


In [None]:

# Read the pickled estimator back from disk.
# NOTE: unpickling can execute code embedded in the file, so only load pickles
# from a trusted source (this path is the one the save cell writes to).
with open('../../models/logistic_regression_model.pkl', mode='rb') as model_file:
    loaded_model = pickle.load(model_file)


In [14]:

# Run the reloaded estimator on the same held-out rows used for the in-memory model.
loaded_y_pred = loaded_model.predict(X=X_test)


In [15]:

# Score the reloaded model's predictions with the same three metrics used for
# the in-memory model: accuracy, confusion matrix, per-class text report.
loaded_accuracy, loaded_conf_matrix, loaded_class_report = [
    metric(y_test, loaded_y_pred)
    for metric in (accuracy_score, confusion_matrix, classification_report)
]


In [16]:

# Display the reloaded model's metrics so they can be compared with the
# figures printed for the in-memory model.
loaded_results = (
    ("Loaded Model Accuracy:", loaded_accuracy),
    ("Loaded Model Confusion Matrix:\n", loaded_conf_matrix),
    ("Loaded Model Classification Report:\n", loaded_class_report),
)
for heading, result in loaded_results:
    print(heading, result)


Loaded Model Accuracy: 0.9385964912280702
Loaded Model Confusion Matrix:
 [[73  2]
 [ 5 34]]
Loaded Model Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.97      0.95        75
           1       0.94      0.87      0.91        39

    accuracy                           0.94       114
   macro avg       0.94      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114

