In [481]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
import statsmodels.api as sm
from sklearn.datasets import fetch_california_housing
from scipy.stats.mstats import winsorize
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer,PowerTransformer
import sklearn
from sklearn.datasets import load_breast_cancer
from statsmodels.stats.outliers_influence import variance_inflation_factor 
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc

In [452]:
df = load_breast_cancer(as_frame=True)

In [453]:
df = df.frame

In [454]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [455]:
df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [456]:
df.columns

Index(['mean radius', 'mean texture', 'mean perimeter', 'mean area',
       'mean smoothness', 'mean compactness', 'mean concavity',
       'mean concave points', 'mean symmetry', 'mean fractal dimension',
       'radius error', 'texture error', 'perimeter error', 'area error',
       'smoothness error', 'compactness error', 'concavity error',
       'concave points error', 'symmetry error', 'fractal dimension error',
       'worst radius', 'worst texture', 'worst perimeter', 'worst area',
       'worst smoothness', 'worst compactness', 'worst concavity',
       'worst concave points', 'worst symmetry', 'worst fractal dimension',
       'target'],
      dtype='object')

In [457]:
df.columns = [col.replace(' ','_') for col in df.columns]

In [458]:
df.columns

Index(['mean_radius', 'mean_texture', 'mean_perimeter', 'mean_area',
       'mean_smoothness', 'mean_compactness', 'mean_concavity',
       'mean_concave_points', 'mean_symmetry', 'mean_fractal_dimension',
       'radius_error', 'texture_error', 'perimeter_error', 'area_error',
       'smoothness_error', 'compactness_error', 'concavity_error',
       'concave_points_error', 'symmetry_error', 'fractal_dimension_error',
       'worst_radius', 'worst_texture', 'worst_perimeter', 'worst_area',
       'worst_smoothness', 'worst_compactness', 'worst_concavity',
       'worst_concave_points', 'worst_symmetry', 'worst_fractal_dimension',
       'target'],
      dtype='object')

In [459]:
df.corr().round(2)

Unnamed: 0,mean_radius,mean_texture,mean_perimeter,mean_area,mean_smoothness,mean_compactness,mean_concavity,mean_concave_points,mean_symmetry,mean_fractal_dimension,...,worst_texture,worst_perimeter,worst_area,worst_smoothness,worst_compactness,worst_concavity,worst_concave_points,worst_symmetry,worst_fractal_dimension,target
mean_radius,1.0,0.32,1.0,0.99,0.17,0.51,0.68,0.82,0.15,-0.31,...,0.3,0.97,0.94,0.12,0.41,0.53,0.74,0.16,0.01,-0.73
mean_texture,0.32,1.0,0.33,0.32,-0.02,0.24,0.3,0.29,0.07,-0.08,...,0.91,0.36,0.34,0.08,0.28,0.3,0.3,0.11,0.12,-0.42
mean_perimeter,1.0,0.33,1.0,0.99,0.21,0.56,0.72,0.85,0.18,-0.26,...,0.3,0.97,0.94,0.15,0.46,0.56,0.77,0.19,0.05,-0.74
mean_area,0.99,0.32,0.99,1.0,0.18,0.5,0.69,0.82,0.15,-0.28,...,0.29,0.96,0.96,0.12,0.39,0.51,0.72,0.14,0.0,-0.71
mean_smoothness,0.17,-0.02,0.21,0.18,1.0,0.66,0.52,0.55,0.56,0.58,...,0.04,0.24,0.21,0.81,0.47,0.43,0.5,0.39,0.5,-0.36
mean_compactness,0.51,0.24,0.56,0.5,0.66,1.0,0.88,0.83,0.6,0.57,...,0.25,0.59,0.51,0.57,0.87,0.82,0.82,0.51,0.69,-0.6
mean_concavity,0.68,0.3,0.72,0.69,0.52,0.88,1.0,0.92,0.5,0.34,...,0.3,0.73,0.68,0.45,0.75,0.88,0.86,0.41,0.51,-0.7
mean_concave_points,0.82,0.29,0.85,0.82,0.55,0.83,0.92,1.0,0.46,0.17,...,0.29,0.86,0.81,0.45,0.67,0.75,0.91,0.38,0.37,-0.78
mean_symmetry,0.15,0.07,0.18,0.15,0.56,0.6,0.5,0.46,1.0,0.48,...,0.09,0.22,0.18,0.43,0.47,0.43,0.43,0.7,0.44,-0.33
mean_fractal_dimension,-0.31,-0.08,-0.26,-0.28,0.58,0.57,0.34,0.17,0.48,1.0,...,-0.05,-0.21,-0.23,0.5,0.46,0.35,0.18,0.33,0.77,0.01


In [460]:
df.drop(["mean_area","worst_radius","worst_perimeter","worst_area",
         "worst_texture","mean_concavity","concave_points_error",
         "worst_concave_points","worst_compactness","radius_error",
         "area_error","mean_concave_points","worst_smoothness",
         "mean_concave_points","worst_fractal_dimension","mean_compactness",
         "fractal_dimension_error","compactness_error","texture_error",
         "symmetry_error","mean_symmetry","mean_smoothness",
         "mean_perimeter","mean_fractal_dimension","mean_fractal_dimension",
         "mean_radius","mean_texture"],axis=1,inplace=True)

In [461]:
df.head()

Unnamed: 0,perimeter_error,smoothness_error,concavity_error,worst_concavity,worst_symmetry,target
0,8.589,0.006399,0.05373,0.7119,0.4601,0
1,3.398,0.005225,0.0186,0.2416,0.275,0
2,4.585,0.00615,0.03832,0.4504,0.3613,0
3,3.445,0.00911,0.05661,0.6869,0.6638,0
4,5.438,0.01149,0.05688,0.4,0.2364,0


In [462]:
for ind, col in enumerate(df.drop(["target"],axis=1).select_dtypes(np.number).columns.to_list()):
    print(ind,col)

0 perimeter_error
1 smoothness_error
2 concavity_error
3 worst_concavity
4 worst_symmetry


In [463]:
df.corr().round(2)

Unnamed: 0,perimeter_error,smoothness_error,concavity_error,worst_concavity,worst_symmetry,target
perimeter_error,1.0,0.15,0.36,0.42,0.11,-0.56
smoothness_error,0.15,1.0,0.27,-0.06,-0.11,0.07
concavity_error,0.36,0.27,1.0,0.66,0.2,-0.25
worst_concavity,0.42,-0.06,0.66,1.0,0.53,-0.66
worst_symmetry,0.11,-0.11,0.2,0.53,1.0,-0.42
target,-0.56,0.07,-0.25,-0.66,-0.42,1.0


In [464]:
X = df.drop(['target'],axis=1)
y = df['target']

X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.65,random_state=354)
print(f'X_train: {X_train.shape}')
print(f'X_test: {X_test.shape}')
print(f'y_train: {y_train.shape}')
print(f'y_test: {y_test.shape}')

X_train: (369, 5)
X_test: (200, 5)
y_train: (369,)
y_test: (200,)


In [465]:
y_train.unique()

array([0, 1])

In [486]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [487]:
X_train_scaled

array([[ 1.54011032e-01, -5.02708411e-01,  7.98960208e-02,
         1.08041509e+00,  1.12877961e+00],
       [-6.06055105e-01, -1.24164757e+00, -6.31349676e-01,
        -5.33948638e-01, -3.98560595e-01],
       [-3.73576775e-01, -3.48008452e-01, -3.93668328e-01,
        -6.17835372e-01, -3.30964928e-01],
       ...,
       [-4.29307882e-01, -1.13434505e+00,  7.77979509e-02,
        -5.34404544e-01,  4.59260615e-01],
       [-3.65084416e-01,  8.68196540e-01, -9.54752165e-01,
        -1.24744178e+00, -6.85037473e-01],
       [ 1.14856855e-03, -5.09620537e-01,  3.34961376e-01,
         1.41049115e+00,  5.34903386e-01]])

In [488]:
model = sm.Logit(y_train,X_train_scaled).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.190650
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                 target   No. Observations:                  369
Model:                          Logit   Df Residuals:                      364
Method:                           MLE   Df Model:                            4
Date:                Fri, 19 Apr 2024   Pseudo R-squ.:                  0.7110
Time:                        16:15:59   Log-Likelihood:                -70.350
converged:                       True   LL-Null:                       -243.40
Covariance Type:            nonrobust   LLR p-value:                 1.215e-73
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
x1            -3.9826      0.557     -7.154      0.000      -5.074      -2.891
x2            -0.2477      0.

In [489]:
X_train

Unnamed: 0,perimeter_error,smoothness_error,concavity_error,worst_concavity,worst_symmetry
512,3.093,0.005414,0.03452,0.51060,0.3585
298,1.661,0.003169,0.01079,0.15650,0.2636
93,2.099,0.005884,0.01872,0.13810,0.2678
467,2.275,0.010980,0.01031,0.06409,0.3057
433,4.493,0.008074,0.05321,0.39120,0.3007
...,...,...,...,...,...
371,1.338,0.005012,0.01551,0.13620,0.2487
437,2.644,0.007976,0.01608,0.11170,0.2725
551,1.994,0.003495,0.03445,0.15640,0.3169
550,2.115,0.009579,0.00000,0.00000,0.2458


In [470]:
df.isna().sum()

perimeter_error     0
smoothness_error    0
concavity_error     0
worst_concavity     0
worst_symmetry      0
target              0
dtype: int64

In [495]:
X_train

Unnamed: 0,perimeter_error,smoothness_error,concavity_error,worst_concavity,worst_symmetry
512,3.093,0.005414,0.03452,0.51060,0.3585
298,1.661,0.003169,0.01079,0.15650,0.2636
93,2.099,0.005884,0.01872,0.13810,0.2678
467,2.275,0.010980,0.01031,0.06409,0.3057
433,4.493,0.008074,0.05321,0.39120,0.3007
...,...,...,...,...,...
371,1.338,0.005012,0.01551,0.13620,0.2487
437,2.644,0.007976,0.01608,0.11170,0.2725
551,1.994,0.003495,0.03445,0.15640,0.3169
550,2.115,0.009579,0.00000,0.00000,0.2458


In [492]:
vif_data = pd.DataFrame() 
vif_data["feature"] = X_train.columns 
vif_data["VIF"] = [variance_inflation_factor(X_train.values, i) 
                          for i in range(len(X_train.columns))] 

In [473]:
vif_data

Unnamed: 0,feature,VIF
0,perimeter_error,3.837499
1,smoothness_error,6.260772
2,concavity_error,4.201526
3,worst_concavity,7.237967
4,worst_symmetry,9.060941


In [478]:
preds = model.predict(X_test)

In [480]:
preds = np.where(preds >= 0.5,1,0)

In [483]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87        75
           1       0.91      0.94      0.92       125

    accuracy                           0.91       200
   macro avg       0.90      0.89      0.90       200
weighted avg       0.90      0.91      0.90       200



In [485]:
y_test.value_counts()

1    125
0     75
Name: target, dtype: int64