In [3]:
import pandas as pd
from sklearn.datasets import load_breast_cancer

In [4]:
cancer_data = load_breast_cancer()

In [5]:
print(cancer_data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [6]:
features = pd.DataFrame(cancer_data.data, columns=cancer_data.feature_names)
target = pd.DataFrame(cancer_data.target, columns=['Target'])

In [68]:
features.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [69]:
target.head()

Unnamed: 0,Target
0,0
1,0
2,0
3,0
4,0


In [70]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, classification_report

In [71]:
X_train, X_test, y_train, y_test = train_test_split(features, target, random_state=10)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(426, 30)
(143, 30)
(426, 1)
(143, 1)


In [72]:
my_logreg_model = LogisticRegression().fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [54]:
my_logreg_preds_train = my_logreg_model.predict(X_train)
my_logreg_preds_test = my_logreg_model.predict(X_test)

print('Accuracy on Train set : ',accuracy_score(y_train, my_logreg_preds_train))
print('Accuracy on Test set : ',accuracy_score(y_test, my_logreg_preds_test))
      

Accuracy on Train set :  0.9436619718309859
Accuracy on Test set :  0.9300699300699301


In [48]:
print('Precision on Train set : ',precision_score(y_train, my_logreg_preds_train))
print('Precision on Test set : ',precision_score(y_test, my_logreg_preds_test))

print('\n')

print('Recall on Train set : ',recall_score(y_train, my_logreg_preds_train))
print('Recall on Test set : ',recall_score(y_test, my_logreg_preds_test))

print('\n')

print('F1-Score on Train set : ',f1_score(y_train, my_logreg_preds_train))
print('F1-Score on Test set : ',f1_score(y_test, my_logreg_preds_test))

Precision on Train set :  0.9481481481481482
Precision on Test set :  0.9550561797752809


Recall on Train set :  0.9624060150375939
Recall on Test set :  0.9340659340659341


F1-Score on Train set :  0.955223880597015
F1-Score on Test set :  0.9444444444444444


In [47]:
print('Classification Report on Train set : ')
print(classification_report(y_train, my_logreg_preds_train))

Classification Report on Train set : 
              precision    recall  f1-score   support

           0       0.94      0.91      0.92       160
           1       0.95      0.96      0.96       266

    accuracy                           0.94       426
   macro avg       0.94      0.94      0.94       426
weighted avg       0.94      0.94      0.94       426



In [46]:
print('Classification Report on Test set : ')
print(classification_report(y_test, my_logreg_preds_test))

Classification Report on Test set : 
              precision    recall  f1-score   support

           0       0.89      0.92      0.91        52
           1       0.96      0.93      0.94        91

    accuracy                           0.93       143
   macro avg       0.92      0.93      0.93       143
weighted avg       0.93      0.93      0.93       143



In [17]:
preds = my_logreg_model.predict(X_test)

In [18]:
from sklearn.metrics import accuracy_score,confusion_matrix

In [19]:
print(accuracy_score(y_test, preds))

0.9300699300699301


In [20]:
print(confusion_matrix(y_test, preds))

[[48  4]
 [ 6 85]]


# Implementing Scalling ( mean Perimeter, mean Area, Worst perimeter)

In [34]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [35]:
mean_perimeter_std_scaler = StandardScaler()
mean_area_std_scaler = StandardScaler()
#worst_perimeter_std_scaler = StandardScaler()

In [37]:
#Scalling Training Dataset
X_train['mean perimeter'] = mean_perimeter_std_scaler.fit_transform(X_train[['mean perimeter']])
X_train['mean area'] = mean_area_std_scaler.fit_transform(X_train[['mean area']])
X_train

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
327,12.030,17.93,-0.662288,-0.597596,0.07683,0.03892,0.001546,0.005592,0.1382,0.06070,...,13.07,22.25,82.74,523.4,0.1013,0.07390,0.007732,0.02796,0.2171,0.07037
60,10.170,14.88,-1.128349,-0.968298,0.11340,0.08061,0.010840,0.012900,0.2743,0.06960,...,11.02,17.45,69.86,368.6,0.1275,0.09866,0.021680,0.02579,0.3557,0.08020
260,20.310,27.06,1.632077,1.730002,0.10000,0.10880,0.151900,0.093330,0.1814,0.05572,...,24.33,39.16,162.30,1844.0,0.1522,0.29450,0.378800,0.16970,0.3151,0.07999
504,9.268,12.87,-1.251932,-1.143006,0.16340,0.22390,0.097300,0.052520,0.2378,0.09502,...,10.28,16.38,69.05,300.2,0.1902,0.34410,0.209900,0.10250,0.3038,0.12520
544,13.870,20.70,-0.109799,-0.213902,0.09578,0.10180,0.036880,0.023690,0.1620,0.06688,...,15.05,24.75,99.17,688.6,0.1264,0.20370,0.137700,0.06845,0.2249,0.08492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,22.010,21.90,2.209605,2.266290,0.10630,0.19540,0.244800,0.150100,0.1824,0.06140,...,27.66,25.80,195.00,2227.0,0.1294,0.38850,0.475600,0.24320,0.2741,0.08574
320,10.250,16.18,-1.048788,-0.934296,0.10610,0.11110,0.067260,0.039650,0.1743,0.07279,...,11.28,20.61,71.53,390.4,0.1402,0.23600,0.189800,0.09744,0.2608,0.09702
527,12.340,12.27,-0.547186,-0.535398,0.09003,0.06307,0.029580,0.026470,0.1689,0.05808,...,13.61,19.27,87.22,564.9,0.1292,0.20740,0.179100,0.10700,0.3110,0.07592
125,13.850,17.21,-0.163513,-0.203121,0.08785,0.06136,0.014200,0.011410,0.1614,0.05890,...,15.49,23.58,100.30,725.9,0.1157,0.13500,0.081150,0.05104,0.2364,0.07182


In [38]:
# Modeling with scalling
logreg = LogisticRegression()

In [39]:
# Fitting with scalling
logreg.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression()

In [40]:
# Getting coefficient of the line
logreg.coef_, logreg.intercept_

(array([[ 1.88035175e+00,  2.16867779e-01, -9.37240153e-01,
         -4.96167228e-01, -7.43867123e-02, -2.61727101e-01,
         -4.15298069e-01, -1.91892855e-01, -7.69867516e-02,
         -1.51916360e-02, -1.15042456e-02,  8.75641519e-01,
          1.90046927e-01, -1.19331435e-01, -5.08509391e-03,
         -8.84313698e-03, -4.52651135e-02, -2.19060895e-02,
         -3.39682112e-03,  1.79123730e-03,  1.17750216e+00,
         -4.64252127e-01, -1.26805549e-01, -2.65013222e-02,
         -1.47111085e-01, -8.52090651e-01, -1.12475098e+00,
         -4.01213014e-01, -2.90493517e-01, -9.29728602e-02]]),
 array([0.40248753]))

In [41]:
# Scalling Test Dataset
X_test['mean perimeter'] = mean_perimeter_std_scaler.fit_transform(X_test[['mean perimeter']])
X_test['mean area'] = mean_area_std_scaler.fit_transform(X_test[['mean area']])
X_test

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
172,15.460,11.89,0.531938,0.325664,0.12570,0.15550,0.203200,0.109700,0.1966,0.07069,...,18.790,17.04,125.00,1102.0,0.1531,0.35830,0.58300,0.18270,0.3216,0.10100
553,9.333,21.94,-1.383281,-1.159092,0.09240,0.05605,0.039960,0.012820,0.1692,0.06576,...,9.845,25.05,62.86,295.8,0.1103,0.08298,0.07993,0.02564,0.2435,0.07393
374,13.690,16.07,-0.113662,-0.169778,0.08302,0.06374,0.025560,0.020310,0.1872,0.05669,...,14.840,20.21,99.16,670.6,0.1105,0.20960,0.13460,0.06987,0.3323,0.07701
370,16.350,23.29,0.818186,0.650621,0.09742,0.14970,0.181100,0.087730,0.2175,0.06218,...,19.380,31.03,129.30,1165.0,0.1415,0.46650,0.70870,0.22480,0.4824,0.09614
419,11.160,21.41,-0.857466,-0.793947,0.10180,0.05978,0.008955,0.010760,0.1615,0.06144,...,12.360,28.92,79.26,458.0,0.1282,0.11080,0.03582,0.04306,0.2976,0.07123
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
545,13.620,23.23,-0.142286,-0.188302,0.09246,0.06747,0.029740,0.024430,0.1664,0.05801,...,15.350,29.09,97.58,729.8,0.1216,0.15170,0.10490,0.07174,0.2642,0.06953
161,19.190,15.94,1.580045,1.644645,0.08694,0.11850,0.119300,0.096670,0.1741,0.05176,...,22.030,17.81,146.60,1495.0,0.1124,0.20160,0.22640,0.17770,0.2443,0.06251
434,14.860,16.94,0.196807,0.127236,0.08924,0.07074,0.033460,0.028770,0.1573,0.05703,...,16.310,20.54,102.30,777.5,0.1218,0.15500,0.12200,0.07971,0.2525,0.06827
59,8.618,11.79,-1.588939,-1.283109,0.09752,0.05272,0.020610,0.007799,0.1683,0.07187,...,9.507,15.40,59.90,274.9,0.1733,0.12390,0.11680,0.04419,0.3220,0.09026


In [42]:
y_pred = logreg.predict(X_test)
y_pred

array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1,
       1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0])

In [44]:
#Getting Metrics with Scalling
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred)

0.9370629370629371

In [45]:
print(metrics.classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.94      0.92        52
           1       0.97      0.93      0.95        91

    accuracy                           0.94       143
   macro avg       0.93      0.94      0.93       143
weighted avg       0.94      0.94      0.94       143



# Implementing PCA

In [21]:
from sklearn.decomposition import PCA

In [75]:
#on 50% of the Data
pca = PCA(n_components=0.8)

In [76]:
pca.fit(X_train)

PCA(n_components=0.8)

In [77]:
pca.explained_variance_ratio_

array([0.98163972])

In [78]:
X_train_transformed = pca.transform(X_train)
X_test_transformed = pca.transform(X_test)

In [79]:
X_train_transformed.shape, X_test_transformed.shape

((426, 1), (143, 1))

In [80]:
pca.inverse_transform(X_train_transformed).shape

(426, 30)

In [29]:
my_logreg_model.fit(X_train_transformed,y_train)

  y = column_or_1d(y, warn=True)


LogisticRegression()

In [30]:
preds = my_logreg_model.predict(X_test_transformed)

In [31]:
print(accuracy_score(y_test, preds))

0.9020979020979021
