In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('/content/breast_cancer.csv')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 683 entries, 0 to 682
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   Sample code number           683 non-null    int64
 1   Clump Thickness              683 non-null    int64
 2   Uniformity of Cell Size      683 non-null    int64
 3   Uniformity of Cell Shape     683 non-null    int64
 4   Marginal Adhesion            683 non-null    int64
 5   Single Epithelial Cell Size  683 non-null    int64
 6   Bare Nuclei                  683 non-null    int64
 7   Bland Chromatin              683 non-null    int64
 8   Normal Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(11)
memory usage: 58.8 KB


In [6]:
df.shape

(683, 11)

In [7]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [9]:
class_counts = df['Class'].value_counts()

print("Class Counts:")
print(class_counts)
if len(class_counts) == 2 and min(class_counts) / max(class_counts) < 0.2:
    print("Data is imbalanced.")
else:
    print("Data is not imbalanced.")

Class Counts:
2    444
4    239
Name: Class, dtype: int64
Data is not imbalanced.


In [10]:
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]

In [15]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=0)

In [19]:
X_train

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
109,8,6,4,3,5,9,3,1,1
477,4,1,1,1,1,1,2,1,1
205,8,10,8,8,4,8,7,7,1
318,5,2,2,2,2,1,2,2,1
190,8,4,4,5,4,7,7,8,2
...,...,...,...,...,...,...,...,...,...
9,4,2,1,1,2,1,2,1,1
359,5,1,1,2,2,1,2,1,1
192,1,1,1,1,2,1,1,1,1
629,3,1,1,1,2,1,2,1,1


In [23]:
from sklearn.linear_model import LogisticRegression
lr_classifier = LogisticRegression(max_iter=1000)

In [None]:
from sklearn.model_selection import GridSearchCV
param_grid = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear', 'saga']
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(lr_classifier, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

In [27]:
lr_classifier = LogisticRegression(**best_params)
lr_classifier.fit(X_train,y_train)



In [30]:
y_pred = lr_classifier.predict(X_test)
print(y_pred)

[2 2 4 4 2 2 2 4 2 2 4 2 4 2 2 2 4 4 4 2 2 2 4 2 4 4 2 2 2 4 2 4 4 2 2 2 4
 4 2 4 2 2 2 2 2 2 2 4 2 2 4 2 4 2 2 2 4 4 2 4 2 2 2 2 2 2 2 2 4 4 2 2 2 2
 2 2 4 2 2 2 4 2 4 2 2 4 2 4 4 2 4 2 4 2 2 4 4 4 2 2 2 2 4 4 2 2 4 2 2 2 4
 2 2 4 2 2 2 2 2 2 2 4 2 2 4 4 2 4 2 4 2 2 4 2 2 4 2 4 2 2 2 2 2 2 2 4 4 2
 4 2 4 2 2 2 2 2 4 4 2 4 4 4 4 2 4 2 2 2 2 2 2 4 4 4 2 2 2 4 2 2 4 2 2 4 2
 2 4 4 2 2 2 2 2 2 2 2 2 2 4 2 2 2 2 2 4]


In [37]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

print("Accuracy Score : ",accuracy_score(y_pred,y_test))
print("Confusion Matrix : ")
print(confusion_matrix(y_pred,y_test))

Accuracy Score :  0.9365853658536586
Confusion Matrix : 
[[126   9]
 [  4  66]]


In [None]:
from sklearn.model_selection import cross_val_score
score = cross_val_score(estimator=lr_classifier,X = X_train,y = y_train,cv=10,scoring='accuracy')

In [45]:
print("Mean CV Score:", score.mean())
print("Standard Deviation of CV Scores:", score.std())

Mean CV Score: 0.9748670212765959
Standard Deviation of CV Scores: 0.01260619906400439
