In [21]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

load data - breast cancer

In [56]:
df = pd.read_csv('D:/Python/exercise/breast-cancer-wisconsin.data')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 11 columns):
1000025    698 non-null int64
5          698 non-null int64
1          698 non-null int64
1.1        698 non-null int64
1.2        698 non-null int64
2          698 non-null int64
1.3        698 non-null object
3          698 non-null int64
1.4        698 non-null int64
1.5        698 non-null int64
2.1        698 non-null int64
dtypes: int64(10), object(1)
memory usage: 60.1+ KB


In [57]:
df.head()

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4


clean data

In [61]:
df.replace("?",0,inplace=True)
df['1.3'] = df['1.3'].astype(np.int)
x = df.iloc[:,1:-1]
y = df.iloc[:,-1]
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 698 entries, 0 to 697
Data columns (total 9 columns):
5      698 non-null int64
1      698 non-null int64
1.1    698 non-null int64
1.2    698 non-null int64
2      698 non-null int64
1.3    698 non-null int32
3      698 non-null int64
1.4    698 non-null int64
1.5    698 non-null int64
dtypes: int32(1), int64(8)
memory usage: 46.5 KB


In [62]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=2)
print(y.shape)
print(x.shape)

(698,)
(698, 9)


LR

In [65]:
algo = LogisticRegression()
algo.fit(x_train,y_train)
train_predict = algo.predict(x_train)
test_predict = algo.predict(x_test)

#model evaluation
print('algo score on training data:',algo.score(x_train, y_train))
print('accuracy score:',accuracy_score(y_train, train_predict))
print('confusion matrix:',confusion_matrix(y_train, train_predict))
print('classification report:', classification_report(y_train, train_predict))



algo score on training data: 0.9713261648745519
accuracy score: 0.9713261648745519
confusion matrix: [[358   7]
 [  9 184]]
classification report:               precision    recall  f1-score   support

           2       0.98      0.98      0.98       365
           4       0.96      0.95      0.96       193

    accuracy                           0.97       558
   macro avg       0.97      0.97      0.97       558
weighted avg       0.97      0.97      0.97       558



LR-L1、L2 penalty 

In [70]:
algo_l1= LogisticRegression(penalty='l1',solver='saga')
algo_l2= LogisticRegression(penalty='l2',solver='saga')
algo_EN= LogisticRegression(penalty='elasticnet',solver='saga',l1_ratio=0.5) #only 'saga' solver can be used in EN, and add l1_ratio

algo_l1.fit(x_train, y_train)
train_pred1 = algo_l1.predict(x_train)
test_pred1 = algo_l1.predict(x_test)

#model evaluation
print('L1 score on training data:',algo_l1.score(x_train, y_train))
print('L1 accuracy score:',accuracy_score(y_train, train_pred1))
print('L1 confusion matrix:',confusion_matrix(y_train, train_pred1))
print('L1 classification report:', classification_report(y_train, train_pred1))

algo_l2.fit(x_train, y_train)
train_pred2 = algo_l2.predict(x_train)
test_pred2 = algo_l2.predict(x_test)

#model evaluation
print('L2 score on training data:',algo_l2.score(x_train, y_train))
print('L2 accuracy score:',accuracy_score(y_train, train_pred2))
print('L2 confusion matrix:',confusion_matrix(y_train, train_pred2))
print('L2 classification report:', classification_report(y_train, train_pred2))

algo_EN.fit(x_train, y_train)
train_pred3 = algo_EN.predict(x_train)
test_pred3 = algo_EN.predict(x_test)

#model evaluation
print('EN score on training data:',algo_EN.score(x_train, y_train))
print('EN accuracy score:',accuracy_score(y_train, train_pred3))
print('EN confusion matrix:',confusion_matrix(y_train, train_pred3))
print('EN classification report:', classification_report(y_train, train_pred3))


L1 score on training data: 0.9695340501792115
L1 accuracy score: 0.9695340501792115
L1 confusion matrix: [[357   8]
 [  9 184]]
L1 classification report:               precision    recall  f1-score   support

           2       0.98      0.98      0.98       365
           4       0.96      0.95      0.96       193

    accuracy                           0.97       558
   macro avg       0.97      0.97      0.97       558
weighted avg       0.97      0.97      0.97       558

L2 score on training data: 0.9713261648745519
L2 accuracy score: 0.9713261648745519
L2 confusion matrix: [[357   8]
 [  8 185]]
L2 classification report:               precision    recall  f1-score   support

           2       0.98      0.98      0.98       365
           4       0.96      0.96      0.96       193

    accuracy                           0.97       558
   macro avg       0.97      0.97      0.97       558
weighted avg       0.97      0.97      0.97       558

EN score on training data: 0.969534050



solver{‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’
Algorithm to use in the optimization problem.

For SMALL datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.

For multiclass problems, only ‘newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’ handle multinomial loss; ‘liblinear’ is LIMITED to one-versus-rest schemes.

‘newton-cg’, ‘lbfgs’, ‘sag’ and ‘saga’ handle L2 or no penalty

‘liblinear’ and ‘saga’ also handle L1 penalty

‘saga’ also supports ‘elasticnet’ penalty

‘liblinear’ does not support setting penalty='none'