# Lecture 10: Classification2 Part 2
1. Logistic function
2. Logistic Regression

### 1. Logistic function

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def logistic(z):
    """Evaluate the logistic (sigmoid) function 1 / (1 + exp(-z)) element-wise.

    Parameters
    ----------
    z : scalar or array-like of real numbers
        Input value(s).

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1]; a scalar for scalar input, otherwise an array with
        the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) never exceeds 1, so it cannot overflow. The naive form
    # exp(z) / (1 + exp(z)) evaluates inf / inf = nan once z is above ~709.
    decay = np.exp(-np.abs(z))
    sigma = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return sigma[()]

# Sample the sigmoid on the integers -10..10 and draw it as a red dashed line.
t = np.arange(-10, 11, 1)

plt.plot(t, logistic(t), color='r', linestyle='--')
plt.show()


In [None]:
def logit(p):
    """Return the log-odds log(p / (1 - p)), i.e. the inverse of the logistic function."""
    odds = p / (1 - p)
    return np.log(odds)

# Round-trip check: logit(logistic(t)) should reproduce t, so the blue squares
# are expected to fall on the identity line.
t = np.arange(-10, 11, 1)

plt.plot(t, logit(logistic(t)), color='b', marker='s', linestyle='None')
plt.show()

### 2. Logistic Regression Example 1:
Pima Indians Diabetes Database. You can download the data at https://www.kaggle.com/datasets/uciml/pima-indians-diabetes-database

In [None]:
#import pandas
import pandas as pd
# Column labels assigned to the nine fields of the CSV, in file order.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'pedigree', 'age',
    'label',
]
# load dataset
# header=None tells pandas that the file has no header line.
# NOTE(review): if the downloaded CSV does start with a header row, that row
# will be read as data -- verify against the actual file.
pima = pd.read_csv("pima-indians-diabetes.csv", names=col_names, header=None)

# Preview the first rows.
pima.head()

Split the dataset into features and the target variable

In [None]:
# Predictor columns used by the model ('skin' is not included).
feature_cols = ['pregnant', 'insulin', 'bmi', 'age', 'glucose', 'bp', 'pedigree']
X = pima.loc[:, feature_cols]  # Features
y = pima['label']  # Target variable

Split X and y into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=16, test_size=0.25
)

Training and Prediction

In [None]:
# import the class
from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed seed and an iteration cap of 1000,
# then train it on the training split (fit returns the estimator itself).
logreg = LogisticRegression(max_iter=1000, random_state=16).fit(X_train, y_train)

# Predict class labels for the held-out rows and show them.
y_pred = logreg.predict(X_test)

print(y_pred)

Model Evaluation using Confusion Matrix

In [None]:
from sklearn import metrics

# Cross-tabulate the true test labels against the predicted ones.
cnf_matrix = metrics.confusion_matrix(y_true=y_test, y_pred=y_pred)
cnf_matrix

Visualizing Confusion Matrix using Heatmap

In [None]:
# import required modules
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

class_names = [0, 1]  # name of classes, in the order used by the confusion matrix

# Label the matrix rows/columns with the class names so that seaborn writes
# them on the ticks itself. Ticks set through plt.xticks/plt.yticks before the
# heatmap call are replaced by seaborn and never reach the figure.
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)

fig, ax = plt.subplots()
# create heatmap, drawn explicitly on the axes created above
sns.heatmap(cnf_frame, annot=True, cmap="YlGnBu", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
ax.set_title('Confusion matrix', y=1.1)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Lay the figure out last so the title and axis labels are taken into account.
fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Display names passed to the report for the two classes.
target_names = ['without diabetes', 'with diabetes']
print(
    classification_report(y_true=y_test, y_pred=y_pred, target_names=target_names)
)

ROC Curve and AUC

In [None]:
# Probability estimates: keep only the second column of predict_proba,
# which is used as the score for the ROC curve.
y_pred_proba = logreg.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_true=y_test, y_score=y_pred_proba)
plt.plot(fpr, tpr)
plt.show()


In [None]:
# Area under the ROC curve for the same test labels and scores.
auc = metrics.roc_auc_score(y_true=y_test, y_score=y_pred_proba)
print(auc)

### 2. Logistic Regression Example 2:
Breast Cancer Wisconsin (Original) Data Set. You can download the data at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Original)

In [None]:
# Read the data and discard every row that contains a missing (NaN) value.
# NOTE(review): placeholders such as '?' are not parsed as NaN by default,
# so dropna() would keep those rows -- confirm how the CSV encodes missing data.
bc = pd.read_csv('breast-cancer-wisconsin.csv').dropna()

In [None]:
# Store the label column with the categorical dtype, then summarise it.
bc = bc.astype({'class': 'category'})
bc['class'].describe()

We separate the labels from the rest of the dataset by dropping the appropriate column.

In [None]:
# Everything except 'class' becomes the feature matrix (as raw values);
# the 'class' column alone provides the raw labels.
X = bc.drop(columns=['class']).values
Y_raw = bc['class'].values

It would make our task easier to use '0' and '1' as the labels for our classes instead of the labels used in the original dataset. This can easily be done with LabelEncoder.

In [None]:
from sklearn import preprocessing
# Learn the mapping from the raw class labels to integer codes and apply it
# in a single step.
label_enc = preprocessing.LabelEncoder()
Y = label_enc.fit_transform(Y_raw)

#print(Y_raw)
#print(Y)

We can invert the label encoding later with _label_enc.inverse_transform()_.

In [None]:
# Keep 30% of the samples aside for testing; the seed fixes the shuffle.
XTrain, XTest, YTrain, YTest = train_test_split(
    X, Y, random_state=1, test_size=0.3
)

We are going to use regularisation in our model and we can choose between L1 and L2 penalties. The hyperparameter in this case is implemented as C and it corresponds to the inverse of the regularisation strength. This means that the smaller the value of C, the stronger the penalty.

In [None]:
from sklearn.model_selection import GridSearchCV
import sklearn.model_selection as ms

# Candidate hyperparameters: both penalty types, and C on a log2 grid
# from 2**-5 to 2**9 in steps of two exponents.
pen_val = ['l1', 'l2']
C_val = 2. ** np.arange(-5, 10, step=2)
grid_s = [{'C': C_val, 'penalty': pen_val}]

# The default 'lbfgs' solver does not support the 'l1' penalty, so every l1
# candidate would fail to fit and the search would effectively cover l2 only.
# 'liblinear' handles both penalties and is the solver used for the final
# model that is refit with the best setting.
model = LogisticRegression(max_iter=1000, solver='liblinear')

# 10-fold cross-validation over the whole grid.
cv_logr = GridSearchCV(estimator=model, param_grid=grid_s, cv=ms.KFold(n_splits=10))

cv_logr.fit(XTrain, YTrain)
best_c = cv_logr.best_params_['C']
best_penalty = cv_logr.best_params_['penalty']

print("The best parameters are: cost={0} and penalty={1}".format(best_c, best_penalty))

Read https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html

Run again with the best setting

In [None]:
# Refit on the full training split with the hyperparameters picked by the
# grid search.
b_clf = LogisticRegression(solver='liblinear', penalty=best_penalty, C=best_c)
b_clf.fit(XTrain, YTrain)

# Class probabilities and hard class predictions for the held-out samples.
y_proba = b_clf.predict_proba(XTest)
predict = b_clf.predict(XTest)

The accuracy of the model can be seen with the score method

In [None]:
# Mean accuracy of the refitted model on the held-out test split.
print(b_clf.score(X=XTest, y=YTest))

In [None]:
# ROC curve for the breast-cancer model, scored with the second column of
# the predicted probabilities.
fpr, tpr, threshold = metrics.roc_curve(YTest, y_proba[:, 1])
plt.plot(fpr, tpr)
plt.show()

# Compute the AUC from the same labels and scores as the curve above.
# The earlier version passed y_test / y_pred_proba, which belong to the
# diabetes example, and therefore reported the wrong model's AUC.
auc = metrics.roc_auc_score(YTest, y_proba[:, 1])
print(auc)