In [9]:
# sklearn.linear_model.LogisticRegression
# class sklearn.linear_model.LogisticRegression(penalty='l2', *, 
# dual=False, tol=0.0001, C=1.0, fit_intercept=True, intercept_scaling=1, 
# class_weight=None, random_state=None, solver='lbfgs', max_iter=100, 
# multi_class='auto', verbose=0, warm_start=False, n_jobs=None, 
# l1_ratio=None)[source]
import numpy as np
import pandas as pd
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# dataset: Probability of passing an exam versus where and hours of study
# if the problem is binary: 1 and 0 (pass and fail)

d = {'location': [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0],
     'hours': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.5],
     'pass': [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]}
HrsStudying = pd.DataFrame(data=d)

X = HrsStudying[["location", "hours"]] 
y = HrsStudying["pass"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)

lr = LogisticRegression(random_state=0, C=1)
lr.fit(X_train, y_train)

# Estimate the accuracy of the classifier on future data, using the test data
print("Train set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))

y_predicted = lr.predict(X_test)
print("confusion matrix:\n", confusion_matrix(y_test, y_predicted))

# Coefficients of the linear model (b_1,b_2,...,b_p): log(p/(1-p)) = b0+b_1x_1+b_2x_2+...+b_px_p
#
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))

Train set score: 0.86
Test set score: 0.83
confusion matrix:
 [[3 0]
 [1 2]]
lr.coef_: [[0.837 1.030]]
lr.intercept_: [-3.566]


In [58]:
# recall kNN
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix

d = {'location': [0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0],
     'hours': [0.5, 0.75, 1, 1.25, 1.5, 1.75, 1.75, 2, 2.25, 2.5, 2.75, 3, 3.25, 3.5, 4, 4.25, 4.5, 4.75, 5, 5.5],
     'pass': [0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1]}

HrsStudying = pd.DataFrame(data=d)

X = HrsStudying[["location", "hours"]] 
y = HrsStudying["pass"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=10)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X, y)

print("accuracy:", clf.score(X_test, y_test)) 
y_predicted = clf.predict(X_test)
print("confusion matrix:\n", confusion_matrix(y_test, y_predicted))


accuracy: 0.6666666666666666
confusion matrix:
 [[2 1]
 [1 2]]


In [86]:
# multiclass the problem is different. if n classes, LogisticRegression will build n classifiers, i.e. n generalized
# linear models. (b_1,b_2,...,b_p): log(p/(1-p)) = b0+b_1x_1+b_2x_2+...+b_px_p

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

fruit_df = pd.read_csv("fruit_data_with_colors.csv")

print(fruit_df.fruit_name.unique())
print(fruit_df.head(3))

X = fruit_df[['mass', 'width', 'height', 'color_score']]
y = fruit_df['fruit_label']

X_train, X_test, y_train, y_test = \
    train_test_split(X, y, stratify=y, random_state=12)  # default .75/.25; try: random_state=1

clf = LogisticRegression(random_state=10, max_iter=10000, C=10)
clf.fit(X_train, y_train)

print("\nTrain set score: {:.2f}".format(clf.score(X_train, y_train)))
print("accuracy for test:", clf.score(X_test, y_test))
from sklearn.metrics import confusion_matrix
y_predicted = clf.predict(X_test)
print("confusion matrix:\n", confusion_matrix(y_test, y_predicted))
print()

#--------- regression coefficients -----------------
# there are 4 classes in the problem. So we see 4 sets of coefficients. For example: for class 0, 
# Assume p be the probability a apple is in class 0. 
print("logit.intercept_: {}".format(clf.intercept_))
print("logit.coef_: \n{}".format(clf.coef_))

['apple' 'mandarin' 'orange' 'lemon']
   fruit_label fruit_name fruit_subtype  mass  width  height  color_score
0            1      apple  granny_smith   192    8.4     7.3         0.55
1            1      apple  granny_smith   180    8.0     6.8         0.59
2            1      apple  granny_smith   176    7.4     7.2         0.60

Train set score: 0.84
accuracy for test: 0.8
confusion matrix:
 [[3 0 2 0]
 [0 1 0 0]
 [1 0 4 0]
 [0 0 0 4]]

logit.intercept_: [-10.139 46.045 -12.999 -22.907]
logit.coef_: 
[[0.153 2.678 -3.491 1.177]
 [-0.424 0.003 -0.037 0.002]
 [0.174 -0.218 -0.501 -0.968]
 [0.096 -2.463 4.029 -0.211]]


In [109]:
from sklearn.preprocessing import StandardScaler

fruit_df = pd.read_csv("fruit_data_with_colors.csv")

print(fruit_df.fruit_name.unique())
print(fruit_df.head(3))

X = fruit_df[['mass', 'width', 'height', 'color_score']]
y = fruit_df['fruit_label']

sc = StandardScaler()
X = sc.fit_transform(X)

X_train_s, X_test_s, y_train, y_test = \
    train_test_split(X, y, stratify=y, random_state=12)  # default .75/.25; try: random_state=1

clf = LogisticRegression(random_state=10, max_iter=10000, C=10)
clf.fit(X_train, y_train)

print("\nTrain set score: {:.2f}".format(clf.score(X_train, y_train)))
print("accuracy for test:", clf.score(X_test, y_test))
from sklearn.metrics import confusion_matrix
y_predicted = clf.predict(X_test)
print("confusion matrix:\n", confusion_matrix(y_test, y_predicted))
print()

#--------- regression coefficients -----------------
# there are 4 classes in the problem. So we see 4 sets of coefficients. For example: for class 0, 
# Assume p be the probability a apple is in class 0. 
print("logit.intercept_: {}".format(clf.intercept_))
print("logit.coef_: \n{}".format(clf.coef_))

['apple' 'mandarin' 'orange' 'lemon']
   fruit_label fruit_name fruit_subtype  mass  width  height  color_score
0            1      apple  granny_smith   192    8.4     7.3         0.55
1            1      apple  granny_smith   180    8.0     6.8         0.59
2            1      apple  granny_smith   176    7.4     7.2         0.60

Train set score: 0.48
accuracy for test: 0.6
confusion matrix:
 [[3 0 2 0]
 [1 0 0 0]
 [3 0 2 0]
 [0 0 0 4]]

logit.intercept_: [0.358 -0.920 0.359 0.203]
logit.coef_: 
[[0.006 0.065 -0.031 0.027]
 [-0.052 -0.054 -0.091 0.015]
 [0.062 0.064 0.019 0.018]
 [-0.016 -0.075 0.103 -0.061]]
