## Logistic Regression
 - Logistic regression is a linear model for classification rather than regression
 - Sigmoid function: $p = \frac{1}{(1+e^{-y})}$
 - Linear regression equation: $\hat{y}(w, x) = w_{0} + w_{1}x_{1} + w_{2}x_{2} + ... + w_{p}x_{p}$
 - Applying sigmoid function: $p = \frac{1}{(1+e^{-(w_{0} + w_{1}x_{1} + w_{2}x_{2} + ... + w_{p}x_{p})})}$
 - Types of Logistic Regression
  - Binary Logistic Regression: target variable has only two possible outcomes
  - Multinomial Logistic Regression: Target variable has three or more nominal categories
  - Ordinal Logistic Regression: Target variable has three or more ordinal categories

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

np.set_printoptions(precision=4, suppress=True)

In [None]:
def logistic(x):
    """Evaluate the logistic (sigmoid) function 1 / (1 + e^(-x)).

    Works element-wise on NumPy arrays as well as on plain scalars.
    """
    decay = np.exp(-x)
    return 1 / (1 + decay)

# Evaluate the sigmoid on an evenly spaced grid over [-6, 6].
x = np.linspace(-6, 6, num=100)
sigmoid_values = logistic(x)

plt.plot(x, sigmoid_values)
plt.axhline(y=0.5, color='r', linestyle='--')  # p = 0.5 reference line
plt.grid(True)
plt.title('Logistic (sigmoid)');

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression

from sklearn import datasets

from sklearn import metrics

#### Logistic versus Linear Regression

In [None]:
# Generate a toy binary-classification dataset: one standard-normal feature,
# labelled 1.0 where the feature is positive and 0.0 otherwise.

np.random.seed(123)  # fixed seed so the sample is reproducible

xmin, xmax = -5, 5
n_samples = 100

X = np.random.normal(size=n_samples)
# The builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed.
y = (X > 0).astype(float)

# Noise commented out

# X[X > 0] *= 4
# X += .3 * np.random.normal(size=n_samples)

X = X[:, np.newaxis]  # or X.reshape(-1,1)
X[:10]

In [None]:
y[:10]

In [None]:
# Scatter the binary labels against the single feature column.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], y, color='k')

# The labels are binary, so only 0, 0.5 and 1 are marked on the y-axis.
plt.yticks([0, 0.5, 1])
plt.xticks(range(-5, 5));

In [None]:
# Fit the logistic classifier (C=1e5) on the toy data and show the
# learned slope and intercept.

clf = LogisticRegression(C=1e5, solver='lbfgs').fit(X, y)

print(clf.coef_, clf.intercept_, sep='\n')

In [None]:
# Ordinary least squares on the same data, for comparison with the classifier.
ols = LinearRegression().fit(X, y)

print(ols.coef_, ols.intercept_, sep='\n')

In [None]:
def model(x):
    """Map a linear score ``x`` to a value in (0, 1) with the logistic function."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

# Evenly spaced evaluation grid over [-5, 5].
X_test = np.linspace(-5, 5, num=300)

# Despite its name, `loss` holds the sigmoid of the fitted linear score,
# i.e. the classifier's probability curve over the grid.
linear_score = clf.intercept_ + clf.coef_ * X_test
loss = model(linear_score).ravel()

loss


In [None]:
# Overlay the fitted logistic curve and the OLS line on the training points.
plt.figure(figsize=(8, 6))

# Training points.
plt.scatter(X.ravel(), y, color='k')

# Each curve carries its own label so the legend does not depend on the
# order in which artists were added to the axes.
plt.plot(X_test, loss, color='r', linewidth=3,
         label='Logistic Regression Model')

plt.plot(X_test, ols.coef_ * X_test + ols.intercept_, linewidth=1, color='g',
         label='Linear Regression Model')

# Reference line at 0.5.
plt.axhline(.5, c='gray', ls='--')

plt.ylabel('y')
plt.xlabel('X')
plt.xticks(range(-5, 5))
plt.yticks([0, 0.5, 1])
plt.ylim(-.25, 1.25)
# NOTE(review): the upper x-limit (10) extends past the last tick (4) —
# confirm the wide range is intended for this noise-free plot.
plt.xlim(-4, 10)
plt.legend(loc="lower right", fontsize='small')
plt.tight_layout()

In [None]:
# Generate the toy dataset again, this time stretching the positive samples
# and adding Gaussian jitter to the feature after the labels are assigned.

np.random.seed(123)  # fixed seed so the sample is reproducible

xmin, xmax = -5, 5
n_samples = 100

X = np.random.normal(size=n_samples)
# The builtin `float` replaces the `np.float` alias, which NumPy 1.24 removed.
y = (X > 0).astype(float)

# With Noise

X[X > 0] *= 4  # stretch the positive-class samples
X += .3 * np.random.normal(size=n_samples)  # additive Gaussian jitter

np.min(X), np.max(X)

In [None]:
# Turn the feature vector into a single-column 2-D array.
# reshape(-1, 1) is idempotent: re-running this cell keeps the (n, 1) shape,
# whereas X[:, np.newaxis] would stack one more axis on every execution.
X = X.reshape(-1, 1)
X[:10]

In [None]:
# Scatter the binary labels against the noisy feature column.
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], y, color='k');

In [None]:
# Refit the logistic classifier (C=1e5) on the noisy data and show the
# learned slope and intercept.

clf = LogisticRegression(C=1e5, solver='lbfgs').fit(X, y)

print(clf.coef_, clf.intercept_, sep='\n')

In [None]:
# Ordinary least squares on the noisy data, for comparison with the classifier.
ols = LinearRegression().fit(X, y)

print(ols.coef_, ols.intercept_, sep='\n')

In [None]:
def model(x):
    """Apply the logistic function 1 / (1 + e^(-x)) element-wise to ``x``."""
    return np.reciprocal(1 + np.exp(-x))

# Evenly spaced evaluation grid over [-5, 10].
X_test = np.linspace(-5, 10, num=300)

# Despite its name, `loss` holds the sigmoid of the fitted linear score,
# i.e. the classifier's probability curve over the grid.
linear_score = clf.intercept_ + clf.coef_ * X_test
loss = model(linear_score).ravel()

loss


In [None]:
# Overlay the fitted logistic curve and the OLS line on the noisy training points.
plt.figure(figsize=(8, 6))

# Training points.
plt.scatter(X.ravel(), y, color='k')

# Each curve carries its own label so the legend does not depend on the
# order in which artists were added to the axes.
plt.plot(X_test, loss, color='r', linewidth=3,
         label='Logistic Regression Model')

plt.plot(X_test, ols.coef_ * X_test + ols.intercept_,
         linewidth=1, color='g',
         label='Linear Regression Model')

# Reference line at 0.5.
plt.axhline(.5, c='gray', ls='--')

plt.ylabel('y')
plt.xlabel('X')
plt.xticks(range(-5, 10))
plt.yticks([0, 0.5, 1])
plt.ylim(-.25, 1.25)
plt.xlim(-4, 10)
plt.legend(loc="lower right", fontsize='small')
plt.tight_layout()

## Iris Dataset

In [None]:
iris = datasets.load_iris()

In [None]:
print(iris.DESCR)

In [None]:
# Keep only the first two feature columns; the targets are used unchanged.
X, y = iris.data[:, :2], iris.target

In [None]:
X[:5]

In [None]:
np.unique(y)

### Binary Logistic Regression

In [None]:
# Use only the first two classes

keep = y != 2  # boolean mask selecting the rows labelled 0 or 1
X, y = X[keep], y[keep]

In [None]:
# Binary classifier with the estimator's default settings.
logreg = LogisticRegression().fit(X, y)

In [None]:
logreg.score(X, y)

In [None]:
y_pred = logreg.predict(X)
y_pred

In [None]:
metrics.confusion_matrix(y, y_pred)

In [None]:
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max] by [y_min, y_max].

h = .02  # step size in the mesh

# Pad the observed range of each of the two features by 0.5 on both sides.
x_min, y_min = X[:, :2].min(axis=0) - .5
x_max, y_max = X[:, :2].max(axis=0) + .5

xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                     np.arange(y_min, y_max, h))

# Classify every mesh node, then put the result back on the grid layout
# so it can be drawn as a color plot.
grid_points = np.column_stack([xx.ravel(), yy.ravel()])
Z = logreg.predict(grid_points).reshape(xx.shape)

In [None]:
plt.figure(1, figsize=(10, 5))

# Color each mesh cell by its predicted class.
plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Paired)

# Plot also the training points
plt.scatter(X[:, 0], X[:, 1], c=y, edgecolors='k')

# Clip the view to the extent of the mesh.
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())

plt.xlabel('Sepal length')
plt.ylabel('Sepal width');

### Multinomial Logistic Regression

In [None]:
# Using All the features

In [None]:
# Multinomial logistic regression estimator using the L-BFGS solver.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version before keeping or dropping it.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')

In [None]:
logreg.fit(iris.data, iris.target)

In [None]:
iris_probs = logreg.predict_proba(iris.data)
iris_probs[:5]

In [None]:
iris_pred = logreg.predict(iris.data)
iris_pred[:5]

In [None]:
metrics.confusion_matrix(iris.target, iris_pred)

In [None]:
# One row per sample: rounded class probabilities plus the predicted and
# the true class names; a random dozen rows are displayed.
iris_pred_df = (
    pd.DataFrame(iris_probs, columns=iris.target_names)
    .round(4)
    .assign(
        predicted_class=iris.target_names[iris_pred],
        target_class=iris.target_names[iris.target],
    )
)
iris_pred_df.sample(12)

In [None]:
logreg.score(iris.data, iris.target)

In [None]:
iris_pred_df[iris_pred != iris.target]

In [None]:
print("Accuracy:",metrics.accuracy_score(iris.target, iris_pred))


### Model Validation
 - Divide the dataset into a training set and a test set

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# All features this time; 30% of the rows are held out, with a fixed seed.
X, y = iris.data, iris.target

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
X_train.shape, X_test.shape

In [None]:
# Fresh multinomial estimator (L-BFGS solver) for the train/test experiment.
# NOTE(review): recent scikit-learn releases deprecate the `multi_class`
# argument — confirm against the installed version before keeping or dropping it.
logreg = LogisticRegression(solver='lbfgs', multi_class='multinomial')

In [None]:
logreg.fit(X_train, y_train)

In [None]:
y_pred = logreg.predict(X_test)

In [None]:
logreg.score(X_test, y_test)

In [None]:
metrics.confusion_matrix(y_test, y_pred)

In [None]:
metrics.accuracy_score(y_test, y_pred)

### Case Study - Predicting Credit Card Default
 - https://archive.ics.uci.edu/ml/datasets/default+of+credit+card+clients
 - http://inseaddataanalytics.github.io/INSEADAnalytics/CourseSessions/ClassificationProcessCreditCardDefault.html
 

In [None]:
# Load the credit-card-default table, using the "ID" column as the index.
# NOTE(review): the file is fetched over plain HTTP at run time — consider
# pinning a local copy.
ccd_url = 'http://people.bu.edu/kalathur/datasets/credit_card_default.csv'
ccd = pd.read_csv(ccd_url, index_col="ID")
ccd.head()

In [None]:
len(ccd)

In [None]:
# Lower-case every column label, then shorten the target column's name.
ccd.rename(columns=str.lower, inplace=True)
ccd.rename(columns={'default payment next month': 'default'}, inplace=True)

ccd.head().T

In [None]:
# getting the groups of features

# Six monthly columns each for bill amounts and payment amounts.
bill_amt_features = list(map('bill_amt{}'.format, range(1, 7)))

pay_amt_features = list(map('pay_amt{}'.format, range(1, 7)))

# Credit limit and age come first, then the monthly amount columns.
numerical_features = ['limit_bal', 'age']
numerical_features += bill_amt_features
numerical_features += pay_amt_features

In [None]:
numerical_features

In [None]:
ccd.sex.unique()

In [None]:
ccd.sex.value_counts()

In [None]:
ccd.education.unique()

In [None]:
ccd.education.value_counts()

In [None]:
# Creating some binary features

# (new column, source column, code that maps to 1); every other code maps to 0.
binary_flags = (
    ('male', 'sex', 1),
    ('grad_school', 'education', 1),
    ('university', 'education', 2),
    ('married', 'marriage', 1),
)

for flag, column, code in binary_flags:
    ccd[flag] = ccd[column].eq(code).astype('int')


In [None]:
ccd.head().T

In [None]:
# simplifying pay features

# Names of the six monthly repayment-status columns.
pay_features = list(map('pay_{}'.format, range(1, 7)))

In [None]:
pay_features

In [None]:
ccd['pay_1'].unique()

In [None]:
ccd.loc[ccd['pay_1'] > 0].T

In [None]:
# Replace every non-positive repayment status with 0; positive values are kept.
for x in pay_features:
    ccd[x] = ccd[x].clip(lower=0)

In [None]:
# simplifying delayed features

# One indicator column name per monthly repayment-status column.
delayed_features = list(map('delayed_{}'.format, range(1, 7)))


In [None]:
# delayed_i is 1 where the matching pay_i status is positive, else 0.
for pay, delayed in zip(pay_features, delayed_features):
    ccd[delayed] = ccd[pay].gt(0).astype(int)

In [None]:
# creating a new feature: months delayed
ccd['months_delayed'] = ccd[delayed_features].sum(axis=1)

In [None]:
ccd.head().T

In [None]:
ccd['months_delayed'].value_counts()

#### Splitting the dataset

In [None]:
# Add the engineered 'months_delayed' column to the numerical features.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends the name a second time, which would duplicate the column in X.
if 'months_delayed' not in numerical_features:
    numerical_features = numerical_features + ['months_delayed']
binary_features = ['male', 'married', 'grad_school', 'university']

# Design matrix (numerical columns first, then the binary flags) and target.
X = ccd[numerical_features + binary_features]
y = ccd['default'].astype(int)

In [None]:
X[:10]

In [None]:
y[:10]

In [None]:
# 1. Import the class you will use
from sklearn.preprocessing import StandardScaler

# 2. Create an instance of the class
scaler = StandardScaler()

# 3. Use the fit method of the instance
scaler.fit(X[numerical_features])

# 4. Write the standardised values back into X.
# X was selected out of `ccd`, so take an explicit copy and assign to it
# directly. The previous `X[:][cols] = ...` assigned into a temporary object,
# which is not guaranteed to change X at all.
X = X.copy()
X[numerical_features] = scaler.transform(X[numerical_features])

In [None]:
len(X)

In [None]:
X[:10]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5/30 (one sixth) of the rows for testing, with a fixed seed.
split = train_test_split(X, y, test_size=5/30, random_state=43)
X_train, X_test, y_train, y_test = split

In [None]:
len(X_train), len(y_train)

In [None]:
X_train[:10]

In [None]:
from sklearn.linear_model import LogisticRegression

# Single-feature model: only 'months_delayed', passed as an (n, 1) array.
months_train = X_train['months_delayed'].values.reshape(-1, 1)

logreg = LogisticRegression(C=1e5, solver='lbfgs')

logreg.fit(months_train, y_train)

In [None]:
print("W0: {}, W1: {}".format(logreg.intercept_[0], logreg.coef_[0][0]))

In [None]:
def get_probs(months_delayed):
    """Return the fitted probability of default for raw months-delayed counts.

    The input is standardised with the mean and variance that the global
    ``scaler`` stores for its last feature, then pushed through the sigmoid
    of the global ``logreg`` model's single-feature linear score.
    """
    # 'months_delayed' was appended last to the scaled features, hence [-1].
    mean = scaler.mean_[-1]
    spread = scaler.var_[-1] ** .5
    z = (months_delayed - mean) / spread
    score = logreg.intercept_[0] + logreg.coef_[0][0] * z
    return 1 / (1 + np.exp(-score))

In [None]:
months = np.arange(13)
pred_probs = get_probs(months)
pd.DataFrame({'months': months, 'pred_probs':pred_probs})

In [None]:
plt.plot(months, pred_probs)
plt.xlabel('Months delayed')
plt.ylabel('Probability of default')
plt.grid()

In [None]:
np.unique(y_train, return_counts=True)

In [None]:
y_pred = logreg.predict(X_train['months_delayed'].values.reshape(-1, 1))
np.unique(y_pred, return_counts=True)

In [None]:
accuracy_logreg = metrics.accuracy_score(y_train, y_pred)
accuracy_logreg

In [None]:
metrics.confusion_matrix(y_train, y_pred)

In [None]:
# Using test data

In [None]:
np.unique(y_test, return_counts=True)

In [None]:
y_pred = logreg.predict(X_test['months_delayed'].values.reshape(-1, 1))
np.unique(y_pred, return_counts=True)

In [None]:
metrics.accuracy_score(y_test, y_pred)

In [None]:
metrics.confusion_matrix(y_test, y_pred)