In [17]:
import numpy as np
import pandas as pd
from scipy.optimize import fmin_tnc
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

In [18]:
# Load the payment-fraud records from CSV and one-hot encode the
# categorical 'paymentMethod' column into dummy indicator columns
df = pd.get_dummies(
    pd.read_csv('../chapter2/datasets/payment_fraud.csv'),
    columns=['paymentMethod'])

In [19]:
# Hold out 33% of the rows as a test set; the fixed random_state keeps the
# split reproducible. 'label' is the target, every other column is a feature.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(['label'], axis=1),
    df['label'],
    random_state=17,
    test_size=0.33,
)

In [20]:
# Build a logistic regression classifier with default settings and fit it
# on the training split
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict labels for the held-out test rows
y_pred = clf.predict(X_test)

In [21]:
# Compare test set predictions with ground truth labels.
# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric so its value is unaffected, but confusion_matrix is not, so
# both calls use the same, documented argument order.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

0.99992273816
[[12753     0]
 [    1   189]]


### Inspecting the trained Logistic Regression model coefficients & intercept

In [22]:
# Fitted weights, one per feature column (the intercept is stored separately)
print(clf.coef_)

[[-7.44949492  0.26692309  1.39595031 -1.44011704  1.41274547  1.32026309
   0.20373255]]


In [23]:
# Fitted intercept (bias) term of the model
print(clf.intercept_)

[ 2.93674111]


In [24]:
# Number of iterations the solver ran while fitting
print(clf.n_iter_)

[19]


### Let's go under the hood and implement Logistic Regression ourselves

In [25]:
# Logistic function, also known as the sigmoid function
def logistic(x):
    """Return the element-wise logistic (sigmoid) of ``x``: 1 / (1 + e^-x).

    Evaluated in a numerically stable way so that large-magnitude inputs
    do not trigger overflow warnings.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s); converted to a float array.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Values in [0, 1] with the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)

    # exp(-|x|) is always <= 1, so it can never overflow. The naive form
    # 1 / (1 + exp(-x)) overflows (RuntimeWarning) for large negative x.
    decay = np.exp(-np.abs(x))

    # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- the same function,
    # written so that only the non-overflowing exponential is used.
    result = np.where(x >= 0, 1.0, decay) / (1.0 + decay)

    # Indexing with () turns a 0-d result into a scalar and leaves
    # n-d arrays untouched.
    return result[()]

In [26]:
# Logistic regression cost function
def cost(theta, X, y):
    """Mean binary cross-entropy (negative log-likelihood) of the model.

    Parameters
    ----------
    theta : numpy.ndarray, shape (n_params,)
        Current parameter vector.
    X : pandas.DataFrame or array-like, shape (n_samples, n_params)
        Feature matrix.
    y : pandas.Series or array-like, shape (n_samples,)
        Labels, multiplied directly into the log-probability terms
        (expected to be 0 or 1).

    Returns
    -------
    float
        Sum of the per-sample losses divided by the number of samples.
    """
    # Coerce to float arrays. A DataFrame that mixes numeric and boolean
    # (one-hot) columns yields an object-dtype array from `.values`, which
    # NumPy ufuncs such as np.exp cannot process. np.asarray also lets
    # plain arrays be passed in.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Predicted probability of the positive class, computed once
    prob_one = logistic(np.dot(X, theta))

    # Note that we clip the minimum values to slightly above
    # zero so the logarithm never receives an exact zero
    log_prob_one = np.log(prob_one.clip(min=1e-10))
    log_prob_zero = np.log((1 - prob_one).clip(min=1e-10))

    # Negative log-likelihood of each sample under its true label
    per_sample_loss = -y * log_prob_one - (1 - y) * log_prob_zero

    # Summation across all the samples, then taking the mean
    return np.sum(per_sample_loss) / len(X)

In [27]:
# Logistic regression gradient function
def gradient(theta, X, y):
    """Gradient of the mean cross-entropy cost with respect to ``theta``.

    Parameters
    ----------
    theta : numpy.ndarray, shape (n_params,)
        Current parameter vector.
    X : pandas.DataFrame or array-like, shape (n_samples, n_params)
        Feature matrix.
    y : pandas.Series or array-like, shape (n_samples,)
        Labels subtracted from the predicted probabilities.

    Returns
    -------
    numpy.ndarray, shape (n_params,)
        Entry ``i`` is ``sum(err * X[:, i]) / n_samples``.
    """
    # Coerce to float arrays. `.values` on a DataFrame that mixes numeric
    # and boolean columns gives an object array that np.exp cannot process.
    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)

    # Prediction error for every sample
    err = logistic(np.dot(X, theta)) - y

    # X.T @ err computes sum(err * X[:, i]) for every parameter i at once,
    # replacing the explicit per-parameter Python loop
    return np.dot(X.T, err) / len(X)

In [28]:
# Insert a leading column of ones so the intercept is learned as theta[0]
# through the same matrix multiplication as the other coefficients.
# The guard keeps this cell safe to re-run: DataFrame.insert raises
# ValueError when the column already exists.
for _frame in (X_train, X_test):
    if 'ones' not in _frame.columns:
        _frame.insert(0, 'ones', 1)

# Seed for reproducibility
np.random.seed(17)
# One random starting value per column (intercept + features) instead of
# a hard-coded count
theta = np.random.rand(X_train.shape[1])

This is the value of the cost function before optimization (training the Logistic Regression model)

In [29]:
# Cost at the random starting parameters, i.e. before any optimization
cost(theta, X_train, y_train)

20.38085906649756

Through optimization, the parameters of the Logistic Regression model (coefficients and intercept) will be adjusted such that the value of the cost function is minimized

In [30]:
# Minimize the cost with SciPy's truncated Newton (TNC) optimizer, starting
# from the random theta and using the analytic gradient defined above.
# fmin_tnc returns a tuple:
#   (optimized parameters, number of function evaluations, return code)
res = fmin_tnc(func=cost, x0=theta, fprime=gradient,
               args=(X_train, y_train))

  This is separate from the ipykernel package so we can avoid doing imports until


This is the optimized combination of coefficients and intercept that results in a minimized cost function.

In [31]:
# fmin_tnc result tuple: (solution array, function-evaluation count, return code)
res

(array([ 17.9286163 , -26.67080469,   0.58380376,   2.46179901,
         -5.67978642,  10.65851254,  11.48534156,   4.5302039 ]), 38, 0)

After optimization, the cost went down from 20.38 to 3.29

In [None]:
# Cost at the optimized parameters (first element of the fmin_tnc result)
cost(res[0], X_train, y_train)