## R Squared
R^2 indicates how well a given independent variable explains a dependent variable.

Calculating the log likelihood of the fit

In [2]:
from math import log, exp
import pandas as pd

patient_data = pd.read_csv('https://bit.ly/33ebs2R', delimiter=",").itertuples()

b0 = -3.17576395
b1 = 0.69267212
def logistic_function(x):
  p = 1.0 / (1.0 + exp(-(b0 + b1 * x)))
  return p

log_likelihood_fit = 0.0

for p in patient_data:
  if p.y == 1.0:
    log_likelihood_fit += log(logistic_function(p.x))
  elif p.y == 0.0:
    log_likelihood_fit += log(1.0 - logistic_function(p.x))

print(log_likelihood_fit)

-9.946161673231583


## Calculating R^2

In [3]:
import pandas as pd
from math import log, exp
patient_data = list(pd.read_csv('https://bit.ly/33ebs2R', delimiter=",") \
                                .itertuples())
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
  p = 1.0 / (1.0 + exp(-(b0 + b1 * x)))
  return p

# calculate the log likelihood of the fit
log_likelihood_fit = sum(log(logistic_function(p.x)) * p.y +
                         log(1.0 - logistic_function(p.x)) * (1.0 - p.y)
                         for p in patient_data)

# calculate the log likelihood without fit
likelihood = sum(p.y for p in patient_data) / len(patient_data)
log_likelihood = sum(log(likelihood) * p.y + log(1.0 - likelihood) * (1.0 - p.y) \
                     for p in patient_data) # calculate R-Square

r2 = (log_likelihood - log_likelihood_fit) / log_likelihood
print(r2)

0.306456105756576


## P Values
Chi-square distribution, annotated as χ2 distribution. It is continuous and used in several areas of statistics

In [5]:
import pandas as pd
from math import log, exp
from scipy.stats import chi2

patient_data = list(pd.read_csv('https://bit.ly/33ebs2R', delimiter=",").itertuples())

# Declare fitted logistic regression
b0 = -3.17576395
b1 = 0.69267212

def logistic_function(x):
  p = 1.0 / (1.0 + exp(-(b0 + b1 * x)))
  return p

# calculate the log likelihood of the fit
log_likelihood_fit = sum(log(logistic_function(p.x)) * p.y +
                         log(1.0 - logistic_function(p.x)) * (1.0 - p.y)
                         for p in patient_data)

# calculate the log likelihood without fit
likelihood = sum(p.y for p in patient_data) / len(patient_data)
log_likelihood = sum(log(likelihood) * p.y + log(1.0 - likelihood) * (1.0 - p.y) \
                     for p in patient_data)

# calculate p-value
chi2_input = 2 * (log_likelihood_fit - log_likelihood)
p_value = chi2.pdf(chi2_input, 1)
print(p_value)

0.0016604875618753787


## Train/Test Splits

3 Fold Cross Validation

In [6]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score

# Load the data
df = pd.read_csv("https://tinyurl.com/y6r7qjrp", delimiter=",")
X = df.values[:, :-1]
Y = df.values[:, -1]

# "random_state" is the random seed, which we fix to 7
kfold = KFold(n_splits=3, random_state=7, shuffle=True)
model = LogisticRegression(penalty='none')
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy Mean: %.3f (stdev=%.3f)" % (results.mean(), results.std()))

Accuracy Mean: 0.611 (stdev=0.000)


