## Introduction to GLMs


### Linear model, a special case of GLM

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols, glm

# Linear regression of Salary on Experience via the formula interface
model_lm = ols(formula='Salary ~ Experience', data=salary).fit()

# Show the estimated coefficients
print(model_lm.params)

In [None]:
from statsmodels.formula.api import ols, glm
import statsmodels.api as sm

# Same Salary ~ Experience regression, expressed as a GLM with a
# Gaussian response family
gaussian_family = sm.families.Gaussian()
model_glm = glm(
    formula='Salary ~ Experience',
    data=salary,
    family=gaussian_family,
).fit()

# Show the estimated coefficients
print(model_glm.params)

### Linear model and a binary response variable

In [None]:
# One formula shared by both models: y explained by width
formula = 'y ~ width'

# Response distributions: Gaussian for the linear model (LM),
# Binomial for the logistic model (GLM)
family_LM = sm.families.Gaussian()
family_GLM = sm.families.Binomial()

# Linear regression: fit, then report
model_LM = glm(formula=formula, data=crab, family=family_LM).fit()
print(model_LM.summary())

# Logistic regression: fit, then report
model_GLM = glm(formula=formula, data=crab, family=family_GLM).fit()
print(model_GLM.summary())

### Comparing predicted values

In [None]:
# pandas supplies DataFrame and concat used below; imported in this cell so
# it does not rely on the session having pandas preloaded.
import pandas as pd

# View test set
print(test)

# Predictions from the linear model for the test set: pred_lm
pred_lm = model_LM.predict(test)

# Predictions from the GLM model for the test set: pred_glm
pred_glm = model_GLM.predict(test)

# Collect both sets of predictions side by side: predictions
predictions = pd.DataFrame({'Pred_LM': pred_lm, 'Pred_GLM': pred_glm})

# Append the predictions as extra columns of the test sample (axis=1)
# and view the result
all_data = pd.concat([test, predictions], axis=1)
print(all_data)

### Model fitting step-by-step

In [None]:
# Define the formula for the logistic model
model_formula = 'switch ~ distance100'

# Binomial response with a logit link. The family takes a link *instance*:
# newer statsmodels releases deprecate, and then reject, a link class being
# passed here, and the lowercase `links.logit` alias is deprecated in favour
# of `links.Logit`.
link_function = sm.families.links.Logit()
model_family = sm.families.Binomial(link=link_function)

# Fit the model
wells_fit = glm(formula=model_formula,
                data=wells,
                family=model_family).fit()

### Results of the model fit using summary()

In [None]:
# Build the summary table of the wells_fit model, then display it
wells_fit_summary = wells_fit.summary()
print(wells_fit_summary)

### Extracting parameter estimates

In [None]:
# Unpack the two fitted coefficients of wells_fit, in the order they are
# stored in .params
coefficients = wells_fit.params
intercept, slope = coefficients

# Report each coefficient
print('Intercept =', intercept)
print('Slope =', slope)

# Confidence intervals for the coefficients
confidence_intervals = wells_fit.conf_int()
print(confidence_intervals)

## Modeling Binary Data

### Compute odds and probabilities

In [None]:
# Probability expressed as the ratio 15/60
numerator, denominator = 15, 60
probability = numerator / denominator

# Odds: the probability divided by its complement
complement = 1 - probability
odds_from_probs = probability / complement

# Show the odds rounded to three decimals
print(round(odds_from_probs, 3))

### Fit logistic regression

In [None]:
# Load libraries and functions
import statsmodels.api as sm
from statsmodels.formula.api import glm

# Logistic regression of switch on arsenic (Binomial response family)
binomial_family = sm.families.Binomial()
model_GLM = glm(
    formula='switch ~ arsenic',
    data=wells,
    family=binomial_family,
).fit()

# Show the model summary
print(model_GLM.summary())

### Coefficients in terms of odds

In [None]:
# Load libraries and functions
import statsmodels.api as sm
from statsmodels.formula.api import glm
import numpy as np

# Logistic regression of switch on distance100 (Binomial response family)
model_GLM = glm(
    formula='switch ~ distance100',
    data=wells,
    family=sm.families.Binomial(),
).fit()

# Fitted coefficients
coefficients = model_GLM.params
print('Model coefficients: \n', coefficients)

# Exponentiated coefficients: the multiplicative effect on the odds
print('Odds: \n', np.exp(coefficients))

### Rate of change in probability

In [None]:
# Define x at 1.5
x = 1.5

# Extract intercept & slope from the fitted model (unpacked in the order
# the parameters are stored).
# NOTE(review): wells_GLM is not assigned in any visible cell; presumably it
# is provided by the session. The preceding cell stores its
# 'switch ~ distance100' fit as model_GLM — confirm which fit is intended.
intercept, slope = wells_GLM.params

In [None]:
# Point at which the fitted curve is evaluated
x = 1.5

# Linear predictor at x, computed once and reused below
linear_predictor = intercept + slope * x

# Estimated probability: exp(eta) / (1 + exp(eta))
est_prob = np.exp(linear_predictor) / (1 + np.exp(linear_predictor))
print('Estimated probability at x = 1.5: ', round(est_prob, 4))

# Slope of the tangent line at x: slope * p * (1 - p)
slope_tan = slope * est_prob * (1 - est_prob)
print('The rate of change in probability: ', round(slope_tan, 4))

### Statistical significance

In [None]:
# Import libraries and the glm function
import statsmodels.api as sm
from statsmodels.formula.api import glm

# Logistic regression of y on width (Binomial response), kept as crab_GLM
binomial_family = sm.families.Binomial()
crab_GLM = glm(formula='y ~ width', data=crab, family=binomial_family).fit()

# Show the model summary
print(crab_GLM.summary())

### Computing Wald statistic

In [None]:
# Unpack the two fitted coefficients of crab_GLM
intercept, slope = crab_GLM.params

# Estimated covariance matrix of the coefficients: crab_cov
crab_cov = crab_GLM.cov_params()
print(crab_cov)

# Standard error (SE) of the width coefficient: square root of its
# diagonal entry in the covariance matrix
width_variance = crab_cov.loc['width', 'width']
std_error = np.sqrt(width_variance)
print('SE: ', round(std_error, 4))

# Wald statistic: coefficient estimate divided by its standard error
wald_stat = slope / std_error
print('Wald statistic: ', round(wald_stat, 4))

### Confidence intervals

In [None]:
# Confidence intervals for the crab_GLM coefficients
crab_conf_int = crab_GLM.conf_int()
print(crab_conf_int)

In [None]:
# Exponentiate the interval bounds to express them on the odds scale
odds_conf_int = np.exp(crab_GLM.conf_int())
print(odds_conf_int)

### Visualize model fit using regplot()

In [None]:
# Scatter switch against arsenic (with a small vertical jitter) and
# overlay the logistic fit, with no confidence band
sns.regplot(
    data=wells,
    x='arsenic',
    y='switch',
    logistic=True,
    ci=None,
    y_jitter=0.03,
)

# Render the figure
plt.show()

### Compute predictions

In [None]:
# Predictions of wells_fit for the test sample wells_test: prediction
prediction = wells_fit.predict(exog=wells_test)

# Store the predictions on wells_test as a new 'prediction' column
wells_test['prediction'] = prediction

# Look at the first rows of the outcome, arsenic and prediction columns
columns_to_show = ['switch', 'arsenic', 'prediction']
print(wells_test[columns_to_show].head())

### Compute confusion matrix

In [None]:
# Threshold applied to the predictions
cutoff = 0.5

# Class predictions: 1 where the prediction exceeds the cutoff, otherwise 0
exceeds_cutoff = prediction > cutoff
y_prediction = np.where(exceeds_cutoff, 1, 0)

In [None]:
# pandas supplies crosstab used below; imported in this cell so it does not
# rely on the session having pandas preloaded.
import pandas as pd

# Compute class predictions: y_prediction
# (1 where the prediction exceeds the cutoff, otherwise 0)
y_prediction = np.where(prediction > cutoff, 1, 0)

# Assign actual class labels from the test sample to y_actual
y_actual = wells_test['switch']

# Cross-tabulate actual against predicted classes to form the confusion
# matrix; margins=True appends the row and column totals
conf_mat = pd.crosstab(
    y_actual,
    y_prediction,
    rownames=['Actual'],
    colnames=['Predicted'],
    margins=True,
)

# Print the confusion matrix
print(conf_mat)