### Illiteracy Rates and Demographic Features ###

#### Data Setup and Exploration ####

In [2]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import numpy as np

# Load dataset
from faraway.datasets import statedata
data = statedata.load()

# Inspect data: first rows, column dtypes / non-null counts, summary statistics
print(data.head())
# DataFrame.info() writes its report to stdout itself and returns None,
# so it is called bare; wrapping it in print() would emit a stray "None" line.
data.info()
print(data.describe())

  State  Population  Income  Illiteracy  LifeExp  Murder  HSGrad  Frost  \
0    AL        3615    3624         2.1    69.05    15.1    41.3     20   
1    AK         365    6315         1.5    69.31    11.3    66.7    152   
2    AZ        2212    4530         1.8    70.55     7.8    58.1     15   
3    AR        2110    3378         1.9    70.66    10.1    39.9     65   
4    CA       21198    5114         1.1    71.71    10.3    62.6     20   

     Area  
0   50708  
1  566432  
2  113417  
3   51945  
4  156361  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   State       50 non-null     object 
 1   Population  50 non-null     int64  
 2   Income      50 non-null     int64  
 3   Illiteracy  50 non-null     float64
 4   LifeExp     50 non-null     float64
 5   Murder      50 non-null     float64
 6   HSGrad      50 non-null     float64
 7   F

#### Fit the Regression Model ####

The response variable is Illiteracy, and the predictors are Income and Population.

In [3]:
# Baseline multiple regression: Illiteracy modelled on Income and Population.
mlr_spec = smf.ols(formula="Illiteracy ~ Income + Population", data=data)
mlr_model = mlr_spec.fit()

# Show the full OLS summary table for the fitted model.
print(mlr_model.summary())

                            OLS Regression Results                            
Dep. Variable:             Illiteracy   R-squared:                       0.232
Model:                            OLS   Adj. R-squared:                  0.200
Method:                 Least Squares   F-statistic:                     7.110
Date:                Sun, 26 Oct 2025   Prob (F-statistic):            0.00201
Time:                        16:33:22   Log-Likelihood:                -39.081
No. Observations:                  50   AIC:                             84.16
Df Residuals:                      47   BIC:                             89.90
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.1631      0.569      5.556      0.0

#### Hypothesis Test for Slopes ####

In [4]:
# Report each predictor's estimated slope next to its p-value
mlr_slopes = mlr_model.params
mlr_pvalues = mlr_model.pvalues
print("Income slope:", mlr_slopes['Income'], "| p-value:", mlr_pvalues['Income'])
print("Population slope:", mlr_slopes['Population'], "| p-value:", mlr_pvalues['Population'])

Income slope: -0.00047645183275191844 | p-value: 0.0006075011660540417
Population slope: 2.834840455181858e-05 | p-value: 0.11876474405794965


#### Confidence Intervals ####

In [6]:
# Confidence intervals (alpha=0.05) for every coefficient of the baseline model
mlr_conf_int = mlr_model.conf_int(alpha=0.05)
print(mlr_conf_int)


                   0         1
Intercept   2.017716  4.308416
Income     -0.000737 -0.000216
Population -0.000008  0.000064


#### Quadratic Model ####

In [7]:
# Add the squared Income column that the quadratic formula refers to by name
data['Income2'] = data['Income'].pow(2)

# Fit the model with both the linear and the quadratic Income term
quad_spec = smf.ols(formula="Illiteracy ~ Income + Income2", data=data)
quad_model = quad_spec.fit()
print(quad_model.summary())

                            OLS Regression Results                            
Dep. Variable:             Illiteracy   R-squared:                       0.399
Model:                            OLS   Adj. R-squared:                  0.373
Method:                 Least Squares   F-statistic:                     15.58
Date:                Sun, 26 Oct 2025   Prob (F-statistic):           6.45e-06
Time:                        16:35:43   Log-Likelihood:                -32.974
No. Observations:                  50   AIC:                             71.95
Df Residuals:                      47   BIC:                             77.68
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.7604      2.452      5.204      0.0

In [8]:
# Estimated coefficient of the squared-income term and its p-value
income2_slope = quad_model.params['Income2']
income2_pvalue = quad_model.pvalues['Income2']
print("Income^2 slope:", income2_slope, "| p-value:", income2_pvalue)

Income^2 slope: 4.86386614877934e-07 | p-value: 0.00020380357998264753


In [9]:
# Confidence interval (alpha=0.05) for the squared-income coefficient only
quad_conf_int = quad_model.conf_int(alpha=0.05)
print(quad_conf_int.loc['Income2'])


0    2.434836e-07
1    7.292897e-07
Name: Income2, dtype: float64


#### Add Interaction Term ####

In [10]:
# "Income * Population" in the formula yields both main effects plus the
# Income:Population interaction term (see the summary / later lookups).
interaction_spec = smf.ols(formula="Illiteracy ~ Income * Population", data=data)
interaction_model = interaction_spec.fit()
print(interaction_model.summary())


                            OLS Regression Results                            
Dep. Variable:             Illiteracy   R-squared:                       0.323
Model:                            OLS   Adj. R-squared:                  0.278
Method:                 Least Squares   F-statistic:                     7.303
Date:                Sun, 26 Oct 2025   Prob (F-statistic):           0.000419
Time:                        16:37:02   Log-Likelihood:                -35.951
No. Observations:                  50   AIC:                             79.90
Df Residuals:                      46   BIC:                             87.55
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
Intercept             1.7918      0.77

In [11]:
# Look up the interaction coefficient and its interval by its term label;
# conf_int() is called with its default alpha here.
interaction_term = 'Income:Population'
interaction_conf_int = interaction_model.conf_int()
print("Interaction slope:", interaction_model.params[interaction_term])
print("Confidence Interval:", interaction_conf_int.loc[interaction_term])


Interaction slope: -1.0619842809124943e-07
Confidence Interval: 0   -1.925058e-07
1   -1.989103e-08
Name: Income:Population, dtype: float64


#### Create Indicator Variable for High/Low Income ####

In [12]:
# Split states at the median income: 1 = strictly above the median, 0 = otherwise
median_income = data['Income'].median()
is_above_median = data['Income'] > median_income
data['HighIncome'] = is_above_median.astype(int)

# Regress Illiteracy on the income indicator together with Population
indicator_spec = smf.ols(formula="Illiteracy ~ HighIncome + Population", data=data)
indicator_model = indicator_spec.fit()
print(indicator_model.summary())

                            OLS Regression Results                            
Dep. Variable:             Illiteracy   R-squared:                       0.111
Model:                            OLS   Adj. R-squared:                  0.073
Method:                 Least Squares   F-statistic:                     2.943
Date:                Sun, 26 Oct 2025   Prob (F-statistic):             0.0625
Time:                        16:38:12   Log-Likelihood:                -42.739
No. Observations:                  50   AIC:                             91.48
Df Residuals:                      47   BIC:                             97.21
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.2651      0.134      9.444      0.0

In [13]:
# Coefficient, p-value and interval (default alpha) for the HighIncome indicator
high_income_slope = indicator_model.params['HighIncome']
high_income_pvalue = indicator_model.pvalues['HighIncome']
high_income_ci = indicator_model.conf_int().loc['HighIncome']
print("HighIncome slope:", high_income_slope, "| p-value:", high_income_pvalue)
print("Confidence Interval:", high_income_ci)

HighIncome slope: -0.3887259273972358 | p-value: 0.026151809876558352
Confidence Interval: 0   -0.729254
1   -0.048198
Name: HighIncome, dtype: float64


#### Log Transformation for Population ####

In [14]:
# Natural log of Population, stored as a new column for use in the formula
population_log = np.log(data['Population'])
data['log_Population'] = population_log

# Fit the model with the log-transformed predictor in place of raw Population
log_spec = smf.ols(formula="Illiteracy ~ Income + log_Population", data=data)
log_model = log_spec.fit()
print(log_model.summary())

                            OLS Regression Results                            
Dep. Variable:             Illiteracy   R-squared:                       0.245
Model:                            OLS   Adj. R-squared:                  0.212
Method:                 Least Squares   F-statistic:                     7.610
Date:                Sun, 26 Oct 2025   Prob (F-statistic):            0.00137
Time:                        16:39:03   Log-Likelihood:                -38.676
No. Observations:                  50   AIC:                             83.35
Df Residuals:                      47   BIC:                             89.09
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          2.0630      0.797      2.

In [15]:
# Coefficient, p-value and interval (default alpha) for log_Population
log_pop_slope = log_model.params['log_Population']
log_pop_pvalue = log_model.pvalues['log_Population']
log_pop_ci = log_model.conf_int().loc['log_Population']
print("log_Population slope:", log_pop_slope, "| p-value:", log_pop_pvalue)
print("Confidence Interval:", log_pop_ci)

log_Population slope: 0.1355427381605727 | p-value: 0.0742392436241975
Confidence Interval: 0   -0.013805
1    0.284890
Name: log_Population, dtype: float64


#### Model Comparison ####

In [16]:
# Adjusted R-squared of every fitted model, keyed by a display label.
models = {
   "Multiple Regression": mlr_model.rsquared_adj,
   "Quadratic Model": quad_model.rsquared_adj,
   "Interaction Model": interaction_model.rsquared_adj,
   "Indicator Model": indicator_model.rsquared_adj,
   "Log Model": log_model.rsquared_adj
}

# Display models in descending order of Adjusted R-squared.
# pandas is already imported as `pd` in the notebook's first cell, so it is not
# re-imported here; the dict items are materialised into a list of
# (label, value) pairs before being handed to the DataFrame constructor.
comparison_df = pd.DataFrame(
    list(models.items()),
    columns=['Model', 'Adjusted R-squared'],
).sort_values(by='Adjusted R-squared', ascending=False)
print(comparison_df)

                 Model  Adjusted R-squared
1      Quadratic Model            0.373065
2    Interaction Model            0.278437
4            Log Model            0.212467
0  Multiple Regression            0.199609
3      Indicator Model            0.073486
