# 09_03: Evaluating model fit

In [None]:
import math
import collections
import dataclasses
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp

In [None]:
import statsmodels
import statsmodels.formula.api as smf

In [None]:
# Read the whole table from gapminder.csv with pyarrow-backed column dtypes.
gm = pd.read_csv('gapminder.csv', dtype_backend='pyarrow')

# Keep only the rows whose year is 1985, as an independent copy of gm,
# then store region as a pandas categorical column.
gdata = gm.loc[gm['year'] == 1985].copy()
gdata = gdata.astype({'region': 'category'})

In [None]:
# Marker colour for every row of gdata, looked up from the row's region name.
continent = gdata['region'].map({
    'Africa': 'skyblue',
    'Europe': 'gold',
    'America': 'palegreen',
    'Asia': 'coral',
    'Oceania': 'teal',
})
# Marker size for every row: the population column scaled by 1e-6.
population = gdata['population'] * 1e-6

def plotbabies():
    """Draw the gdata scatter plot of babies_per_woman against age5_surviving.

    Each point is coloured by the module-level ``continent`` series and
    sized by the module-level ``population`` series. Nothing is returned.
    """
    gdata.plot.scatter(
        x='age5_surviving',
        y='babies_per_woman',
        c=continent,
        s=population,
        linewidths=0.5,
        edgecolor='black',
        alpha=0.6,
        figsize=(5, 3.5),
    )

# define a function to plot a model's predictions (the name is historical)
def plotresidual(fit):
    """Scatter a fitted model's predicted values against age5_surviving.

    NOTE: despite its name, this function does not plot residuals. It plots
    ``fit.predict(gdata)``, i.e. the values the model predicts for every row
    of ``gdata``. In this file it is called right after ``plotbabies()``.

    Parameters
    ----------
    fit
        Any object with a ``predict`` method that accepts ``gdata`` and
        returns one value per row (here: the statsmodels fits defined
        elsewhere in this file).
    """
    # Points are coloured with the module-level per-row colours (continent).
    pp.scatter(gdata.age5_surviving, fit.predict(gdata),
               color=continent, s=50, marker='.', ec='k', lw=0.5)

In [None]:
# Three ordinary-least-squares models of babies_per_woman, fit on all of gdata.
# Every formula contains "-1", which removes the global intercept term.

# Model 1: region only.
groupfit = smf.ols(
    'babies_per_woman ~ -1 + region',
    data=gdata,
).fit()

# Model 2: region plus a single age5_surviving term.
survivingfit = smf.ols(
    'babies_per_woman ~ -1 + region + age5_surviving',
    data=gdata,
).fit()

# Model 3: region, an age5_surviving-by-region interaction, and population.
twovariablefit = smf.ols(
    'babies_per_woman ~ -1 + region + age5_surviving:region + population',
    data=gdata,
).fit()

In [None]:
# Draw the gdata scatter plot, then scatter the region-only model's
# predicted values (plotresidual calls fit.predict).
plotbabies()
plotresidual(groupfit)

In [None]:
# Draw the gdata scatter plot, then scatter the predicted values of the
# region + age5_surviving model (plotresidual calls fit.predict).
plotbabies()
plotresidual(survivingfit)

In [None]:
# Draw the gdata scatter plot, then scatter the predicted values of the
# model that also includes the interaction and population terms.
plotbabies()
plotresidual(twovariablefit)

In [None]:
# Residuals computed by hand: observed babies_per_woman minus the model's
# prediction for the same rows.
gdata.babies_per_woman - survivingfit.predict(gdata)

In [None]:
# The residuals as stored on the fit result itself, for comparison with the
# hand-computed values in the previous cell.
survivingfit.resid

In [None]:
# Residual distributions of the three fits as 20-bin, density-normalised step
# histograms; the labels feed the legend drawn by the last call.
pp.hist(groupfit.resid, histtype='step', density=True, bins=20, label='constant')
pp.hist(survivingfit.resid, histtype='step', density=True, bins=20, label='surviving')
pp.hist(twovariablefit.resid, histtype='step', density=True, bins=20, label='twovariable')
pp.legend();

In [None]:
# mse_resid of each fit, in the order group / surviving / twovariable.
# NOTE(review): per the statsmodels docs this is the residual sum of squares
# divided by the residual degrees of freedom - confirm against the docs.
[fit.mse_resid for fit in [groupfit, survivingfit, twovariablefit]]

In [None]:
# rsquared (R-squared) of each fit, in the order group / surviving / twovariable.
[fit.rsquared for fit in [groupfit, survivingfit, twovariablefit]]

In [None]:
# fvalue (the F-statistic reported by statsmodels) of each fit, in the order
# group / surviving / twovariable.
[fit.fvalue for fit in [groupfit, survivingfit, twovariablefit]]

In [None]:
# Display the statsmodels summary report for the largest of the three models.
twovariablefit.summary()

In [None]:
# Randomly sample as many rows as gdata contains. No random_state is passed,
# so the order differs from run to run.
# NOTE(review): relies on pandas' default of sampling without replacement, which
# makes this a permutation of gdata - confirm against DataFrame.sample docs.
shuffled = gdata.sample(len(gdata))

In [None]:
# Number of training rows: 90% of gdata, truncated to an integer.
ntrain = int(0.9 * len(gdata))

# The first ntrain shuffled rows are used for training ...
training = shuffled.iloc[:ntrain]
# ... and all remaining shuffled rows are held out for testing.
testing = shuffled.iloc[ntrain:]

In [None]:
# Row counts of the training and testing subsets.
len(training), len(testing)

In [None]:
# The same three model formulas as the full-data fits, now fit on the
# training rows only.

# Model 1: region only.
grouptrain = smf.ols(
    'babies_per_woman ~ -1 + region',
    data=training,
).fit()

# Model 2: region plus a single age5_surviving term.
survivingtrain = smf.ols(
    'babies_per_woman ~ -1 + region + age5_surviving',
    data=training,
).fit()

# Model 3: region, an age5_surviving-by-region interaction, and population.
twovariabletrain = smf.ols(
    'babies_per_woman ~ -1 + region + age5_surviving:region + population',
    data=training,
).fit()

In [None]:
# Coefficients of the region-only model fit on the training rows.
grouptrain.params

In [None]:
# Coefficients of the region-only model fit on all of gdata, for comparison
# with the training-only coefficients.
groupfit.params

In [None]:
def mse_test(fit, testing):
    """Return a degrees-of-freedom-adjusted mean squared error on ``testing``.

    The squared differences between ``fit.predict(testing)`` and the
    ``babies_per_woman`` column of ``testing`` are summed and divided by
    ``len(testing) - len(fit.params)``.

    Parameters
    ----------
    fit
        Object exposing ``predict(data)`` and a sized ``params`` attribute.
    testing
        Table with a ``babies_per_woman`` column, accepted by ``fit.predict``.
    """
    errors = fit.predict(testing) - testing.babies_per_woman
    # Denominator is the row count reduced by the number of model parameters.
    dof = len(testing) - len(fit.params)
    return np.sum(errors**2) / dof

In [None]:
# Score each training-only fit on the held-out testing rows with mse_test,
# in the order group / surviving / twovariable.
[mse_test(fit, testing) for fit in [grouptrain, survivingtrain, twovariabletrain]] # testing MSE

In [None]:
# Repeated random hold-out validation: for each of 100 random splits, refit
# every model on that split's training rows and score it on the held-out rows.
# The refit must happen inside the loop. Scoring fits from an earlier split
# against a freshly shuffled test set would evaluate them largely on rows they
# were trained on, so the averaged "test" MSE would not be out-of-sample.
cv_formulas = [
    'babies_per_woman ~ -1 + region',
    'babies_per_woman ~ -1 + region + age5_surviving',
    'babies_per_woman ~ -1 + region + age5_surviving:region + population',
]

mses = []
for i in range(100):
    shuffled = gdata.sample(len(gdata))
    training, testing = shuffled.iloc[:ntrain], shuffled.iloc[ntrain:]
    # One row of test MSEs per split, in the same order as cv_formulas.
    mses.append([
        mse_test(smf.ols(formula=formula, data=training).fit(), testing)
        for formula in cv_formulas
    ])

# Average test MSE of each model over all splits.
print(np.array(mses).mean(axis=0))