In [2]:
import numpy as np
import pandas as pd
from scipy.stats import norm, beta, gamma
from scipy.stats import multivariate_normal as mn
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
np.random.seed(1234)

In [6]:
# Load the Gapminder table (tab-separated) and keep the response handy.
rawdata = pd.read_csv('gapminder.tsv.txt', sep='\t')
life_exp = rawdata['lifeExp']

# One-hot encode the continent, dropping the first level as the baseline.
# dtype=int pins the indicators to 0/1 integers: pandas >= 2.0 returns bool
# columns by default, and mixing bool with the numeric columns would turn the
# design matrix into an object array when it is handed to numpy.
continent = pd.get_dummies(
    rawdata['continent'], drop_first=True, prefix="continent", dtype=int
)

# Modelling frame: numeric covariates, the country key, the response, and the
# continent indicators appended column-wise.
data = rawdata[['year', 'country', 'pop', 'gdpPercap', "lifeExp"]]
data = pd.concat([data, continent], axis=1)

data

Unnamed: 0,year,country,pop,gdpPercap,lifeExp,continent_Americas,continent_Asia,continent_Europe,continent_Oceania
0,1952,Afghanistan,8425333,779.445314,28.801,0,1,0,0
1,1957,Afghanistan,9240934,820.853030,30.332,0,1,0,0
2,1962,Afghanistan,10267083,853.100710,31.997,0,1,0,0
3,1967,Afghanistan,11537966,836.197138,34.020,0,1,0,0
4,1972,Afghanistan,13079460,739.981106,36.088,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1699,1987,Zimbabwe,9216418,706.157306,62.351,0,0,0,0
1700,1992,Zimbabwe,10704340,693.420786,60.377,0,0,0,0
1701,1997,Zimbabwe,11404948,792.449960,46.809,0,0,0,0
1702,2002,Zimbabwe,11926563,672.038623,39.989,0,0,0,0


In [7]:
# Fit one least-squares regression of lifeExp on the remaining columns per
# country, collecting the coefficient vectors and the responses in the order
# the countries first appear in the raw table.
countries = rawdata['country'].unique()
betas, life_exps = [], []

for country in countries:
    # Select this country's rows once and derive both pieces from them.
    country_rows = data.loc[data['country'] == country]
    y = np.array(country_rows['lifeExp'])
    dat = country_rows.drop(columns=['country', 'lifeExp'])

    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution vector is stored.
    b = np.linalg.lstsq(dat, y, rcond=None)

    life_exps.append(y)
    betas.append(b[0])

In [12]:
# create gibbs sampler
def gibbs(data, life_exp, beta, N=1000, burn=100, return_samples=False, verbose=True):
    """Draw perturbed copies of a coefficient vector.

    On every iteration the entries at positions 1 and 2 of ``beta`` (named
    ``pop`` and ``gdp`` in the code) are redrawn from normal distributions
    centred on their current values in ``beta``; every other entry is carried
    over unchanged. Draws made at iterations ``i >= burn`` are kept.

    NOTE(review): none of the draws is conditioned on another, so this is not
    yet a Gibbs sampler in the usual sense -- confirm the intended model.

    Parameters
    ----------
    data : mapping of column name to values (e.g. a DataFrame)
        Must provide ``'gdpPercap'``; ``'pop'`` is read only when ``verbose``.
    life_exp : array-like
        Response values; only their variance is used.
    beta : 1-D array-like of float
        Coefficient vector with at least three entries.
    N : int
        Total number of iterations.
    burn : int
        Number of leading iterations to discard; must satisfy ``0 <= burn < N``.
    return_samples : bool
        If True, return every kept draw as a ``(N - burn, len(beta))`` float
        array. If False (default), return only the last draw as a list, which
        is what this function has always returned.
    verbose : bool
        If True (default, preserving the existing output), print the variance
        of ``data['pop']`` divided by 100 and the ``pop`` column itself.

    Raises
    ------
    ValueError
        If ``burn`` is negative or ``burn >= N``. Previously ``burn >= N``
        surfaced as an UnboundLocalError (or a numpy shape error) because no
        draw was ever kept.
    """
    if not 0 <= burn < N:
        raise ValueError(
            "need 0 <= burn < N so that at least one draw is kept "
            "(got N={}, burn={})".format(N, burn)
        )

    coef = np.asarray(beta, dtype=float)

    sl = np.var(life_exp)
    sg = np.var(data['gdpPercap'])
    if verbose:
        sp = np.var(data['pop'])
        print(sp/100)
        print(data['pop'])

    # NOTE(review): scipy's ``scale`` is a standard deviation, while ``sl`` and
    # ``sg`` are variances -- confirm whether a square root is intended.
    walk_scale = sl / 20

    # k and z follow random walks but do not feed into the returned draws.
    # They are kept so the sequence of random numbers consumed is unchanged.
    # NOTE(review): presumably placeholders for parameters not yet wired in.
    k = norm.rvs(loc=1, scale=walk_scale)
    z = norm.rvs(loc=1, scale=walk_scale)

    beta_post = np.empty((N - burn, coef.size))
    for i in range(N):
        # The original call passed no scale for pop, so the draw uses unit
        # scale; made explicit here.
        # NOTE(review): confirm the intended spread for this coefficient.
        pop = norm.rvs(loc=coef[1], scale=1.0)
        gdp = norm.rvs(loc=coef[2], scale=sg)

        k = norm.rvs(loc=k, scale=walk_scale)
        z = norm.rvs(loc=z, scale=walk_scale)

        if i >= burn:
            # Copy the full vector and overwrite only the two redrawn entries,
            # so any coefficient-vector length is supported.
            b = list(coef)
            b[1], b[2] = pop, gdp
            beta_post[i - burn] = b

    return beta_post if return_samples else b

In [13]:
# Try the sampler on a single country. betas[0] is the fit for the first
# entry of `countries`, which is Afghanistan in the table displayed earlier.
afghanistan_rows = data['country'] == 'Afghanistan'
d = data[afghanistan_rows]
gibbs(d, rawdata.loc[rawdata['country'] == 'Afghanistan', 'lifeExp'], betas[0])

463991898452.9557
0      8425333
1      9240934
2     10267083
3     11537966
4     13079460
5     14880372
6     12881816
7     13867957
8     16317921
9     22227415
10    25268405
11    31889923
Name: pop, dtype: int64


[0.3882406815153031,
 -1.5214479104251037,
 -3505.6694684218373,
 0.0,
 -728.2995085449766,
 0.0,
 0.0]

In [8]:
# Least-squares coefficients fitted for the first country in `countries`.
# Entries follow the design-matrix column order used in the fitting loop:
# year, pop, gdpPercap, then the continent indicator columns.
betas[0]

array([ 3.88240682e-01, -3.12984605e-07,  2.75141650e-03,  0.00000000e+00,
       -7.28299509e+02,  0.00000000e+00,  0.00000000e+00])