In [1]:
import numpy as np
import pandas as pd
from scipy.stats import norm, beta, gamma
from scipy.stats import multivariate_normal as mn
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
np.random.seed(1234)

In [2]:
rawdata = pd.read_csv('gapminder.tsv.txt', sep='\t')
life_exp = rawdata['lifeExp']
continent = pd.get_dummies(rawdata['continent'],drop_first=True,prefix="continent")
continent+=1
data = rawdata[['year', 'country', 'pop', 'gdpPercap']]
data = pd.concat([data, continent], axis=1)

data

Unnamed: 0,year,country,pop,gdpPercap,continent_Americas,continent_Asia,continent_Europe,continent_Oceania
0,1952,Afghanistan,8425333,779.445314,1,2,1,1
1,1957,Afghanistan,9240934,820.853030,1,2,1,1
2,1962,Afghanistan,10267083,853.100710,1,2,1,1
3,1967,Afghanistan,11537966,836.197138,1,2,1,1
4,1972,Afghanistan,13079460,739.981106,1,2,1,1
...,...,...,...,...,...,...,...,...
1699,1987,Zimbabwe,9216418,706.157306,1,1,1,1
1700,1992,Zimbabwe,10704340,693.420786,1,1,1,1
1701,1997,Zimbabwe,11404948,792.449960,1,1,1,1
1702,2002,Zimbabwe,11926563,672.038623,1,1,1,1


In [3]:
countries = rawdata['country'].unique()
betas = []
life_exps = []

for country in countries:
    dat = data[data['country']==country]
    dat=dat.drop(['country'], axis=1)
    dat = dat.reset_index()
    dat = dat.drop(['index'], axis=1)
    y = np.array(rawdata[rawdata['country']==country]['lifeExp'])
    life_exps.append(y)
    
    b = np.linalg.lstsq(dat, y, rcond=None)
    betas.append(b[0])

In [27]:
# create gibbs sampler
def gibbs(data, life_exp, beta, N=1000, burn=100):
    beta_post = np.zeros(N-burn, dtype=object)
    sl = np.var(life_exp)
    sp = np.var(data['pop'])
    sg = np.var(data['gdpPercap'])
    print(sp/100)
    print(data['pop'])
    
    # get all initial vals
    k = norm(1, sl/20).rvs()
    z = norm(1, sl/20).rvs()
    
    for i in range(N):
        pop = norm(beta[1], ).rvs()
        gdp = norm(beta[2], sg).rvs()
        
        k = norm(k, sl/20).rvs()
        z = norm(z, sl/20).rvs()        
        
        if i>=burn:
            b = []
            b = [beta[0], pop, gdp, beta[3], beta[4], beta[5], beta[6]]
            beta_post[i-burn]=np.array(b)
            
    return(b)

In [28]:
d = data[data['country']=='Afghanistan']
gibbs(d, rawdata[rawdata['country']=='Afghanistan']['lifeExp'], betas[0])

463991898452.9557
0      8425333
1      9240934
2     10267083
3     11537966
4     13079460
5     14880372
6     12881816
7     13867957
8     16317921
9     22227415
10    25268405
11    31889923
Name: pop, dtype: int64


[0.3882406815153056,
 167356622101.38977,
 7707.723146079254,
 -104.04278693499741,
 -208.08557386999487,
 -104.04278693499744,
 -104.04278693499744]

In [6]:
betas[0]

array([ 3.88240682e-01, -3.12984605e-07,  2.75141650e-03, -1.04042787e+02,
       -2.08085574e+02, -1.04042787e+02, -1.04042787e+02])