# Dataframe for Meta-Analysis

In [67]:
import pandas as pd
import datetime
import numpy as np
import patsy 

import warnings
warnings.filterwarnings('ignore')

In [89]:
# Load arron-temp data
arron_temp = pd.read_csv("C:/Users/azdra/Documents/School/KU Leuven/Modern Data Analytics/mda_project/code/out/arron_temp_data.csv", sep = ",")

# Load mortality data
mort = pd.read_csv("C:/Users/azdra/Documents/School/KU Leuven/Modern Data Analytics/mda_project/code/out/mortality.csv", sep = ",")

In [90]:
mort

Unnamed: 0,YEAR,YEAR_WEEK,COD,ARRON,N_DEATHS,WEEK
0,2000,2000-001,natural,11000,31.0,1
1,2000,2000-001,natural,12000,10.0,1
2,2000,2000-001,natural,13000,14.0,1
3,2000,2000-001,natural,21000,43.0,1
4,2000,2000-001,natural,23000,14.0,1
...,...,...,...,...,...,...
45550,2019,2019-053,natural,84000,6.0,53
45551,2019,2019-053,natural,85000,5.0,53
45552,2019,2019-053,natural,91000,10.0,53
45553,2019,2019-053,natural,92000,19.0,53


In [91]:
arron_temp = arron_temp.drop(['Unnamed: 0', 'GRID_NO'], axis = 1)

In [92]:
arron_temp

Unnamed: 0,DAY,Nom_arrondissement,NIS_Code,TEMPERATURE_MAX,TEMPERATURE_MIN,TEMPERATURE_AVG,YEAR
0,20000101,Aalst,41000,8.4,4.7,6.6,2000.0
1,20000102,Aalst,41000,8.8,5.7,7.3,2000.0
2,20000103,Aalst,41000,9.5,7.1,8.3,2000.0
3,20000104,Aalst,41000,10.2,7.1,8.7,2000.0
4,20000105,Aalst,41000,7.2,2.1,4.7,2000.0
...,...,...,...,...,...,...,...
321415,20191227,Ypres,33000,8.0,6.9,7.4,2019.0
321416,20191228,Ypres,33000,7.5,5.7,6.6,2019.0
321417,20191229,Ypres,33000,4.8,0.5,2.7,2019.0
321418,20191230,Ypres,33000,7.7,1.2,4.5,2019.0


In [98]:
regions = arron_temp['Nom_arrondissement'].unique()
datalist = {}

for region in regions:
    datalist[region] = arron_temp[arron_temp['Nom_arrondissement'] == region]
    datalist[region] = datalist[region].rename(columns={"DAY": "DATE"})
    datalist[region]['tmean05'] = datalist[region]['TEMPERATURE_AVG'].rolling(window=6).mean()
    datalist[region]['MONTH'] = [int((x//100)%100) for x in datalist[region]['DATE']]
    datalist[region]['DAY'] = [int(x%100) for x in datalist[region]['DATE']]
    datalist[region]['WEEK'] = [int(datetime.date(x//10000, (x//100)%100, x%100).strftime("%V")) for x in datalist[region]['DATE']]
    datalist[region]['YDAY'] = [int(datetime.date(x//10000, (x//100)%100, x%100).strftime('%j')) for x in datalist[region]['DATE']]
    datalist[region]['DOW'] = [int(datetime.date(x//10000, (x//100)%100, x%100).weekday()) for x in datalist[region]['DATE']]
    datalist[region]['TIME'] = list(range(1, len(datalist[region]['DATE']) + 1))
    datalist[region] = datalist[region][['DATE', 'YEAR', 'MONTH', 'DAY', 'WEEK', 'YDAY', 'DOW', 'TIME', 'Nom_arrondissement', 'NIS_Code', 'TEMPERATURE_MIN', 'TEMPERATURE_MAX', 'TEMPERATURE_AVG', 'tmean05']]
    datalist[region]['N_DEATHS'] = -1
    datalist[region].reset_index(drop = True)
    
    n_deaths=[]
    for i in range(0, len(datalist[region])):
        deaths = mort[(mort['YEAR']==datalist[region].iloc[i]['YEAR']) & 
                      (mort['WEEK']==datalist[region].iloc[i]['WEEK']) & 
                      (mort['ARRON']==datalist[region].iloc[i]['NIS_Code'])]['N_DEATHS']
        if len(deaths) ==0:
            n_deaths.append(np.NAN)
        else:
            n_deaths.append(deaths.iloc[0])
    datalist[region]['N_DEATHS'] = [round(d/7, 2) for d in n_deaths]

In [99]:
datalist['Dinant']

Unnamed: 0,DATE,YEAR,MONTH,DAY,WEEK,YDAY,DOW,TIME,Nom_arrondissement,NIS_Code,TEMPERATURE_MIN,TEMPERATURE_MAX,TEMPERATURE_AVG,tmean05,N_DEATHS
87660,20000101,2000.0,1,1,52,1,5,1,Dinant,91000,3.1,6.6,4.8,,1.29
87661,20000102,2000.0,1,2,52,2,6,2,Dinant,91000,3.5,6.9,5.2,,1.29
87662,20000103,2000.0,1,3,1,3,0,3,Dinant,91000,5.4,5.6,5.5,,1.00
87663,20000104,2000.0,1,4,1,4,1,4,Dinant,91000,4.8,7.9,6.3,,1.00
87664,20000105,2000.0,1,5,1,5,2,5,Dinant,91000,1.9,6.5,4.2,,1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94960,20191227,2019.0,12,27,52,361,4,7301,Dinant,91000,3.8,6.5,5.2,5.883333,2.86
94961,20191228,2019.0,12,28,52,362,5,7302,Dinant,91000,3.1,4.5,3.8,5.350000,2.86
94962,20191229,2019.0,12,29,52,363,6,7303,Dinant,91000,-1.0,7.5,3.2,4.716667,2.86
94963,20191230,2019.0,12,30,1,364,0,7304,Dinant,91000,2.2,9.2,5.7,4.450000,3.14


In [100]:
# saving meta-analysis dataset to /out
df = pd.DataFrame(columns = (datalist['Dinant']).columns)
for region in regions:
    df = pd.concat([df, datalist[region]])

df.to_csv('out/meta_analysis_data.csv', encoding = 'utf-8-sig') 

In [101]:
df

Unnamed: 0,DATE,YEAR,MONTH,DAY,WEEK,YDAY,DOW,TIME,Nom_arrondissement,NIS_Code,TEMPERATURE_MIN,TEMPERATURE_MAX,TEMPERATURE_AVG,tmean05,N_DEATHS
0,20000101,2000.0,1,1,52,1,5,1,Aalst,41000,4.7,8.4,6.6,,7.57
1,20000102,2000.0,1,2,52,2,6,2,Aalst,41000,5.7,8.8,7.3,,7.57
2,20000103,2000.0,1,3,1,3,0,3,Aalst,41000,7.1,9.5,8.3,,0.71
3,20000104,2000.0,1,4,1,4,1,4,Aalst,41000,7.1,10.2,8.7,,0.71
4,20000105,2000.0,1,5,1,5,2,5,Aalst,41000,2.1,7.2,4.7,,0.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321415,20191227,2019.0,12,27,52,361,4,7301,Ypres,33000,6.9,8.0,7.4,7.416667,2.00
321416,20191228,2019.0,12,28,52,362,5,7302,Ypres,33000,5.7,7.5,6.6,7.133333,2.00
321417,20191229,2019.0,12,29,52,363,6,7303,Ypres,33000,0.5,4.8,2.7,6.266667,2.00
321418,20191230,2019.0,12,30,1,364,0,7304,Ypres,33000,1.2,7.7,4.5,5.516667,2.43


# Model

In [None]:
m = len(datalist)
m

44

In [None]:
# TEMPERATURE RANGES (FOR LAG 0-5)
min_tmean05 = pd.DataFrame([{k:(v['tmean05']).min() for k,v in datalist.items()}], index =["MIN"])
max_tmean05 = pd.DataFrame([{k:(v['tmean05']).max() for k,v in datalist.items()}], index = ["MAX"])
ranges = pd.concat([min_tmean05, max_tmean05], axis = 0).transpose()
ranges

Unnamed: 0,MIN,MAX
Aalst,-6.05,25.483333
Antwerpen,-8.066667,25.366667
Arlon,-9.3,29.366667
Ath,-6.8,25.566667
Audenarde,-5.833333,25.633333
Bastogne,-9.9,27.116667
Brugge,-5.016667,23.55
Bruxelles-Capitale,-7.9,25.35
Charleroi,-9.033333,25.933333
Courtrai,-5.833333,25.633333


In [None]:
# DEFINE THE AVERAGE RANGE, CENTERING POINT, DEGREE AND TYPE OF THE SPLINE
# (THESE PARAMETERS CAN BE CHANGED BY THE USER FOR ADDITIONAL ANALYSES)
cen = 17
bounds = ranges.apply(np.mean)
degree = 2
type = "bs"
df = 6

# DEFINE THE KNOTS AT TEMPERATURE CORRESPONDING TO AVERAGE PERCENTILES
knotperc = [5, 35, 65, 95]
knots = pd.DataFrame(columns = ["5%", "35%", "65%", "95%"], index = regions)
for region in regions:
    perc = np.nanpercentile(datalist[region]["tmean05"], knotperc)
    knots.loc[region] = [perc[0], perc[1], perc[2], perc[3]]
knots
  

Unnamed: 0,5%,35%,65%,95%
Aalst,1.6325,8.41667,14.5167,19.9667
Antwerpen,1.5,8.35,14.4833,20.0167
Arlon,-0.816667,6.88333,13.8667,20.6175
Ath,1.41667,8.21667,14.3833,20.0342
Audenarde,1.78333,8.56667,14.6167,20.0167
Bastogne,-0.966667,6.455,13.2667,19.75
Brugge,2.35,8.38333,14.15,18.9333
Bruxelles-Capitale,1.31583,8.31667,14.5058,20.25
Charleroi,0.65,7.9,14.1667,19.9833
Courtrai,1.78333,8.56667,14.6167,20.0167


In [None]:
btmean05 = patsy.bs(x=datalist['Dinant']['tmean05'], knots=knots.loc['Dinant'], degree=degree, lower_bound=bounds[0], upper_bound=bounds[1])
btmean05
# import statsmodels.formula.api as smf
# import statsmodels.api as sm

# formula = 'death ~ btmean05 + dow + cubspl'
# model = smf.glm(formula = formula, data=rdata, family=sm.families.Poisson())
# result = model.fit()
# print(result.summary())

Unnamed: 0,0,1,2,3,4,5
87660,,,,,,
87661,,,,,,
87662,,,,,,
87663,,,,,,
87664,,,,,,
...,...,...,...,...,...,...
94960,0.021580,0.647700,0.330719,0.0,0.0,0.0
94961,0.039426,0.689178,0.271396,0.0,0.0,0.0
94962,0.067551,0.723890,0.208559,0.0,0.0,0.0
94963,0.081644,0.733783,0.184573,0.0,0.0,0.0


# Trying Things Out


In [None]:
# #!git clone https://github.com/rpy2/rpy2.git
# #%cd C:\Users\azdra\Documents\School\KU Leuven\Modern Data Analytics\mda_project\code\rpy2
# #!python setup.py install


# import pandas as pd
# import rpy2.robjects as ro
# from rpy2.robjects.packages import importr
# from rpy2.robjects import pandas2ri

# from rpy2.robjects.conversion import localconverter

# r_df = ro.DataFrame({'int_values': ro.IntVector([1,2,3]),
#                      'str_values': ro.StrVector(['abc', 'def', 'ghi'])})

# r_df
# with localconverter(ro.default_converter + pandas2ri.converter):
#   pd_from_r_df = ro.conversion.rpy2py(r_df)

# pd_from_r_df

# df=datalist['Dinant']
# rdf = pandas2ri.py2ri_pandasdataframe(df)
# qpoisson = ro.r.glm('Claim_Count ~ Age + Vehicle_Use', data = rdf, family = ro.r('quasipoisson(link = &quot;log&quot;)'))
# #print ro.r.summary(qpoisson)

In [None]:
# for region in regions:
#     # print iteration
#     print(region)

#     # load
#     data = datalist[region]

#     # Create the spline. Nodes and boundaries are fixed at same values
    



In [None]:
# rdata = pd.read_csv("C:/Users/azdra/Documents/School/KU Leuven/Modern Data Analytics/mda_project/r_data.csv", sep = ",")
# rdata

# btmean05 = patsy.bs(x=rdata['tmean05'], knots=[2.850704,  7.800237, 12.898169, 18.153446], degree=degree, lower_bound=-2.523582, upper_bound=23.017419)

# import statsmodels.formula.api as smf
# import statsmodels.api as sm

# cubspl = patsy.cr(rdata['time'], 7*14)
# formula = 'death ~ btmean05 + dow + cubspl'
# model = smf.glm(formula = formula, data=rdata, family=sm.families.Poisson())
# result = model.fit()
# print(result.summary())