In [1]:
import pandas as pd
import os
from epiweeks import Week, Year

In [2]:
PATH = 'data/'

cases_path = os.path.join(PATH, 'cases/cases.csv')
temperature_path = os.path.join(PATH, 'Temperature/temperature.csv')
precipitation_path = os.path.join(PATH, 'Precipitation/precipitation.csv')
demographics_path = os.path.join(PATH, 'Demographic/demographic.csv')
embeddings = os.path.join(PATH, 'Embeddings/variational_autoencoder_ResNet50V2__224_1024_3Bands_rio.csv')

In [3]:
def get_epiweek(date):
    week = Week.fromdate(date)
    return int(str(week))

### Read data

In [4]:
cases = pd.read_csv(cases_path)
cases.rename(columns={'epiweek': 'Epiweek'}, inplace=True)
# Merge columns 'Epiweek' wich only contains the week number and column year to be YearEpiweek; If epiweek has only 1 digit, add a 0 in front
cases['Epiweek'] = cases['Epiweek'].apply(lambda x: str(x).zfill(2))
cases['Epiweek'] = cases['year'].astype(str) + cases['Epiweek']
cases['Epiweek'] = cases['Epiweek'].astype(int)
cases

Unnamed: 0,cases,Epiweek,year
0,43,201501,2015
1,30,201502,2015
2,42,201503,2015
3,33,201504,2015
4,59,201505,2015
...,...,...,...
466,636,202349,2023
467,816,202350,2023
468,721,202351,2023
469,562,202352,2023


In [5]:
precipitation = pd.read_csv(precipitation_path)
precipitation

Unnamed: 0,Epiweek,precipitation
0,201453,7.326
1,201501,46.448
2,201502,0.013
3,201503,43.581
4,201504,99.522
...,...,...
466,202349,33.037
467,202350,0.323
468,202351,13.727
469,202352,6.937


In [6]:
temperature = pd.read_csv(temperature_path)
temperature

Unnamed: 0,Epiweek,temperature
0,201453,39.330667
1,201501,37.142143
2,201502,36.971643
3,201503,37.902000
4,201504,33.811714
...,...,...
405,202241,27.580500
406,202243,29.708286
407,202244,27.855143
408,202245,28.147571


In [7]:
demographics = pd.read_csv(demographics_path)
demographics.rename(columns={'epiweek': 'Epiweek'}, inplace=True)
demographics

Unnamed: 0,Epiweek,population,variation
0,201451,1.282500e+07,0.900000
1,201500,1.282723e+07,0.900000
2,201501,1.282946e+07,0.900000
3,201502,1.283169e+07,0.900000
4,201503,1.283392e+07,0.900000
...,...,...,...
466,202348,1.372091e+07,0.687736
467,202349,1.372268e+07,0.688302
468,202350,1.372445e+07,0.688868
469,202351,1.372623e+07,0.689434


In [8]:
embeddings = pd.read_csv(embeddings)
embeddings['Date'] = pd.to_datetime(embeddings['Date'])
embeddings['Epiweek'] = embeddings['Date'].apply(get_epiweek)
embeddings.sort_values(by='Epiweek', inplace=True)
embeddings

Unnamed: 0,Municipality Code,Date,0,1,2,3,4,5,6,7,...,1015,1016,1017,1018,1019,1020,1021,1022,1023,Epiweek
319,10000,2016-01-03,0.241374,-1.561932,0.309142,0.761763,1.078860,1.038276,-2.130718,0.208693,...,-1.269957,0.238937,-1.492360,-1.713798,1.239230,0.359771,0.134949,-0.437693,1.264120,201601
227,10000,2016-01-10,-0.135461,-1.707807,-0.354339,0.824970,-1.381245,-0.742811,-0.288823,0.445895,...,-2.025665,0.681672,0.321108,0.604627,0.770941,-1.010611,0.023148,0.860643,-0.245002,201602
34,10000,2016-01-17,-2.683568,1.224843,0.817149,-0.550825,-0.816751,1.652855,-0.524891,0.929462,...,0.016895,-1.271270,-1.117690,1.665178,0.443690,0.675015,-1.634904,-0.782346,-2.879396,201603
190,10000,2016-01-24,-0.392921,0.414659,0.140102,-1.350422,-0.746271,-0.540469,0.311457,1.756462,...,-0.503318,-0.482805,-0.436582,-0.155589,-0.857230,-0.140753,1.145457,-0.880416,-0.794366,201604
177,10000,2016-01-31,1.538756,-0.343051,0.173756,0.301466,-1.253962,0.681083,0.404304,1.364242,...,-0.954992,1.775423,-0.886031,1.274171,-0.568689,-0.134025,1.979926,-1.624665,-1.132198,201605
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,10000,2023-11-26,1.244006,-0.590205,1.047149,0.202036,-2.700557,-0.005340,1.109713,-1.685690,...,0.691372,-0.209624,0.841494,-0.007466,1.263855,-1.558685,-0.126284,0.180389,0.005462,202348
389,10000,2023-12-03,0.386444,0.995367,-0.079516,0.315029,3.171045,0.049635,0.460055,1.517618,...,0.982034,-0.763424,0.996093,0.495894,-1.531264,1.056130,-1.314655,-0.968883,2.481470,202349
147,10000,2023-12-10,-0.663760,-0.805401,0.189657,1.455089,0.259192,-1.646903,-0.912002,-0.322412,...,0.358316,2.705809,-1.144578,-0.676270,-0.701773,-0.105960,-1.564262,0.753925,-2.086452,202350
360,10000,2023-12-17,-1.707962,-0.988960,-0.488344,-0.919691,-1.072049,1.122650,0.170106,1.359195,...,1.216418,-0.947843,0.017484,0.651593,-0.671834,0.629730,-0.115686,0.213907,0.221282,202351


### Merge data

In [9]:
metadata = pd.merge(precipitation, demographics, on=['Epiweek'], how='left')
metadata = pd.merge(metadata, temperature, on=['Epiweek'], how='left')
metadata = pd.merge(metadata, cases, on=['Epiweek'], how='left')
metadata

Unnamed: 0,Epiweek,precipitation,population,variation,temperature,cases,year
0,201453,7.326,,,39.330667,,
1,201501,46.448,1.282946e+07,0.900000,37.142143,43.0,2015.0
2,201502,0.013,1.283169e+07,0.900000,36.971643,30.0,2015.0
3,201503,43.581,1.283392e+07,0.900000,37.902000,42.0,2015.0
4,201504,99.522,1.283615e+07,0.900000,33.811714,33.0,2015.0
...,...,...,...,...,...,...,...
466,202349,33.037,1.372268e+07,0.688302,,636.0,2023.0
467,202350,0.323,1.372445e+07,0.688868,,816.0,2023.0
468,202351,13.727,1.372623e+07,0.689434,,721.0,2023.0
469,202352,6.937,1.372800e+07,0.690000,,562.0,2023.0


In [10]:
# Full dataset
dataset = pd.merge(metadata, embeddings, on=['Epiweek'], how='right')
dataset

Unnamed: 0,Epiweek,precipitation,population,variation,temperature,cases,year,Municipality Code,Date,0,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,201601,0.439,1.294546e+07,0.900000,30.026000,495.0,2016.0,10000,2016-01-03,0.241374,...,-0.975199,-1.269957,0.238937,-1.492360,-1.713798,1.239230,0.359771,0.134949,-0.437693,1.264120
1,201602,120.388,1.294769e+07,0.900000,27.062500,404.0,2016.0,10000,2016-01-10,-0.135461,...,0.241989,-2.025665,0.681672,0.321108,0.604627,0.770941,-1.010611,0.023148,0.860643,-0.245002
2,201603,16.389,1.294992e+07,0.900000,27.203714,420.0,2016.0,10000,2016-01-17,-2.683568,...,-1.414230,0.016895,-1.271270,-1.117690,1.665178,0.443690,0.675015,-1.634904,-0.782346,-2.879396
3,201604,27.539,1.295215e+07,0.900000,32.608786,421.0,2016.0,10000,2016-01-24,-0.392921,...,0.362628,-0.503318,-0.482805,-0.436582,-0.155589,-0.857230,-0.140753,1.145457,-0.880416,-0.794366
4,201605,0.000,1.295438e+07,0.900000,34.180857,468.0,2016.0,10000,2016-01-31,1.538756,...,-0.208216,-0.954992,1.775423,-0.886031,1.274171,-0.568689,-0.134025,1.979926,-1.624665,-1.132198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,202348,4.456,1.372091e+07,0.687736,,510.0,2023.0,10000,2023-11-26,1.244006,...,1.371662,0.691372,-0.209624,0.841494,-0.007466,1.263855,-1.558685,-0.126284,0.180389,0.005462
412,202349,33.037,1.372268e+07,0.688302,,636.0,2023.0,10000,2023-12-03,0.386444,...,0.378428,0.982034,-0.763424,0.996093,0.495894,-1.531264,1.056130,-1.314655,-0.968883,2.481470
413,202350,0.323,1.372445e+07,0.688868,,816.0,2023.0,10000,2023-12-10,-0.663760,...,0.250468,0.358316,2.705809,-1.144578,-0.676270,-0.701773,-0.105960,-1.564262,0.753925,-2.086452
414,202351,13.727,1.372623e+07,0.689434,,721.0,2023.0,10000,2023-12-17,-1.707962,...,-0.641040,1.216418,-0.947843,0.017484,0.651593,-0.671834,0.629730,-0.115686,0.213907,0.221282


In [11]:
### Fill missing values 

# Interpolate missing values for 'population', 'variation'
dataset['population'] = dataset['population'].interpolate(method='linear')
dataset['variation'] = dataset['variation'].interpolate(method='linear')

# Impute missing temperature values by forecasting the missing values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor

imputer = IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=10, random_state=0), random_state=0)
imputer.fit(dataset[['temperature']])
dataset['temperature'] = imputer.transform(dataset[['temperature']])
dataset.drop(columns=['Municipality Code', 'Date', 'year'], inplace=True)
dataset

Unnamed: 0,Epiweek,precipitation,population,variation,temperature,cases,0,1,2,3,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,201601,0.439,1.294546e+07,0.900000,30.026000,495.0,0.241374,-1.561932,0.309142,0.761763,...,-0.975199,-1.269957,0.238937,-1.492360,-1.713798,1.239230,0.359771,0.134949,-0.437693,1.264120
1,201602,120.388,1.294769e+07,0.900000,27.062500,404.0,-0.135461,-1.707807,-0.354339,0.824970,...,0.241989,-2.025665,0.681672,0.321108,0.604627,0.770941,-1.010611,0.023148,0.860643,-0.245002
2,201603,16.389,1.294992e+07,0.900000,27.203714,420.0,-2.683568,1.224843,0.817149,-0.550825,...,-1.414230,0.016895,-1.271270,-1.117690,1.665178,0.443690,0.675015,-1.634904,-0.782346,-2.879396
3,201604,27.539,1.295215e+07,0.900000,32.608786,421.0,-0.392921,0.414659,0.140102,-1.350422,...,0.362628,-0.503318,-0.482805,-0.436582,-0.155589,-0.857230,-0.140753,1.145457,-0.880416,-0.794366
4,201605,0.000,1.295438e+07,0.900000,34.180857,468.0,1.538756,-0.343051,0.173756,0.301466,...,-0.208216,-0.954992,1.775423,-0.886031,1.274171,-0.568689,-0.134025,1.979926,-1.624665,-1.132198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,202348,4.456,1.372091e+07,0.687736,27.412054,510.0,1.244006,-0.590205,1.047149,0.202036,...,1.371662,0.691372,-0.209624,0.841494,-0.007466,1.263855,-1.558685,-0.126284,0.180389,0.005462
412,202349,33.037,1.372268e+07,0.688302,27.412054,636.0,0.386444,0.995367,-0.079516,0.315029,...,0.378428,0.982034,-0.763424,0.996093,0.495894,-1.531264,1.056130,-1.314655,-0.968883,2.481470
413,202350,0.323,1.372445e+07,0.688868,27.412054,816.0,-0.663760,-0.805401,0.189657,1.455089,...,0.250468,0.358316,2.705809,-1.144578,-0.676270,-0.701773,-0.105960,-1.564262,0.753925,-2.086452
414,202351,13.727,1.372623e+07,0.689434,27.412054,721.0,-1.707962,-0.988960,-0.488344,-0.919691,...,-0.641040,1.216418,-0.947843,0.017484,0.651593,-0.671834,0.629730,-0.115686,0.213907,0.221282


In [12]:
# Step 1: Get a list of all columns
columns = dataset.columns.tolist()

# Step 2: List of columns to move to the end
columns_to_move = ['Epiweek', 'precipitation', 'population', 'variation', 'temperature', 'cases']

# Get the remaining columns
remaining_columns = [col for col in columns if col not in columns_to_move]

# Concatenate the two lists to form the new column order
new_column_order = remaining_columns + columns_to_move

# Reindex the dataframe with the new column order
dataset = dataset.reindex(columns=new_column_order)

dataset

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1020,1021,1022,1023,Epiweek,precipitation,population,variation,temperature,cases
0,0.241374,-1.561932,0.309142,0.761763,1.078860,1.038276,-2.130718,0.208693,1.016787,-0.113562,...,0.359771,0.134949,-0.437693,1.264120,201601,0.439,1.294546e+07,0.900000,30.026000,495.0
1,-0.135461,-1.707807,-0.354339,0.824970,-1.381245,-0.742811,-0.288823,0.445895,-0.027824,-1.365176,...,-1.010611,0.023148,0.860643,-0.245002,201602,120.388,1.294769e+07,0.900000,27.062500,404.0
2,-2.683568,1.224843,0.817149,-0.550825,-0.816751,1.652855,-0.524891,0.929462,-0.728161,-0.589060,...,0.675015,-1.634904,-0.782346,-2.879396,201603,16.389,1.294992e+07,0.900000,27.203714,420.0
3,-0.392921,0.414659,0.140102,-1.350422,-0.746271,-0.540469,0.311457,1.756462,0.112347,-0.115989,...,-0.140753,1.145457,-0.880416,-0.794366,201604,27.539,1.295215e+07,0.900000,32.608786,421.0
4,1.538756,-0.343051,0.173756,0.301466,-1.253962,0.681083,0.404304,1.364242,0.577944,1.022846,...,-0.134025,1.979926,-1.624665,-1.132198,201605,0.000,1.295438e+07,0.900000,34.180857,468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,1.244006,-0.590205,1.047149,0.202036,-2.700557,-0.005340,1.109713,-1.685690,1.490114,0.629420,...,-1.558685,-0.126284,0.180389,0.005462,202348,4.456,1.372091e+07,0.687736,27.412054,510.0
412,0.386444,0.995367,-0.079516,0.315029,3.171045,0.049635,0.460055,1.517618,0.447097,-1.374521,...,1.056130,-1.314655,-0.968883,2.481470,202349,33.037,1.372268e+07,0.688302,27.412054,636.0
413,-0.663760,-0.805401,0.189657,1.455089,0.259192,-1.646903,-0.912002,-0.322412,-0.162563,-0.678678,...,-0.105960,-1.564262,0.753925,-2.086452,202350,0.323,1.372445e+07,0.688868,27.412054,816.0
414,-1.707962,-0.988960,-0.488344,-0.919691,-1.072049,1.122650,0.170106,1.359195,-1.414566,0.916261,...,0.629730,-0.115686,0.213907,0.221282,202351,13.727,1.372623e+07,0.689434,27.412054,721.0


In [13]:
dataset.to_csv('data/dataset.csv', index=False)

In [2]:
pd.read_csv('data/dataset.csv')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1020,1021,1022,1023,Epiweek,precipitation,population,variation,temperature,cases
0,0.241374,-1.561932,0.309142,0.761763,1.078860,1.038276,-2.130718,0.208693,1.016787,-0.113562,...,0.359771,0.134949,-0.437693,1.264120,201601,0.439,1.294546e+07,0.900000,30.026000,495.0
1,-0.135461,-1.707807,-0.354339,0.824970,-1.381245,-0.742811,-0.288823,0.445895,-0.027824,-1.365176,...,-1.010611,0.023148,0.860643,-0.245002,201602,120.388,1.294769e+07,0.900000,27.062500,404.0
2,-2.683568,1.224843,0.817149,-0.550825,-0.816751,1.652855,-0.524891,0.929462,-0.728161,-0.589060,...,0.675015,-1.634904,-0.782346,-2.879396,201603,16.389,1.294992e+07,0.900000,27.203714,420.0
3,-0.392921,0.414659,0.140102,-1.350422,-0.746271,-0.540469,0.311457,1.756462,0.112347,-0.115989,...,-0.140753,1.145457,-0.880416,-0.794366,201604,27.539,1.295215e+07,0.900000,32.608786,421.0
4,1.538756,-0.343051,0.173756,0.301466,-1.253962,0.681083,0.404304,1.364242,0.577944,1.022846,...,-0.134025,1.979926,-1.624665,-1.132198,201605,0.000,1.295438e+07,0.900000,34.180857,468.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,1.244006,-0.590205,1.047149,0.202036,-2.700557,-0.005340,1.109713,-1.685690,1.490114,0.629420,...,-1.558685,-0.126284,0.180389,0.005462,202348,4.456,1.372091e+07,0.687736,27.412054,510.0
412,0.386444,0.995367,-0.079516,0.315029,3.171045,0.049635,0.460055,1.517618,0.447097,-1.374521,...,1.056130,-1.314655,-0.968883,2.481470,202349,33.037,1.372268e+07,0.688302,27.412054,636.0
413,-0.663760,-0.805401,0.189657,1.455089,0.259192,-1.646903,-0.912002,-0.322412,-0.162563,-0.678678,...,-0.105960,-1.564262,0.753925,-2.086452,202350,0.323,1.372445e+07,0.688868,27.412054,816.0
414,-1.707962,-0.988960,-0.488344,-0.919691,-1.072049,1.122650,0.170106,1.359195,-1.414566,0.916261,...,0.629730,-0.115686,0.213907,0.221282,202351,13.727,1.372623e+07,0.689434,27.412054,721.0
