In [None]:
from pathlib import Path
from gluonts.dataset.common import FileDataset
from collections import defaultdict
import pandas as pd
import numpy as np
import os

# Build one summary row per time series for every parquet dataset found in PATH.
# NOTE(review): PATH is an absolute, machine-specific location — confirm it
# matches the environment before running.
PATH = "/workspaces/data/"

# Dataset codes are the parquet file names without their extension. The listing
# is sorted so the processing order does not depend on the filesystem.
files = sorted(
    os.path.splitext(file)[0]
    for file in os.listdir(PATH)
    if file.endswith('.parquet')
)

# Fixed schema: keeps the expected columns present even when no entry is loaded,
# so the dtype cast and indexing below do not fail with a KeyError.
SUMMARY_COLUMNS = ["Code", "TS ID", "Start Date", "TS Length", "Mean", "Std", "Min", "Max"]

data = []
for file in files:
    dataset = FileDataset(
        path=Path(PATH) / f"{file}.parquet",
        freq="M"
    )
    for entry in dataset:
        target = entry['target']  # looked up once, reused for every statistic
        data.append({
            "Code": file,
            "TS ID": entry['item_id'],
            "Start Date": entry['start'],
            "TS Length": len(target),
            "Mean": np.mean(target),
            "Std": np.std(target),
            "Min": np.min(target),
            "Max": np.max(target),
        })

# Cast the series id to int, then index and sort by (Code, Start Date, TS ID).
df = (
    pd.DataFrame(data, columns=SUMMARY_COLUMNS)
    .astype({"TS ID": int})
    .set_index(["Code", "Start Date", "TS ID"])
    .sort_index()
)
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,TS Length,Mean,Std,Min,Max
Code,Start Date,TS ID,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
CHIKBR,2015-01,11,121,74.008263,101.694603,0.0,505.0
CHIKBR,2015-01,14,121,88.181816,234.754578,1.0,1750.0
CHIKBR,2015-01,15,121,408.404968,469.915466,13.0,2558.0
CHIKBR,2015-01,17,121,317.090912,521.938477,2.0,2917.0
CHIKBR,2015-01,21,121,296.900818,647.155273,1.0,4950.0
...,...,...,...,...,...,...,...
ZIKABR,2016-01,35,107,214.345795,413.371277,26.0,2864.0
ZIKABR,2016-01,41,107,53.149532,179.401382,2.0,1503.0
ZIKABR,2016-01,50,107,90.224297,175.267288,3.0,1133.0
ZIKABR,2016-01,51,107,326.514008,1373.779053,0.0,10658.0


In [18]:
# Export the per-series summary table as a LaTeX longtable.
# The target directory is created first so the export does not fail when
# "outputs/" is missing (e.g. on a fresh checkout).
Path("outputs").mkdir(parents=True, exist_ok=True)
(df.style
 .format(precision=2)  # two decimals for the numeric statistics
 .to_latex(
     "outputs/longtable_diseases.tex",
     hrules=True,
     caption="Details of each Time Series in datasets",
     label="tab:disease_details",
     clines="skip-last;data",
     siunitx=True,
     convert_css=True,
     multirow_align="c",
     multicol_align="c",
     environment="longtable",
 )
)

In [34]:
# One row per dataset code, holding the number of time series it contributes.
df_grouped = (
    df.reset_index()
    .groupby('Code')[['TS ID']]
    .count()
    .rename(columns={'TS ID': '# TS'})
)

In [35]:
# Four-letter disease code -> English disease name.
diseases = {
    "HEPA": "Hepatitis",
    "LEIV": "Visceral Leishmaniasis",
    "LTAN": "Tegumentary Leishmaniasis",
    "CHAG": "Acute Chagas Disease",
    "ESQU": "Schistosomiasis",
    "MALA": "Malaria",
    "RAIV": "Rabies",
    "PEST": "Plague",
    "HANS": "Leprosy",
    "CHIK": "Chikungunya Fever",
    "DENG": "Dengue",
    "ZIKA": "Zika Virus",
    "FMAC": "Spotted Fever",
    "HANT": "Hantavirus",
    "LERD": "RSI/Dort",
    "DERM": "Occupational Dermatoses",
    "PNEU": "Work-Related Pneumoconiosis",
    "DIFT": "Diphtheria",
    "TETA": "Accidental Tetanus",
    "TETN": "Neonatal Tetanus",
    "MENI": "Meningitis",
    "COQU": "Whooping Cough",
    "VARC": "Chickenpox",
    "COLE": "Cholera",
    "SDTA": "Foodborne Disease Outbreak",
    "FTIF": "Typhoid Fever",
    "LEPT": "Leptospirosis",
    "TOXC": "Congenital Toxoplasmosis",
    "TOXG": "Gestational Toxoplasmosis",
    "TRAC": "Trachoma Survey",
    "SIFA": "Acquired Syphilis",
    "SIFC": "Congenital Syphilis",
    "SIFG": "Gestational Syphilis",
    "BOTU": "Botulism",
    "EXAN": "Exanthematous Disease",
}


In [36]:
def _disease_name(code):
    """Return the disease name for a dataset code, ignoring its last two characters."""
    return diseases.get(code[:-2], 'Unknown')


# Label every entry of the df_grouped index with its disease name; prefixes
# that are missing from `diseases` are labelled 'Unknown'.
df_grouped['Disease Name'] = df_grouped.index.map(_disease_name)


In [37]:
# Order the summary by its index, then move that index into a regular column.
df_grouped = df_grouped.sort_index().reset_index()
df_grouped

Unnamed: 0,Code,# TS,Disease Name
0,CHIKBR,23,Chikungunya Fever
1,DENGBR,27,Dengue
2,ESQUBR,5,Schistosomiasis
3,EXANBR,9,Exanthematous Disease
4,HANSBR,18,Leprosy
5,HEPABR,15,Hepatitis
6,LEIVBR,5,Visceral Leishmaniasis
7,LEPTBR,10,Leptospirosis
8,LTANBR,10,Tegumentary Leishmaniasis
9,MENIBR,10,Meningitis


In [40]:
# Export the per-disease summary as a LaTeX table.
# The target directory is created first so the export does not fail when
# "outputs/" is missing (e.g. on a fresh checkout).
Path("outputs").mkdir(parents=True, exist_ok=True)

# Column order used in the exported table.
df_grouped = df_grouped[['Code', 'Disease Name', '# TS']]
df_grouped.to_latex(
    "outputs/disease_summary.tex",
    index=False,
    caption="Summary of Time Series Data by Disease",
    label="tab:disease_summary",
    column_format="llr",
    longtable=False,
    # The '# TS' header contains '#', a LaTeX special character; it must be
    # escaped or the generated table does not compile.
    escape=True,
)