In [1]:
import pandas as pd
import tarfile
import json
import os
from pandas import json_normalize

import re
from datetime import datetime

In [2]:
def parse_date(filename):
    """Extract the snapshot date encoded in a report archive's file name.

    The base name must contain ``reports_<month><day>_<year>``, for example
    ``reports_apr1_2019.tar.gz`` or ``reports_dec25_2023.tar.gz``.

    Args:
        filename: File name or path; only the base name is inspected.

    Returns:
        A ``datetime`` (midnight) for the day encoded in the name.

    Raises:
        ValueError: If the name does not match the pattern, the month
            abbreviation is not recognised, or day/year do not form a
            valid calendar date.
    """
    basename = os.path.basename(filename)
    # The month group is letters only. With ``\w+`` the greedy match also
    # consumed leading day digits, so "apr10" was split into month "apr1"
    # and day "0" and every two-digit day failed to parse.
    match = re.search(r'reports_([A-Za-z]+)(\d+)_(\d+)', basename)
    if not match:
        raise ValueError(f"Konnte kein Datum aus dem Dateinamen extrahieren: {basename}")

    month, day, year = match.groups()

    # Convert the month abbreviation into its number. ``%b`` follows the
    # active locale, so this expects English abbreviations under the
    # default C/English locale.
    month_num = datetime.strptime(month, '%b').month

    # Build the date directly instead of formatting a string and re-parsing it.
    return datetime(int(year), month_num, int(day))

In [3]:
def read_json_from_tar_and_convert_to_df(tar_file):
    """Load the first JSON file of a gzip-compressed tar archive as a DataFrame.

    Args:
        tar_file: Path to a ``.tar.gz`` archive.

    Returns:
        The parsed JSON document flattened with ``json_normalize``, or
        ``None`` when the archive contains no regular file whose name ends
        in ``.json``. Only the first such member is read.
    """
    with tarfile.open(tar_file, 'r:gz') as tar:
        # Iterate lazily instead of calling getmembers(), which has to scan
        # the whole compressed archive before the first member is inspected.
        for member in tar:
            if not (member.isfile() and member.name.endswith('.json')):
                continue
            # Release the member handle as soon as the document is parsed.
            with tar.extractfile(member) as file:
                data = json.load(file)
            # json_normalize already returns a DataFrame; no extra wrap needed.
            return json_normalize(data)
    return None
    

In [4]:
# Only snapshots dated on or after this day are loaded.
cutoff_date = datetime(2022, 3, 1)

data_list = []
directory = 'data/protondb-data/reports'

# os.listdir() returns entries in arbitrary order; sorting makes the row
# order of the combined frame reproducible between runs.
for file in sorted(os.listdir(directory)):
    if not file.endswith('.tar.gz'):
        continue
    # Printed for every archive encountered, including ones skipped by the cutoff.
    print(f"Extracting file {file}")
    date = parse_date(file)
    if date < cutoff_date:
        continue
    file_path = os.path.join(directory, file)
    data = read_json_from_tar_and_convert_to_df(file_path)
    # The reader returns None when an archive holds no JSON file.
    if data is not None:
        data_list.append(data)

# Concatenate once at the end: calling pd.concat inside the loop re-copies
# the accumulated frame on every iteration. The per-archive indices are kept
# as they are, so index labels repeat across archives.
df = pd.concat(data_list) if data_list else None

Extracting file reports_apr1_2019.tar.gz
Extracting file reports_apr1_2020.tar.gz
Extracting file reports_apr1_2021.tar.gz
Extracting file reports_apr1_2022.tar.gz
Extracting file reports_apr3_2024.tar.gz
Extracting file reports_apr4_2023.tar.gz
Extracting file reports_aug1_2019.tar.gz
Extracting file reports_aug1_2021.tar.gz
Extracting file reports_aug1_2022.tar.gz
Extracting file reports_aug1_2023.tar.gz
Extracting file reports_aug2_2024.tar.gz
Extracting file reports_dec1_2018.tar.gz
Extracting file reports_dec1_2020.tar.gz
Extracting file reports_dec1_2021.tar.gz
Extracting file reports_dec1_2023.tar.gz
Extracting file reports_dec2_2019.tar.gz
Extracting file reports_dec2_2022.tar.gz
Extracting file reports_feb1_2019.tar.gz
Extracting file reports_feb1_2020.tar.gz
Extracting file reports_feb1_2021.tar.gz
Extracting file reports_feb1_2022.tar.gz
Extracting file reports_feb1_2024.tar.gz
Extracting file reports_feb3_2023.tar.gz
Extracting file reports_jan1_2019.tar.gz
Extracting file 

In [5]:
# Persist the combined frame. `data` only holds the reports from the last
# archive read by the loading loop, not the concatenation of all of them.
df.to_pickle('data/reports.pkl')