In [None]:
import json
import urllib
import urllib.request
from pathlib import Path

import pandas as pd

from mimesis import Field, Fieldset, Schema
from mimesis import Generic
from mimesis import Address
from mimesis import Datetime
from mimesis import Numeric
from mimesis.locales import Locale

from constants import DATA_PROVIDERS
from constants import ALPHANUMERIC


In [None]:
# Location of the JSON document whose `properties[...]['enum']` lists are
# used further down to pick values for the categorical columns.
SCHEMA_URL = "https://raw.githubusercontent.com/PHACDataHub/tb-safe-inputs/tb-upload/ui/src/schema/test.json"

# Root directory of the partitioned Parquet dataset written by the last cell
# (relative to the notebook's working directory).
DATA_DIR = Path(".") / "data" / "parquet_test"

In [None]:
# Download the JSON schema and parse it into a dict.
# `urllib.request` must be imported explicitly (a bare `import urllib` does not
# load the submodule), and the response is used as a context manager so the
# underlying connection is closed as soon as the body has been read.
with urllib.request.urlopen(SCHEMA_URL) as resp:
    schema_data = json.load(resp)

In [None]:
# Mimesis providers used to fabricate the individual column values.
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()


def _enum_choice(prop: str):
    """Return one random value from the `enum` list of `prop` in the downloaded schema.

    Args:
        prop: Key under `schema_data['properties']`; the entry must have an
            `'enum'` list.

    Raises:
        KeyError: If `prop` is missing from the schema or has no `'enum'` entry.
    """
    return generic.choice(schema_data['properties'][prop]['enum'])


# Each evaluation of the lambda builds one synthetic record as a dict; the keys
# become the DataFrame columns, in this order. Categorical columns draw from the
# enum lists of the downloaded schema so the values stay in sync with it.
schema = Schema(
    schema=lambda: {
        'YearReceived': dt.year(minimum=2022, maximum=2023),
        'MonthReceived': dt.month(),
        'DataProvider': generic.choice(DATA_PROVIDERS),
        'RegisterCaseNumber': field("increment"),
        'Gender': _enum_choice('Gender'),
        'PostalCode': address.postal_code(),
        'DiagnosisDate': dt.date(start=2000, end=2023),
        'ICDOption': _enum_choice('ICDOption'),
        'ICD9': generic.choice(['010', '011', '012', '013', '014', '015']),
        'ICD10': generic.choice(['A15', 'A16', 'A17']),
        'ChestXRay': _enum_choice('ChestXRay'),
        'IfAbnormal': _enum_choice('IfAbnormal'),
        'CaseCriteria': _enum_choice('CaseCriteria'),
        'Initial_Resistance': _enum_choice('Initial_Resistance'),
        # This key was listed twice in the original dict literal; the second
        # entry only overwrote the first, so a single entry is kept.
        'GenotypingResults': _enum_choice('GenotypingResults'),
        'GenotypingSpoligotyping': numeric.integer_number(start=1, end=10**15),
        # Draws 24 items from ALPHANUMERIC (defined in the local `constants` module).
        'MIRU': generic.choice(ALPHANUMERIC, length=24),
        'PreviousTreatmentCompleted': _enum_choice('PreviousTreatmentCompleted'),
        'HIVStatus': _enum_choice('HIVStatus'),
        'Date_Of_HIV_Test': dt.datetime(start=2019, end=2023),
        'TravelTBCountry': _enum_choice('TravelTBCountry'),
        # None is a valid outcome here, so this column can contain missing values.
        'HowLong': generic.choice([1,2,3,4,5,6,7,8,9,10,15,20,50,100, None]),
        'patientDiedBeforeDuring': _enum_choice('patientDiedBeforeDuring'),
        'DidPatientDie': _enum_choice('DidPatientDie'),
        'DateOfDeath': dt.datetime(start=2019, end=2023),
        'CauseOfDeath': _enum_choice('CauseOfDeath'),
        'FirstEpisodeOfTB': _enum_choice('FirstEpisodeOfTB'),
        'PreviousDiagnosisYear': dt.year(minimum=2010, maximum=2022),
    },
    iterations=1000,
)

In [None]:
# Materialise the generated records into a DataFrame (one row per record)
# and show it as the cell output.
df = pd.DataFrame(data=schema.create())
df

In [None]:
# Persist the synthetic records under DATA_DIR as a Parquet dataset,
# partitioned by the YearReceived and DataProvider columns.
df.to_parquet(
    path=DATA_DIR,
    partition_cols=["YearReceived", "DataProvider"],
)