In [36]:
import urllib, json
import urllib.request
from pathlib import Path

import pandas as pd
from mimesis import Field, Fieldset, Schema
from mimesis import Generic
from mimesis import Address
from mimesis import Datetime
from mimesis import Numeric
from mimesis.locales import Locale

from constants import DATA_PROVIDERS
from constants import ALPHANUMERIC


In [37]:
# Location of the JSON document whose per-property "enum" lists are sampled
# when generating records further down.
SCHEMA_URL = "https://raw.githubusercontent.com/PHACDataHub/tb-safe-inputs/tb-upload/ui/src/schema/test.json"

# Directory (relative to the current working directory) that receives the
# generated parquet output.
DATA_DIR = Path(".") / "data"

In [28]:
# Download the JSON document at SCHEMA_URL and parse it into `schema_data`.
# The `with` block closes the HTTP response even if reading or parsing
# raises, so the connection is not left open for the life of the kernel.
with urllib.request.urlopen(SCHEMA_URL) as resp:
    schema_data = json.load(resp)

In [48]:
# mimesis providers used to fabricate the individual field values.
# All locale-aware providers are pinned to Canadian English.
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()


def _enum_choice(name):
    """Return one random value from the ``enum`` list of property ``name``.

    The list is read from ``schema_data['properties'][name]['enum']``; a
    ``KeyError`` is raised if the property or its ``enum`` entry is missing.
    """
    return generic.choice(schema_data['properties'][name]['enum'])


# The lambda is evaluated once per generated record, so every record gets
# freshly drawn values. Key order below is the resulting column order.
schema = Schema(
    schema=lambda: {
        'YearReceived': dt.year(minimum=2022, maximum=2023),
        'MonthReceived': dt.month(),
        'DataProvider': generic.choice(DATA_PROVIDERS),
        'RegisterCaseNumber': field("increment"),
        'Gender': _enum_choice('Gender'),
        'PostalCode': address.postal_code(),
        'DiagnosisDate': dt.date(start=2000, end=2023),
        'ICDOption': _enum_choice('ICDOption'),
        'ICD9': generic.choice(['010', '011', '012', '013', '014', '015']),
        'ICD10': generic.choice(['A15', 'A16', 'A17']),
        'ChestXRay': _enum_choice('ChestXRay'),
        'IfAbnormal': _enum_choice('IfAbnormal'),
        'CaseCriteria': _enum_choice('CaseCriteria'),
        'Initial_Resistance': _enum_choice('Initial_Resistance'),
        # This key used to be listed twice; the second entry only overwrote
        # the first, so a single entry yields the same column.
        'GenotypingResults': _enum_choice('GenotypingResults'),
        'GenotypingSpoligotyping': numeric.integer_number(start=1, end=10**15),
        # Draws 24 items from ALPHANUMERIC.
        'MIRU': generic.choice(ALPHANUMERIC, length=24),
        'PreviousTreatmentCompleted': _enum_choice('PreviousTreatmentCompleted'),
        'HIVStatus': _enum_choice('HIVStatus'),
        'Date_Of_HIV_Test': dt.datetime(start=2019, end=2023),
        'TravelTBCountry': _enum_choice('TravelTBCountry'),
        # None is one of the candidates, so some records carry no value here.
        'HowLong': generic.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100, None]),
        'patientDiedBeforeDuring': _enum_choice('patientDiedBeforeDuring'),
        'DidPatientDie': _enum_choice('DidPatientDie'),
        'DateOfDeath': dt.datetime(start=2019, end=2023),
        'CauseOfDeath': _enum_choice('CauseOfDeath'),
        'FirstEpisodeOfTB': _enum_choice('FirstEpisodeOfTB'),
        'PreviousDiagnosisYear': dt.year(minimum=2010, maximum=2022),
    },
    iterations=1000,
)

In [49]:
# Generate the records defined by `schema` and load them into a DataFrame.
df = pd.DataFrame(data=schema.create())

# Bare trailing expression so the notebook renders the frame as cell output.
df

Unnamed: 0,YearReceived,MonthReceived,DataProvider,RegisterCaseNumber,Gender,PostalCode,DiagnosisDate,ICDOption,ICD9,ICD10,...,HIVStatus,Date_Of_HIV_Test,TravelTBCountry,HowLong,patientDiedBeforeDuring,DidPatientDie,DateOfDeath,CauseOfDeath,FirstEpisodeOfTB,PreviousDiagnosisYear
0,2023,May,MB - Other,1,F,A9H 5H3,2005-07-30,ICD10,013,A15,...,Test not offered,2021-11-04 13:56:19.679109,yes,4.0,unchecked,Yes-during treatment,2021-02-22 02:56:14.442418,TB did not contribute to death,3,2018
1,2022,September,Sask,2,F,Z0D 2R7,2021-12-23,ICD9,014,A16,...,Positive,2022-08-26 22:44:28.441727,no,,unchecked,Yes-during treatment,2023-09-28 08:54:19.228545,Unknown,2,2012
2,2023,December,BC,3,F,J8L 7E8,2006-02-20,ICD9,012,A15,...,Negative,2019-07-22 07:05:11.131697,unknown,5.0,checked,Yes-before treatment,2020-01-03 00:13:10.813407,Unknown,3,2014
3,2023,September,Quebec,4,M,T7V 9W2,2003-11-26,ICD9,013,A16,...,Test refused,2019-07-02 14:22:03.978479,yes,9.0,checked,Unknown,2023-01-11 11:41:04.625495,TB contributed to death but was not the underl...,1,2022
4,2022,September,Ontario Hospital Toronto 1,5,U,S8K 6Q3,2016-12-19,ICD10,013,A15,...,Positive,2022-07-05 00:51:38.959071,no,,checked,No,2019-08-20 12:51:29.563591,TB contributed to death but was not the underl...,3,2020
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2022,November,PEI,996,U,S0U 9U3,2020-10-12,ICD10,010,A15,...,Test refused,2019-05-01 00:17:02.945306,unknown,5.0,checked,Yes-before treatment,2021-07-13 02:56:45.086329,TB was the cause of death,1,2018
996,2022,August,NWT,997,M,P3N 6D5,2012-07-13,ICD9,011,A16,...,Test refused,2020-02-19 19:09:41.287901,yes,2.0,checked,Yes-before treatment,2019-04-01 02:37:41.609056,TB did not contribute to death,3,2013
997,2022,February,Ontario Hospital Toronto 1,998,M,F6O 8Y7,2015-10-21,ICD10,013,A15,...,Test not offered,2023-12-29 10:16:16.444150,no,5.0,unchecked,Yes-during treatment,2023-10-07 07:53:51.238742,TB contributed to death but was not the underl...,1,2016
998,2022,January,NWT,999,M,U1Z 7J9,2021-07-31,ICD10,015,A15,...,Test not offered,2023-02-09 05:40:31.190682,yes,2.0,checked,Unknown,2023-11-04 16:49:56.932938,TB was the cause of death,2,2010


In [50]:
# Write the generated frame as a parquet dataset under DATA_DIR, partitioned
# by the year the record was received and by data provider.
# NOTE(review): depending on the parquet engine, re-running this cell may add
# files next to output from an earlier run rather than replace it - confirm.
df.to_parquet(
    path=DATA_DIR / "parquet_test",
    partition_cols=['YearReceived', 'DataProvider'],
)