In [9]:
import urllib, json
import urllib.request
from pathlib import Path

import pandas as pd
from mimesis import Address
from mimesis import Datetime
from mimesis import Field, Fieldset, Schema
from mimesis import Generic
from mimesis import Numeric
from mimesis import Text
from mimesis.locales import Locale

from constants import *


In [10]:
# Remote JSON schema that supplies the `enum` value lists used when generating records.
SCHEMA_URL = "https://raw.githubusercontent.com/PHACDataHub/tb-safe-inputs/tb-upload/ui/src/schema/schema.json"

# Workbook the generated DataFrame is written to, relative to the working directory.
DATA_FILE = Path(".") / "data" / "BC.xlsx"

In [11]:
# Download and parse the JSON schema referenced by SCHEMA_URL.
# `urllib.request` must be imported explicitly (a bare `import urllib` does not
# load the submodule), and the `with` block closes the HTTP response once the
# body has been read.
with urllib.request.urlopen(SCHEMA_URL) as resp:
    schema_data = json.loads(resp.read())

In [12]:
# Mimesis providers used by the record factory below.
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()


def _enum_choice(property_name):
    """Return one random value from the `enum` list of a schema property.

    Args:
        property_name: key under ``schema_data['properties']`` whose ``enum``
            list is sampled.

    Raises:
        KeyError: if the property or its ``enum`` entry is missing from the
            downloaded schema.
    """
    return generic.choice(schema_data['properties'][property_name]['enum'])


# Record factory driven by the downloaded JSON schema: every call of the lambda
# yields one dict, and `iterations` sets how many records `schema.create()` returns.
# NOTE(review): the following cell rebinds `schema` before `schema.create()` is
# called, so this definition is currently unused when the notebook runs top to bottom.
schema = Schema(
    schema=lambda: {
        'YearReceived': dt.year(minimum=2022, maximum=2023),
        'MonthReceived': dt.month(),
        'DataProvider': generic.choice(DATA_PROVIDERS),
        'RegisterCaseNumber': field("increment"),
        'Gender': _enum_choice('Gender'),
        'PostalCode': address.postal_code(),
        'DiagnosisDate': dt.date(start=2000, end=2023),
        'ICDOption': _enum_choice('ICDOption'),
        'ICD9': generic.choice(['010', '011', '012', '013', '014', '015']),
        'ICD10': generic.choice(['A15', 'A16', 'A17']),
        'ChestXRay': _enum_choice('ChestXRay'),
        'IfAbnormal': _enum_choice('IfAbnormal'),
        'CaseCriteria': _enum_choice('CaseCriteria'),
        'Initial_Resistance': _enum_choice('Initial_Resistance'),
        # The original dict listed this key twice; the second entry silently
        # overwrote the first, so a single entry is kept.
        'GenotypingResults': _enum_choice('GenotypingResults'),
        'GenotypingSpoligotyping': numeric.integer_number(start=1, end=10**15),
        'MIRU': generic.choice(ALPHANUMERIC, length=24),
        'PreviousTreatmentCompleted': _enum_choice('PreviousTreatmentCompleted'),
        'HIVStatus': _enum_choice('HIVStatus'),
        'Date_Of_HIV_Test': dt.datetime(start=2019, end=2023),
        'TravelTBCountry': _enum_choice('TravelTBCountry'),
        'HowLong': generic.choice([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 50, 100, None]),
        'patientDiedBeforeDuring': _enum_choice('patientDiedBeforeDuring'),
        'DidPatientDie': _enum_choice('DidPatientDie'),
        'DateOfDeath': dt.datetime(start=2019, end=2023),
        'CauseOfDeath': _enum_choice('CauseOfDeath'),
        'FirstEpisodeOfTB': _enum_choice('FirstEpisodeOfTB'),
        'PreviousDiagnosisYear': dt.year(minimum=2010, maximum=2022),
    },
    iterations=1000,
)

### PTs file - BC

In [13]:
# Provider instances for the BC file generator.
text = Text()
numeric = Numeric()
dt = Datetime(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
field = Field(locale=Locale.EN_CA)


def _bc_record():
    """Build one synthetic row for the BC file as a dict keyed by column name.

    Columns are inserted in the order they should appear in the resulting
    DataFrame. `classification_year`, `country_of_birth_combined`,
    `immigration_arrival_date_combine` and `immigration_status_combined` are
    not produced here; a later cell derives them from the generated frame.
    """
    rs_flags = ['R', 'S']

    record = {
        'client_id_phac': field("increment"),
        'classification_date': dt.date(start=2013, end=2023),
        'classification': generic.choice(['Clinical', 'Confirmed']),
    }

    # One R/S flag per drug-resistance column.
    for drug_column in (
        'isoniazid_resistance',
        'ethambutol_resistance',
        'rifampin_resistance',
        'pyrazinamide_resistance',
        'amikacin_resistance',
        'capreomycin_resistance',
        'ethionamide_resistance',
        'kanamycin_resistance',
        'moxifloxacin_resistance',
        'ofloxacin_resistance',
        'linezolid_resistance',
        'para_aminosalicylate_resistance',
        'rifabutin_resistance',
        'streptomycin_resistance',
    ):
        record[drug_column] = generic.choice(rs_flags)

    record['age_at_classification_date_years'] = numeric.integer_number(start=16, end=80)
    record['gender'] = generic.choice(['Male', 'Female', ''])
    record['origin'] = generic.choice(ORIGINS)
    record['method_of_detection'] = generic.choice(CASE_FINDING)
    record['tb_body_site_category_phac'] = text.quote()

    # Columns drawn from RISK_FACTORS; `hiv_status` sits between them so the
    # column order stays unchanged.
    for risk_column in (
        'previous_abnormal_chest_xray',
        'tb_contact_within_2_years',
        'diabetes_mellitus',
        'kidney_disease_requiring_dialysi',
        'homelessness',
        'longterm_corticosteriod_use',
        'injection_drug_use',
        'solid_organ_transplant_candidate',
    ):
        record[risk_column] = generic.choice(RISK_FACTORS)
    record['hiv_status'] = generic.choice(HIV_STATUS)
    for risk_column in ('alcohol_use', 'tobacco_use'):
        record[risk_column] = generic.choice(RISK_FACTORS)

    return record


# 1000 rows are generated per `schema.create()` call.
schema = Schema(schema=_bc_record, iterations=1000)

In [14]:
# Materialise the generated records (a list of dicts) as a DataFrame and display it.
generated_rows = schema.create()
df = pd.DataFrame(generated_rows)
df

Unnamed: 0,client_id_phac,classification_date,classification,isoniazid_resistance,ethambutol_resistance,rifampin_resistance,pyrazinamide_resistance,amikacin_resistance,capreomycin_resistance,ethionamide_resistance,...,tb_contact_within_2_years,diabetes_mellitus,kidney_disease_requiring_dialysi,homelessness,longterm_corticosteriod_use,injection_drug_use,solid_organ_transplant_candidate,hiv_status,alcohol_use,tobacco_use
0,1,2018-10-16,Confirmed,R,R,R,S,R,S,R,...,Unknown,,No,Yes,Unknown,Unknown,Yes,Unknown,Yes,
1,2,2015-04-29,Clinical,R,S,S,S,S,S,R,...,Yes,Yes,Unknown,,,,No,Negative,Unknown,
2,3,2019-08-19,Confirmed,R,R,S,R,S,R,S,...,,Unknown,Unknown,No,Yes,,Unknown,Test refused,Yes,Unknown
3,4,2020-08-09,Clinical,R,S,S,S,S,S,S,...,Yes,Yes,,Unknown,,No,Yes,Positive,Yes,Yes
4,5,2020-09-19,Confirmed,S,R,S,R,S,S,S,...,No,,Yes,Yes,Yes,Unknown,Yes,Negative,No,Unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2022-01-29,Clinical,S,S,R,R,R,S,R,...,Yes,,No,,No,Unknown,Unknown,Test not offered,Unknown,Yes
996,997,2013-09-01,Confirmed,R,S,S,S,S,S,R,...,No,,Unknown,Unknown,Yes,No,Unknown,Test not offered,,
997,998,2014-02-17,Clinical,S,R,S,S,S,S,R,...,Yes,Yes,Yes,Unknown,,,Yes,Unknown,,Yes
998,999,2021-03-15,Confirmed,S,S,R,S,S,R,R,...,No,Unknown,Yes,Unknown,Unknown,Yes,No,Test not offered,,Unknown


In [15]:
def country(origin):
    """Pick a country of birth that is consistent with `origin`.

    'Foreign Born' draws from a fixed list of non-Canadian entries, 'Unknown'
    draws from that same list extended with 'Canada' and 'Unknown', and any
    other value yields "Canada".
    """
    if origin != 'Foreign Born' and origin != 'Unknown':
        return "Canada"

    candidates = [
        'United States', 'Taiwan (Province of China)', 'Iran, Islamic Republic of',
        'Korea', 'Macao', 'Syrian Arab Republic', 'Hong Kong', 'Tibet', 'Philipines',
        'India', 'China', 'Germany', 'Australia', 'Iceland', 'Finland', 'Ukraine',
        'Argentina', 'UK', 'Ireland',
    ]
    if origin == 'Unknown':
        # Unknown origin may also resolve to Canada or stay unknown.
        candidates = candidates + ['Canada', 'Unknown']
    return generic.choice(candidates)

def immigration_date(origin):
    """Return a random date between 1950 and 2023 for 'Foreign Born' or 'Unknown' origins.

    Any other origin yields None.
    """
    if origin == "Foreign Born" or origin == "Unknown":
        return dt.date(start=1950, end=2023)
    return None

def immigration_status(origin):
    """Return a random entry of IMMIGRATION_STATUS for 'Foreign Born' or 'Unknown' origins.

    Any other origin yields None.
    """
    if origin == "Foreign Born" or origin == "Unknown":
        return generic.choice(IMMIGRATION_STATUS)
    return None
 
# Year component of each classification date.
df['classification_year'] = df['classification_date'].map(lambda when: when.year)

# Columns derived from `origin`, filled in this order so that each column is
# completed before the next one starts drawing values.
origin_derived = {
    'country_of_birth_combined': country,
    'immigration_arrival_date_combine': immigration_date,
    'immigration_status_combined': immigration_status,
}
for column_name, derive in origin_derived.items():
    df[column_name] = df['origin'].map(derive)
df


Unnamed: 0,client_id_phac,classification_date,classification,isoniazid_resistance,ethambutol_resistance,rifampin_resistance,pyrazinamide_resistance,amikacin_resistance,capreomycin_resistance,ethionamide_resistance,...,longterm_corticosteriod_use,injection_drug_use,solid_organ_transplant_candidate,hiv_status,alcohol_use,tobacco_use,classification_year,country_of_birth_combined,immigration_arrival_date_combine,immigration_status_combined
0,1,2018-10-16,Confirmed,R,R,R,S,R,S,R,...,Unknown,Unknown,Yes,Unknown,Yes,,2018,Canada,,
1,2,2015-04-29,Clinical,R,S,S,S,S,S,R,...,,,No,Negative,Unknown,,2015,Canada,,
2,3,2019-08-19,Confirmed,R,R,S,R,S,R,S,...,Yes,,Unknown,Test refused,Yes,Unknown,2019,Canada,,
3,4,2020-08-09,Clinical,R,S,S,S,S,S,S,...,,No,Yes,Positive,Yes,Yes,2020,Tibet,2019-04-01,Other Current Immigration Status
4,5,2020-09-19,Confirmed,S,R,S,R,S,S,S,...,Yes,Unknown,Yes,Negative,No,Unknown,2020,Ukraine,2016-09-22,Work Permit
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2022-01-29,Clinical,S,S,R,R,R,S,R,...,No,Unknown,Unknown,Test not offered,Unknown,Yes,2022,Canada,,
996,997,2013-09-01,Confirmed,R,S,S,S,S,S,R,...,Yes,No,Unknown,Test not offered,,,2013,Canada,,
997,998,2014-02-17,Clinical,S,R,S,S,S,S,R,...,,,Yes,Unknown,,Yes,2014,Canada,,
998,999,2021-03-15,Confirmed,S,S,R,S,S,R,R,...,Unknown,Yes,No,Test not offered,,Unknown,2021,Tibet,2003-08-08,Foreign-born Canadian Citizen


In [16]:
# Write the generated frame to the workbook at DATA_FILE without the index column.
# The parent directory is created first: `to_excel` fails when the target
# directory does not exist, which would otherwise break a run from a fresh checkout.
output_path = Path(DATA_FILE)
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_excel(output_path, index=False)