In [81]:
from mimesis import Field, Fieldset, Schema
from mimesis import Generic
from mimesis import Address
from mimesis import Datetime
from mimesis import Numeric
from mimesis.locales import Locale
from mimesis import Text

import pandas as pd

import urllib, json
from pathlib import Path

from constants import *
from datetime import date, timedelta


In [82]:
# Raw GitHub URL of the JSON schema file (schema.json); it is downloaded and parsed with `json` in the following cell.
SCHEMA_URL="https://raw.githubusercontent.com/PHACDataHub/tb-safe-inputs/tb-upload/ui/src/schema/schema.json"

In [83]:
# `import urllib` by itself does not load the `request` submodule, so import it
# explicitly; otherwise `urllib.request` can raise AttributeError on a fresh kernel.
import urllib.request

# Download the JSON schema and parse it into a dict. The `with` block makes sure
# the HTTP response is closed even if reading or parsing fails.
with urllib.request.urlopen(SCHEMA_URL) as resp:
    schema_data = json.loads(resp.read())

In [84]:
# Mimesis providers used by the row factory below (Canadian English locale).
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()


def _enum_choice(prop):
    """Return a random value from the `enum` list declared for `prop` in the downloaded JSON schema."""
    return generic.choice(schema_data['properties'][prop]['enum'])


# Synthetic case-report rows whose categorical values are drawn from the enums
# of the downloaded JSON schema. Each key appears exactly once: the original
# listed 'GenotypingResults' twice, so the second entry silently overwrote the
# first and wasted a random draw.
# NOTE(review): `schema` is rebound by the BC cell further down before
# `.create()` is called on it, so this definition is currently never materialised.
schema = Schema(
    schema=lambda: {
        'YearReceived': dt.year(minimum=2022, maximum=2023),
        'MonthReceived': dt.month(),
        'DataProvider': generic.choice(DATA_PROVIDERS),
        'RegisterCaseNumber': field("increment"),
        'Gender': _enum_choice('Gender'),
        'PostalCode': address.postal_code(),
        'DiagnosisDate': dt.date(start=2000, end=2023),
        'ICDOption': _enum_choice('ICDOption'),
        'ICD9': generic.choice(['010', '011', '012', '013', '014', '015']),
        'ICD10': generic.choice(['A15', 'A16', 'A17']),
        'ChestXRay': _enum_choice('ChestXRay'),
        'IfAbnormal': _enum_choice('IfAbnormal'),
        'CaseCriteria': _enum_choice('CaseCriteria'),
        'Initial_Resistance': _enum_choice('Initial_Resistance'),
        'GenotypingResults': _enum_choice('GenotypingResults'),
        'GenotypingSpoligotyping': numeric.integer_number(start=1, end=10**15),
        'MIRU': generic.choice(ALPHANUMERIC, length=24),
        'PreviousTreatmentCompleted': _enum_choice('PreviousTreatmentCompleted'),
        'HIVStatus': _enum_choice('HIVStatus'),
        'Date_Of_HIV_Test': dt.datetime(start=2019, end=2023),
        'TravelTBCountry': _enum_choice('TravelTBCountry'),
        'HowLong': generic.choice([1,2,3,4,5,6,7,8,9,10,15,20,50,100, None]),
        'patientDiedBeforeDuring': _enum_choice('patientDiedBeforeDuring'),
        'DidPatientDie': _enum_choice('DidPatientDie'),
        'DateOfDeath': dt.datetime(start=2019, end=2023),
        'CauseOfDeath': _enum_choice('CauseOfDeath'),
        'FirstEpisodeOfTB': _enum_choice('FirstEpisodeOfTB'),
        'PreviousDiagnosisYear': dt.year(minimum=2010, maximum=2022),
    },
    iterations=1000,
)

### PTs file - BC

In [85]:
# Fresh provider instances for the BC case file.
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()
text = Text()

# Drug-susceptibility columns, in output order; each one holds 'R' or 'S'.
_BC_RESISTANCE_COLUMNS = (
    'isoniazid_resistance',
    'ethambutol_resistance',
    'rifampin_resistance',
    'pyrazinamide_resistance',
    'amikacin_resistance',
    'capreomycin_resistance',
    'ethionamide_resistance',
    'kanamycin_resistance',
    'moxifloxacin_resistance',
    'ofloxacin_resistance',
    'linezolid_resistance',
    'para_aminosalicylate_resistance',
    'rifabutin_resistance',
    'streptomycin_resistance',
)
_BC_RESISTANCE_RESULTS = ['R', 'S']

# Columns filled from RISK_FACTORS; 'hiv_status' sits between the two groups
# in the output, so they are kept separate to preserve the column order.
_BC_RISK_COLUMNS_BEFORE_HIV = (
    'previous_abnormal_chest_xray',
    'tb_contact_within_2_years',
    'diabetes_mellitus',
    'kidney_disease_requiring_dialysi',
    'homelessness',
    'longterm_corticosteriod_use',
    'injection_drug_use',
    'solid_organ_transplant_candidate',
)
_BC_RISK_COLUMNS_AFTER_HIV = ('alcohol_use', 'tobacco_use')


def _bc_case_record():
    """Build one synthetic BC case row as a dict, keys in output column order.

    classification_year, country_of_birth_combined,
    immigration_arrival_date_combine and immigration_status_combined are not
    generated here; a later cell derives them from the resulting DataFrame.
    """
    record = {
        'client_id_phac': field("increment"),
        'classification_date': dt.date(start=2013, end=2023),
        'classification': generic.choice(['Clinical', 'Confirmed']),
    }
    for column in _BC_RESISTANCE_COLUMNS:
        record[column] = generic.choice(_BC_RESISTANCE_RESULTS)
    record['age_at_classification_date_years'] = numeric.integer_number(start=16, end=80)
    record['gender'] = generic.choice(['Male', 'Female', ''])
    record['origin'] = generic.choice(ORIGINS)
    record['method_of_detection'] = generic.choice(CASE_FINDING)
    record['tb_body_site_category_phac'] = text.quote()
    for column in _BC_RISK_COLUMNS_BEFORE_HIV:
        record[column] = generic.choice(RISK_FACTORS)
    record['hiv_status'] = generic.choice(HIV_STATUS)
    for column in _BC_RISK_COLUMNS_AFTER_HIV:
        record[column] = generic.choice(RISK_FACTORS)
    return record


schema = Schema(schema=_bc_case_record, iterations=1000)

In [86]:
# Generate the synthetic BC case rows and load them into a DataFrame for display.
bc_case_records = schema.create()
df = pd.DataFrame(bc_case_records)
df

Unnamed: 0,client_id_phac,classification_date,classification,isoniazid_resistance,ethambutol_resistance,rifampin_resistance,pyrazinamide_resistance,amikacin_resistance,capreomycin_resistance,ethionamide_resistance,...,tb_contact_within_2_years,diabetes_mellitus,kidney_disease_requiring_dialysi,homelessness,longterm_corticosteriod_use,injection_drug_use,solid_organ_transplant_candidate,hiv_status,alcohol_use,tobacco_use
0,1,2013-07-10,Clinical,R,S,R,R,S,R,S,...,No,Yes,Yes,Unknown,Yes,Unknown,Unknown,Unknown,No,Yes
1,2,2021-07-10,Confirmed,R,R,R,R,S,S,R,...,Yes,Yes,Unknown,No,No,,No,Unknown,Yes,
2,3,2017-07-25,Clinical,S,S,S,R,S,R,S,...,Yes,Yes,No,Unknown,Yes,Unknown,Unknown,Test refused,,Yes
3,4,2019-04-10,Confirmed,S,R,S,S,R,R,R,...,No,,,No,Yes,Unknown,Yes,Unknown,No,No
4,5,2021-12-20,Confirmed,S,S,S,S,S,S,S,...,Unknown,Yes,No,No,Yes,Yes,Unknown,Unknown,Yes,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2017-04-26,Confirmed,S,R,R,S,R,S,R,...,,,Yes,Unknown,Unknown,,Yes,Test not offered,,
996,997,2022-09-22,Clinical,R,S,S,S,S,S,S,...,No,Unknown,No,Yes,Yes,No,,Negative,Yes,
997,998,2017-12-28,Confirmed,R,R,R,S,S,R,S,...,No,Yes,,Yes,No,Yes,Yes,Test refused,Yes,Yes
998,999,2021-04-30,Confirmed,R,S,S,S,R,S,R,...,,No,Unknown,Unknown,Unknown,No,,Unknown,,


In [87]:
# Countries a foreign-born case can be assigned.
_FOREIGN_BIRTH_COUNTRIES = [
    'United States', 'Taiwan (Province of China)', 'Iran, Islamic Republic of',
    'Korea', 'Macao', 'Syrian Arab Republic', 'Hong Kong', 'Tibet', 'Philipines',
    'India', 'China', 'Germany', 'Australia', 'Iceland', 'Finland', 'Ukraine',
    'Argentina', 'UK', 'Ireland',
]


def country(origin):
    """Pick a country of birth that is consistent with `origin`.

    'Foreign Born' draws from the foreign country list, 'Unknown' draws from
    that list plus 'Canada' and 'Unknown', and any other origin yields "Canada".
    """
    if origin == 'Foreign Born':
        return generic.choice(_FOREIGN_BIRTH_COUNTRIES)
    if origin == 'Unknown':
        return generic.choice(_FOREIGN_BIRTH_COUNTRIES + ['Canada', 'Unknown'])
    return "Canada"

def immigration_date(origin):
    """Return a random arrival date (years 1950-2023) for 'Foreign Born' or 'Unknown' origins, otherwise None."""
    may_have_immigrated = origin == "Foreign Born" or origin == "Unknown"
    if not may_have_immigrated:
        return None
    return dt.date(start=1950, end=2023)

def immigration_status(origin):
    """Return a random entry of IMMIGRATION_STATUS for 'Foreign Born' or 'Unknown' origins, otherwise None."""
    may_have_immigrated = origin == "Foreign Born" or origin == "Unknown"
    return generic.choice(IMMIGRATION_STATUS) if may_have_immigrated else None
 
# Year component of each classification date.
df['classification_year'] = df['classification_date'].map(lambda classified_on: classified_on.year)

# Columns that must agree with `origin`, added in this order; each value is
# produced by the matching helper applied to the row's origin.
_ORIGIN_DERIVED_COLUMNS = (
    ('country_of_birth_combined', country),
    ('immigration_arrival_date_combine', immigration_date),
    ('immigration_status_combined', immigration_status),
)
for derived_column, derive_from_origin in _ORIGIN_DERIVED_COLUMNS:
    df[derived_column] = df['origin'].map(derive_from_origin)
df


Unnamed: 0,client_id_phac,classification_date,classification,isoniazid_resistance,ethambutol_resistance,rifampin_resistance,pyrazinamide_resistance,amikacin_resistance,capreomycin_resistance,ethionamide_resistance,...,longterm_corticosteriod_use,injection_drug_use,solid_organ_transplant_candidate,hiv_status,alcohol_use,tobacco_use,classification_year,country_of_birth_combined,immigration_arrival_date_combine,immigration_status_combined
0,1,2013-07-10,Clinical,R,S,R,R,S,R,S,...,Yes,Unknown,Unknown,Unknown,No,Yes,2013,UK,1995-04-14,
1,2,2021-07-10,Confirmed,R,R,R,R,S,S,R,...,No,,No,Unknown,Yes,,2021,Canada,,
2,3,2017-07-25,Clinical,S,S,S,R,S,R,S,...,Yes,Unknown,Unknown,Test refused,,Yes,2017,Canada,,
3,4,2019-04-10,Confirmed,S,R,S,S,R,R,R,...,Yes,Unknown,Yes,Unknown,No,No,2019,Canada,,
4,5,2021-12-20,Confirmed,S,S,S,S,S,S,S,...,Yes,Yes,Unknown,Unknown,Yes,No,2021,Canada,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,2017-04-26,Confirmed,S,R,R,S,R,S,R,...,Unknown,,Yes,Test not offered,,,2017,Canada,,
996,997,2022-09-22,Clinical,R,S,S,S,S,S,S,...,Yes,No,,Negative,Yes,,2022,Canada,,
997,998,2017-12-28,Confirmed,R,R,R,S,S,R,S,...,No,Yes,Yes,Test refused,Yes,Yes,2017,Canada,,
998,999,2021-04-30,Confirmed,R,S,S,S,R,S,R,...,Unknown,No,,Unknown,,,2021,Canada,,


In [88]:
# Destination of the synthetic BC case file, relative to the working directory.
DATA_FILE = Path(".", "tests", "BCCaseFileUploading.xlsx")
# `to_excel` does not create missing folders, so make sure ./tests exists first.
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
# DATA_FILE is already a Path; write it without the index column.
df.to_excel(DATA_FILE, index=False)

### Outcomes File

In [89]:
# Fresh provider instances for the outcomes file.
field = Field(locale=Locale.EN_CA)
fieldset = Fieldset(locale=Locale.EN_CA)
generic = Generic(locale=Locale.EN_CA)
address = Address(locale=Locale.EN_CA)
dt = Datetime(locale=Locale.EN_CA)
numeric = Numeric()
text = Text()

# Values offered for 'reason_treatment_ended_combined'.
# NOTE(review): 'Other' is listed twice (which doubles its weight) and the last
# entry is a cause-of-death phrase; confirm both are intentional.
_TREATMENT_END_REASONS = [
    'Adherent',
    'Deceased',
    'Other',
    'Drug reaction/intolerance',
    'Left BC-within Canada',
    'Left Province',
    'Transferred',
    'Left Canada',
    'Lost to follow up',
    'Non-adherent',
    'No Data',
    'Other',
    None,
    "Contributed, but wasn't the underlying cause",
]

# Values offered for 'cause_of_death_combined'.
_CAUSES_OF_DEATH = [
    "Contributed, but wasn't the underlying cause",
    "Did not contribute to death/incidental",
    "Underlying cause of death",
    "Unknown",
]


def _outcome_record():
    """Build one synthetic outcomes row as a dict, keys in output column order.

    classification_year, treatment_end_date and death_date_combined are not
    generated here; a later cell derives them from the resulting DataFrame.
    """
    # NOTE(review): treatment_start_date (1993-2003) always precedes
    # classification_date (2013-2023); confirm that ordering is intended.
    return {
        'client_id_phac': field("increment"),
        'classification_date': dt.date(start=2013, end=2023),
        'treatment_start_date': dt.date(start=1993, end=2003),
        'reason_treatment_ended_combined': generic.choice(_TREATMENT_END_REASONS),
        'cause_of_death_combined': generic.choice(_CAUSES_OF_DEATH),
    }


schema = Schema(schema=_outcome_record, iterations=1000)

In [90]:
# Generate the synthetic outcome rows and load them into a DataFrame.
outcome_records = schema.create()
df = pd.DataFrame(outcome_records)

In [91]:
def _random_date_from(earliest, min_days=0):
    """Return a random date at least `min_days` days after `earliest`.

    The offset is drawn uniformly between `min_days` and the number of days
    from `earliest` to 31 December of the current year, so the result never
    precedes `earliest`. If `earliest` is already at or past that upper bound,
    the offset is exactly `min_days`.
    """
    latest = date(date.today().year, 12, 31)
    max_days = max((latest - earliest).days, min_days)
    offset_days = numeric.integer_number(start=min_days, end=max_days)
    return earliest + timedelta(days=offset_days)


df['classification_year'] = df['classification_date'].map(lambda classification_date: classification_date.year)
# Bounding only the *year* (dt.date(start=start_date.year)) could yield an end
# date earlier in the same year than the start date. Draw a day offset instead
# so treatment always ends at least one day after it starts.
df['treatment_end_date'] = df['treatment_start_date'].map(lambda start_date: _random_date_from(start_date, min_days=1))
# Only rows whose treatment ended because the patient died get a death date,
# and it is never earlier than the treatment end date.
df['death_date_combined'] = df.apply(lambda row: _random_date_from(row['treatment_end_date']) if(row['reason_treatment_ended_combined'] == 'Deceased') else None,axis=1)
df

Unnamed: 0,client_id_phac,classification_date,treatment_start_date,reason_treatment_ended_combined,cause_of_death_combined,classification_year,treatment_end_date,death_date_combined
0,1,2021-09-12,1993-12-24,Transferred,"Contributed, but wasn't the underlying cause",2021,2005-01-30,
1,2,2015-01-08,1996-04-05,Adherent,Underlying cause of death,2015,2015-11-19,
2,3,2022-05-22,1994-12-22,Lost to follow up,Did not contribute to death/incidental,2022,2018-11-26,
3,4,2015-01-04,1997-02-08,,"Contributed, but wasn't the underlying cause",2015,2018-11-29,
4,5,2019-02-03,1996-06-10,Other,"Contributed, but wasn't the underlying cause",2019,1999-01-25,
...,...,...,...,...,...,...,...,...
995,996,2015-07-26,1994-05-09,Adherent,Did not contribute to death/incidental,2015,2015-03-08,
996,997,2016-10-19,2000-09-28,No Data,"Contributed, but wasn't the underlying cause",2016,2013-07-08,
997,998,2013-02-17,2003-04-17,Adherent,Underlying cause of death,2013,2017-02-09,
998,999,2023-06-11,1994-01-28,,Did not contribute to death/incidental,2023,2010-06-26,


In [92]:
# Destination of the synthetic BC outcomes file, relative to the working directory.
DATA_FILE = Path(".", "tests", "BCOutcomesUploading.xlsx")
# `to_excel` does not create missing folders, so make sure ./tests exists first.
DATA_FILE.parent.mkdir(parents=True, exist_ok=True)
# DATA_FILE is already a Path; write it without the index column.
df.to_excel(DATA_FILE, index=False)