In [1]:
from pathlib import Path
import pandas as pd

data_path = Path('c:\\Users\\mcvitano01\\OneDrive - JPS Health Network\\registries\\hiv-art-resistance\\data')
monogram_path = data_path.joinpath('monogram')

# Cached combined table. The name must match the file written by the
# "Combine" cell further down ('phenotypic-tests-through-...'), otherwise the
# cache is never found and the PDFs are re-parsed on every run.
combined_tests_path = monogram_path.joinpath('phenotypic-tests-through-2021-12-31.tsv')

# This is a switch
#   --> Parsing PDFs is a lengthy, compute-intensive process
#   --> In contrast, downstream functions that take {df} as input complete in < 60 seconds
# If the cached table can be read, reuse it and skip parsing; if it is missing
# or unreadable, fall back to re-parsing the PDFs.
try:
    # PAT_MRN_ID is forced to str so it is not inferred as a number.
    df = pd.read_csv(combined_tests_path, dtype={'PAT_MRN_ID': str}, sep='\t')
except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError):
    # Only file-system / parse failures mean "no usable cache". Anything else
    # (typos, KeyboardInterrupt, ...) should surface instead of being swallowed.
    rerun_pdf_parsing = True
else:
    rerun_pdf_parsing = False

## Step 1
### Manually download test result PDFs from Monogram site

## Step 2(a)
### Parse PDFs into dataframes
* *Takes more than 15 minutes to run; skipped when the cached table loads successfully in the first cell.*

In [None]:
from src.utils import parse_phenotypic_reports

if rerun_pdf_parsing:

    # Every assay follows the same recipe: its sub-folder name under
    # `monogram_path` is also the `test_type` argument and the stem of the
    # two output files. The order below matches the original cell.
    parsed_reports = {}
    for assay in ('phenosense', 'phenosense-entry', 'phenosense-integrase', 'phenosense-gt'):
        assay_dir = monogram_path.joinpath(assay)

        # parse_phenotypic_reports returns two frames; the second is the one
        # this notebook names "*_eav" (presumably an entity-attribute-value
        # layout of the same results -- confirm in src.utils).
        table, table_eav = parse_phenotypic_reports(
            path_to_zips=assay_dir,
            test_type=assay)

        table.to_csv(
            assay_dir.joinpath(f'{assay}-2021-12-31.tsv'),
            index=False, sep='\t')

        table_eav.to_csv(
            assay_dir.joinpath(f'{assay}-eav-2021-12-31.tsv'),
            index=False, sep='\t')

        parsed_reports[assay] = (table, table_eav)

    # Re-expose the results under the names the later cells use.
    phenosense, phenosense_eav = parsed_reports['phenosense']
    phenosense_entry, phenosense_entry_eav = parsed_reports['phenosense-entry']
    phenosense_integrase, phenosense_integrase_eav = parsed_reports['phenosense-integrase']
    phenosense_gt, phenosense_gt_eav = parsed_reports['phenosense-gt']

In [None]:
# Combine the four per-assay tables into one, ordered by patient and
# collection date, with a fresh 0..n-1 index.
# Only done when the PDFs were (re)parsed in this session: on a cache hit the
# per-assay frames do not exist and `df` was already loaded in the first cell.
if rerun_pdf_parsing:
    df = (
        pd.concat([phenosense, phenosense_entry, phenosense_integrase, phenosense_gt])
        .sort_values(['PAT_MRN_ID', 'COLLECTED_DATE'])
        .reset_index(drop=True)
    )

    # Write to disk
    df.to_csv(monogram_path.joinpath(
        'phenotypic-tests-through-2021-12-31.tsv'), index=False, sep='\t')

In [None]:
# Combined "*_eav" table: built from the freshly parsed per-assay frames when
# parsing ran, otherwise read back from the file this cell wrote previously.
eav_combined_path = monogram_path.joinpath('phenotypic-eav-through-2021-12-31.tsv')

if rerun_pdf_parsing:
    # Combine, ordered by patient and collection date, with a fresh index.
    df_eav = (
        pd.concat([phenosense_eav, phenosense_entry_eav, phenosense_integrase_eav, phenosense_gt_eav])
        .sort_values(['PAT_MRN_ID', 'COLLECTED_DATE'])
        .reset_index(drop=True)
    )

    # Write to disk
    df_eav.to_csv(eav_combined_path, index=False, sep='\t')
else:
    # Parsing was skipped, so the per-assay frames are not defined; load the
    # cached copy instead. PAT_MRN_ID is read as str, as for `df`.
    # NOTE(review): other column dtypes may differ after a TSV round-trip.
    df_eav = pd.read_csv(eav_combined_path, dtype={'PAT_MRN_ID': str}, sep='\t')