In [2]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
import matplotlib.pyplot as plt
import seaborn as sns

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [5]:
## Notes:
## What to do with greater than/less than/etc. values?
## Handling results such as 'neg'

## Load file
first_part_file = "/home/ec2-user/SageMaker/Team-5/data_processing/tests in first part of the merged data.csv"
first = pd.read_csv(first_part_file)

## Print names of all tests in table
unique_values = first['testName'].unique()
print("Unique tests in First Part of Merged Data:")
print(unique_values)

## Cleaning units
column_to_update = 'D6OtherTestResultUnit'
first[column_to_update] = first[column_to_update].str.replace("/dl", "/dL", case=False)
first[column_to_update] = first[column_to_update].str.replace("/l", "/L", case=False)
first[column_to_update] = first[column_to_update].str.replace("mm Hg", "mmHg", case=False)
first[column_to_update] = first[column_to_update].str.replace("NG/ML", "ng/mL", case=False)
first[column_to_update] = first[column_to_update].str.replace("MMOL", "mmol", case=False)
first[column_to_update] = first[column_to_update].str.replace("mm0/L", "mmol/L", case=False) ## Using best judgement
first[column_to_update] = first[column_to_update].str.replace("x1069/L", "x10^9/L", case=False)
first[column_to_update] = first[column_to_update].str.replace("/ml", "/mL", case=False)

## Sort by test name for readability
first = first.sort_values('testName')

## Print tables
pd.set_option('display.max_rows', 113)
display(first)

Unique tests in First Part of Merged Data:
['A1c at home' 'Albumin' 'Anion Gap' 'Anti GAD Antibodies' 'BHB' 'BUN'
 'Base Excess' 'Base excess' 'Beta Hydroxybutyrate'
 'Beta Hydroxybutyric Acid' 'Beta-Hydroxybutyrate' 'BetaHydroxy' 'Bicarb'
 'Bicarbonate' 'Blood Ketones' 'Blood gases pO2, blood' 'Blood ketones'
 'C-Peptide' 'CBC and diff normal' 'CGM' 'CGM data' 'CO2' 'Calcium'
 'Chloride' 'Creatinine' 'Free T4' 'Fructosamine' 'Glucose-Lab' 'HCO3'
 'HbA1c (point of care analyser)' 'HcT' 'HgB' 'Home test' 'IA2 Antibodies'
 'IgA' 'Insulin, Fasting' 'Islet Cell Cytoplasmic Autoabs' 'Ketones'
 'Keyton' 'Lactate' 'MCH' 'MCHC' 'MCV' 'Meta / Myelocytes' 'P-OHBUT'
 'P-OHButyr' 'PC02' 'PCO2' 'PCO2 Cap' 'PCo2' 'Phosphorus' 'Platelets'
 'Potassium' 'RANDOM PLASMA GLUCOSE' 'RBC' 'RDW' 'Random plasma glucose'
 'S-IA2Ab' 'Sodium' 'TSH' 'TTG IgA Abs' 'U-Gluk-O' 'Urea' 'Urine'
 'Urine Glucose' 'Urine Screen' 'Urine, Ketones' 'WBC' 'blood ketones'
 'c-peptide' 'cB-pH' 'chloride' 'creatinine' 'glycated P

Unnamed: 0,testName,D5OtherTestResultResult,D6OtherTestResultUnit
0,A1c at home,6.5%,
1,Albumin,4.4,g/dL
2,Anion Gap,15.0,mmol/L
3,Anion Gap,16.0,mmol/L
4,Anion Gap,20,mmol/L
5,Anion Gap,20,mmol/L
6,Anti GAD Antibodies,70,U/mL
7,BHB,4.77,mmol/L
8,BUN,15.0,mg/dL
9,Base Excess,-8,mmol/L


In [46]:
## Load file
second_part_file = "/home/ec2-user/SageMaker/Team-5/data_processing/tests in second part of the merged data.csv"
second = pd.read_csv(second_part_file, low_memory=False)

## Print names of all tests in table
unique_values = second['Test_Name'].unique()
print("Unique tests in Second Part of Merged Data:")
print(unique_values)
print('')

## Print names of all result types in table
unique_results = second['Result_Type'].unique()
print("Unique result types in Second Part of Merged Data:")
print(unique_results)

## Print table
display(second)

## Create data frame for each test and get middle 99%
second['Result'] = pd.to_numeric(second['Result'], errors='coerce')
middle_99_values = second.groupby('Test_Name')['Result'].quantile([0.005, 0.995]).unstack(level=1).reset_index()
second = pd.merge(second, middle_99_values, on='Test_Name', suffixes=('', '_middle99'))
second['Within_Middle_99'] = (second['Result'] >= second[0.005]) & (second['Result'] <= second[0.995])
# second = second.drop([0.005, 0.995], axis=1)
display(second)

## Print range for each test
for test_name in unique_values:
    middle_99_range = middle_99_values[middle_99_values['Test_Name'] == test_name][[0.005, 0.995]]
    print(f"Middle 99% range for '{test_name}':")
    print(middle_99_range)
    print("\n")

Unique tests in Second Part of Merged Data:
['DR3' 'DR4' 'GAD65' 'GAD65H' 'GLU-10' 'GLU0' 'GLU120' 'GLU30' 'GLU60'
 'GLU90' 'HLA' 'HLAa' 'HLAb' 'HbA1c' 'IA-2H' 'ICA' 'ICA512' 'INS-10'
 'INS0' 'INS120' 'INS60' 'INST-10' 'INST0' 'INST120' 'INST30' 'INST60'
 'INST90' 'MIAA' 'PEP-10' 'PEP0' 'PEP120' 'PEP30' 'PEP60' 'PEP90' 'ZNT8'
 'INS30' 'INS90']

Unique result types in Second Part of Merged Data:
['NORPTD' 'RPTD' 'RVSD' nan]


Unnamed: 0,Test_Name,Result,Result_Type
0,DR3,,NORPTD
1,DR3,,NORPTD
2,DR4,,NORPTD
3,DR4,,NORPTD
4,GAD65,,NORPTD
...,...,...,...
1739218,ZNT8,,
1739219,ZNT8,,
1739220,ZNT8,,
1739221,ZNT8,,


Unnamed: 0,Test_Name,Result,Result_Type,0.005,0.995,Within_Middle_99
0,DR3,,NORPTD,,,False
1,DR3,,NORPTD,,,False
2,DR3,,RPTD,,,False
3,DR3,,RPTD,,,False
4,DR3,,RPTD,,,False
...,...,...,...,...,...,...
1739218,INS90,266.0,RPTD,37.525,265.41,False
1739219,INS90,37.5,RPTD,37.525,265.41,False
1739220,INS90,38.0,RPTD,37.525,265.41,True
1739221,INS90,52.3,RPTD,37.525,265.41,True


Middle 99% range for 'DR3':
   0.005  0.995
0    NaN    NaN


Middle 99% range for 'DR4':
   0.005  0.995
1    NaN    NaN


Middle 99% range for 'GAD65':
   0.005  0.995
2 -0.041   1.04


Middle 99% range for 'GAD65H':
   0.005  0.995
3    0.0  905.0


Middle 99% range for 'GLU-10':
   0.005  0.995
4   66.0  141.0


Middle 99% range for 'GLU0':
   0.005  0.995
5   65.0  140.0


Middle 99% range for 'GLU120':
   0.005  0.995
6   53.0  350.0


Middle 99% range for 'GLU30':
   0.005  0.995
7   84.0  245.0


Middle 99% range for 'GLU60':
   0.005   0.995
8   63.0  306.42


Middle 99% range for 'GLU90':
   0.005   0.995
9   59.0  338.99


Middle 99% range for 'HLA':
    0.005  0.995
10    NaN    NaN


Middle 99% range for 'HLAa':
    0.005  0.995
11    NaN    NaN


Middle 99% range for 'HLAb':
    0.005  0.995
12    NaN    NaN


Middle 99% range for 'HbA1c':
    0.005  0.995
13    4.3    6.7


Middle 99% range for 'IA-2H':
    0.005    0.995
14    0.0  369.895


Middle 99% range for 'ICA':
