In [1]:
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
# Read the destination dataset and prepare an empty failure report whose
# columns follow the failure-report schema.

failure_report_columns = ["spm_id", "u_id", "source_file", "failure_reason", "timestamp"]

destination_datasets_df = pd.read_csv("destination_dataset.csv")
failure_report_df = pd.DataFrame(columns=failure_report_columns)

In [3]:
# Names of the files in the "datasets" folder of the working directory,
# searched by getSourceFile. os.listdir returns entries in arbitrary order,
# so the listing is sorted to keep "first matching file" lookups reproducible.
list_files = sorted(os.listdir(os.path.join(os.getcwd(), "datasets")))

# Function to get the source file name from the hb_id (matched against the dataset file names) or, failing that, from the spm_id

def getSourceFile(spm_id, hb_id, files=None):
    """Return the name of the source file a record came from.

    When ``hb_id`` is a string, the first entry of ``files`` whose name
    contains ``hb_id`` is returned. When ``hb_id`` is not a string (for
    example NaN or None), the file is inferred from the length of ``spm_id``.

    Parameters
    ----------
    spm_id : str
        The record's ``spm_id``; only its length is used, and only when
        ``hb_id`` is not a string.
    hb_id : str or missing value
        The record's ``hb_id``.
    files : iterable of str, optional
        File names to search. Defaults to the module-level ``list_files``.

    Returns
    -------
    str or None
        The matching file name, or None when no file name contains ``hb_id``
        or the length of ``spm_id`` is not one of the known lengths.
    """
    if isinstance(hb_id, str):
        # list_files is only looked up when it is actually needed.
        candidates = list_files if files is None else files
        for file in candidates:
            if hb_id in file:
                return file
        return None

    # hb_id is missing: fall back to the spm_id length. This is done outside
    # the loop so it also works when there are no files to iterate over.
    source_file_by_length = {
        8: "S08000020.csv",
        16: "S08000024.json",
        36: "S08000031.csv",
    }
    try:
        id_length = len(spm_id)
    except TypeError:
        # spm_id is missing as well (e.g. NaN), so nothing can be inferred.
        return None
    return source_file_by_length.get(id_length)

In [4]:
# function to check whether a record is a failure
# if the test outcome is void (0) or there is no hb_id in the dataframe, the record is a failure

def findFails(row):
    """Return a failure-report entry for ``row``, or False when it is not a failure.

    A row is a failure when its ``outcome`` equals 0, when its ``hb_id`` is
    null, or both. The returned dict has the keys ``spm_id``, ``u_id``,
    ``source_file``, ``failure_reason`` and ``timestamp``.
    """
    void_result = row["outcome"] == 0
    no_hb_id = pd.isnull(row["hb_id"])

    # Choose the failure reason first; rows with nothing wrong exit early.
    if void_result and no_hb_id:
        reason = "The user does not have any records in Patient Demographics and the result was void"
    elif void_result:
        reason = "The test result was void"
    elif no_hb_id:
        reason = "The user does not have any records in Patient Demographics"
    else:
        return False

    return {
        "spm_id": row["spm_id"],
        "u_id": row["u_id"],
        "source_file": getSourceFile(row["spm_id"], row["hb_id"]),
        "failure_reason": reason,
        "timestamp": datetime.datetime.now().isoformat(),
    }

In [5]:
fails = []  # index labels of the failed records
failure_records = []  # one failure-report dict per failed record

# Apply findFails to every row of destination_datasets_df. The records are
# gathered in a list and the report is built once afterwards:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
for index, row in destination_datasets_df.iterrows():
    newRow = findFails(row)
    if newRow:
        failure_records.append(newRow)
        fails.append(index)

new_failures_df = pd.DataFrame(failure_records, columns=failure_report_df.columns)

# Only non-empty frames are concatenated, so an empty report (first run) or an
# empty batch (re-run after the failures were already removed) is handled
# without concatenating empty frames.
frames_to_join = [frame for frame in (failure_report_df, new_failures_df) if not frame.empty]
if frames_to_join:
    failure_report_df = pd.concat(frames_to_join, ignore_index=True)

# remove rows of failed records and reindex dataframe
destination_datasets_df = destination_datasets_df.drop(index=fails).reset_index(drop=True)


In [6]:
# Sanity check: show any rows of the destination dataset that still have a null hb_id.

hb_id_missing = destination_datasets_df["hb_id"].isna()
destination_datasets_df.loc[hb_id_missing]

Unnamed: 0,spm_id,hb_id,hb_name,u_id,sex_at_birth,postcode_district,postcode_sector,simd_rank,outcome,effective_date,spm_test_type


In [7]:
# Display the destination dataset after the failed records were dropped and the
# index was reset (the rendering below elides the middle rows).
destination_datasets_df

Unnamed: 0,spm_id,hb_id,hb_name,u_id,sex_at_birth,postcode_district,postcode_sector,simd_rank,outcome,effective_date,spm_test_type
0,1FTEW1CM,S08000020,NHS Grampian,1G4GC5ER0DF656630,F,AB12,AB12 9,3888,2,2022-01-01,na
1,WAUNF78P,S08000020,NHS Grampian,3VW8S7ATXFM709021,F,AB51,AB51 0,5886,1,2022-01-06,na
2,WA1CFAFP,S08000020,NHS Grampian,1G6DJ1ED2B0519654,M,AB30,AB30 1,4664,1,2022-01-11,na
3,JTHBB1BA,S08000020,NHS Grampian,1G6YV36A695848747,M,AB16,AB16 5,2631,2,2022-01-12,na
4,1GYS3KEF,S08000020,NHS Grampian,5GAKRBED8BJ816443,F,AB34,AB34 5,4596,2,2022-01-11,na
...,...,...,...,...,...,...,...,...,...,...,...
2908,3c3c73a1-2bd9-4802-934a-347d9343a1db,S08000031,NHS Greater Glasgow and Clyde,1HGCR2E7XFA201770,M,G3,G3 7,6520,2,2022-01-03,lfd
2909,6c805718-549a-4cbf-b480-5e3327812df2,S08000031,NHS Greater Glasgow and Clyde,1N6AA0EK7FN600163,M,G83,G83 0,2527,2,2022-01-04,pcr
2910,c1af8d99-f0f3-4cb0-a61b-6c5b1cf1bcdf,S08000031,NHS Greater Glasgow and Clyde,1FTEW1E86AK512258,F,G69,G69 7,5520,2,2022-01-11,pcr
2911,1aacae12-a8bb-45fc-8e49-b19de7e1e138,S08000031,NHS Greater Glasgow and Clyde,JTDKN3DU6A0891048,M,PA6,PA6 7,5900,2,,lfd


In [8]:
# export dataframes as csv files

output_dir = "solutions_datasets"

# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so no separate existence check is needed.
os.makedirs(output_dir, exist_ok=True)

destination_datasets_df.to_csv(os.path.join(output_dir, "destination_dataset_clean.csv"), index=False)
failure_report_df.to_csv(os.path.join(output_dir, "failure_report.csv"), index=False)