<a href="https://colab.research.google.com/github/nthammadi-uncc/DataAcquisition/main/Data_Acquisition_and_Ingestion_using_Colab_and_SQLite3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DSBA-HCIP 6160: Data Acquisition and Ingestion for Data Analytics

### Author: Naomi Thammadi

## Data Acquisition - Using FHIR API, Google Colab and SQLite3

### Import necessary Libraries

In [1]:
#import necessary libraries
!pip install requests
!pip install pymysql
import requests
import os
import json
import csv
import pandas as pd
import numpy as np
import math

#sqlite3 related libraries
import sqlite3



### Functions 

In [2]:
#function to get the element for phone number from the json 
def get_correct_phone_data(arr):
    """Pick the telecom entry to use as the patient's phone record.

    Parameters
    ----------
    arr : list or float
        The raw ``telecom`` cell: a list of entries when the patient has
        contact details, otherwise a missing value (NaN).

    Returns
    -------
    object
        The first entry of the list, or the second one when the first entry
        carries no ``'value'`` key and a second entry exists. The string
        ``'unknown'`` is returned for anything that is not a non-empty list
        (NaN, None, empty list, scalars).
    """
    # Anything that is not a non-empty list has no usable entry. The previous
    # math.isnan() fallback raised TypeError for non-floats and then tried to
    # index a float, so it could never return a value.
    if not isinstance(arr, list) or not arr:
        return 'unknown'

    first = arr[0]
    first_has_value = isinstance(first, dict) and 'value' in first
    # Prefer the second entry only when the first one has no 'value' to show.
    if len(arr) > 1 and not first_has_value:
        return arr[1]
    return first

In [3]:
#function to clean patient data to convert lists to string and get right data from the json
def clean_patient_data(df):
    """Reduce the list-valued JSON columns of ``df`` to a single entry each.

    ``Name_JSON`` and ``Address_JSON`` keep only their first list element;
    ``Phone_JSON`` is resolved through ``get_correct_phone_data``.
    The frame is modified in place and nothing is returned.
    """
    # name and address: take element 0 of every list
    for list_column in ('Name_JSON', 'Address_JSON'):
        df[list_column] = df[list_column].str[0]

    # phone: the helper decides which telecom entry to keep
    df['Phone_JSON'] = df['Phone_JSON'].apply(get_correct_phone_data)

In [4]:
#function to get name, address and phone details from the json
def _expand_json_column(column):
    """Expand a Series of dicts into a DataFrame with one column per key.

    Cells that are not dicts (NaN, or a placeholder string such as 'unknown')
    become rows of missing values instead of producing a stray column.
    """
    records = [cell if isinstance(cell, dict) else {} for cell in column]
    return pd.DataFrame(records, index=column.index)


def concat_patient_data(df):
    """Flatten the name, address and phone JSON columns into plain columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Name_JSON``, ``Address_JSON`` and ``Phone_JSON``
        columns whose cells are dicts or missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame: the input columns followed by the flattened name columns
        (including ``Given Name``), the address columns (including
        ``Address``, ``City``, ``State``, ``Country``) and ``Phone``.
        The input frame is not modified.
    """
    # --- name: join the list of given names into one string -------------
    name_df = _expand_json_column(df['Name_JSON'])
    if 'given' in name_df.columns:
        given_values = name_df['given']
    else:
        given_values = [None] * len(name_df)
    # A patient without a name/given list gets a missing value rather than
    # raising while iterating over NaN.
    name_df['Given Name'] = [
        ' '.join(map(str, given)) if isinstance(given, (list, tuple)) else None
        for given in given_values
    ]
    name_df = name_df.drop(columns=['use', 'fhir_comments', 'text', 'given', 'prefix'],
                           errors='ignore')

    # --- address: build a single comma separated address string ---------
    address_df = _expand_json_column(df['Address_JSON'])
    address_df = address_df.drop(
        columns=['district', 'text', 'type', 'use', 'period', 'extension', 'fhir_comments'],
        errors='ignore',
    ).fillna('')
    if 'line' in address_df.columns:
        # combine street address into one line
        address_df['line'] = [
            ','.join(map(str, line)) if isinstance(line, (list, tuple)) else str(line)
            for line in address_df['line']
        ]
    # combine the remaining address parts into one column separated by (,)
    addresses = [','.join(map(str, row)) for row in address_df.to_numpy().tolist()]
    # An address made only of separators is empty, however many parts the
    # data happened to have (the old check matched exactly five commas).
    address_df['Address'] = [
        address if address.strip(',') else 'unknown' for address in addresses
    ]
    # Guarantee the columns callers select, even if no record supplied them.
    for required in ('city', 'state', 'country'):
        if required not in address_df.columns:
            address_df[required] = ''
    address_df = address_df.rename(
        columns={'city': 'City', 'state': 'State', 'country': 'Country'})

    # --- phone: keep only the number -------------------------------------
    phone_df = _expand_json_column(df['Phone_JSON'])
    if 'value' in phone_df.columns:
        phone_df['Phone'] = phone_df['value']
    else:
        phone_df['Phone'] = '-'
    phone_df = phone_df.drop(columns=['rank', 'system', 'use', 'value'],
                             errors='ignore').fillna('-')

    # concatenate all these dataframes into one and return it
    return pd.concat([df, name_df, address_df, phone_df], axis=1)

In [5]:
#function to arrange the data correctly to display
def arrange_patient_data(df):
    """Return the display-ready patient table.

    Copies ``family`` into ``Family Name``, drops the intermediate JSON and
    helper columns, fills missing values with ``'-'`` (``'unknown'`` for
    ``Date of Death``) and selects the columns in display order.

    Parameters
    ----------
    df : pandas.DataFrame
        Flattened patient data. It is not modified; a new frame is returned.

    Returns
    -------
    pandas.DataFrame
        Columns: Patient Id, Given Name, Family Name, Gender, Date of Birth,
        Date of Death, Address, City, State, Country, Phone.
    """
    # No record may have carried a family name; fall back to the filler value
    # instead of raising KeyError.
    family = df['family'] if 'family' in df.columns else '-'

    arranged = (
        df.assign(**{'Family Name': family})
          .drop(columns=['Name_JSON', 'Address_JSON', 'Phone_JSON', 'family',
                         'line', 'postalCode', 'period', 'id'],
                errors='ignore')
          .fillna('-')
    )
    arranged['Date of Death'] = arranged['Date of Death'].replace({'-': 'unknown'})

    return arranged[['Patient Id', 'Given Name', 'Family Name', 'Gender',
                     'Date of Birth', 'Date of Death', 'Address', 'City',
                     'State', 'Country', 'Phone']]

## Retrieving details of Deceased Patients using FHIR REST API

In [6]:
#function to search for patients based on search string
def search_patients(url='http://hapi.fhir.org/baseR4/Patient?deceased=true', timeout=30):
    """Fetch deceased patients from a FHIR server and return them as a table.

    Parameters
    ----------
    url : str, optional
        FHIR search URL to call. Defaults to the deceased-patient search on
        the public HAPI test server.
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per patient with the display columns produced by
        ``arrange_patient_data``; an empty frame with those columns when the
        response has no ``entry`` list.

    Raises
    ------
    requests.RequestException
        On connection problems, timeouts or a non-2xx HTTP status.
    """
    display_columns = ['Patient Id', 'Given Name', 'Family Name', 'Gender',
                       'Date of Birth', 'Date of Death', 'Address', 'City',
                       'State', 'Country', 'Phone']

    # Without a timeout a stalled server would hang the notebook forever.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    # format the response to json
    resp = response.json()

    # A search with no matches returns a bundle without an 'entry' key.
    entries = resp.get('entry', [])
    if not entries:
        return pd.DataFrame(columns=display_columns)

    # store into a dataframe
    data = pd.json_normalize(entries)

    column_names = {
        'resource.id': 'Patient Id',
        'resource.name': 'Name_JSON',
        'resource.gender': 'Gender',
        'resource.birthDate': 'Date of Birth',
        'resource.deceasedDateTime': 'Date of Death',
        'resource.address': 'Address_JSON',
        'resource.telecom': 'Phone_JSON',
        'resource.deceasedBoolean': 'Deceased',
    }
    # reindex (unlike filter) keeps a column for every expected field even
    # when no patient on this page has it, so the names always line up.
    patient_df = data.reindex(columns=list(column_names)).rename(columns=column_names)
    # An all-missing column comes back as float; the cleaning step uses the
    # .str accessor, which needs object dtype.
    patient_df = patient_df.astype(
        {name: object for name in ('Name_JSON', 'Address_JSON', 'Phone_JSON')})

    # call functions to run the necessary data preprocessing steps
    clean_patient_data(patient_df)
    patient_df = concat_patient_data(patient_df)
    patient_df = arrange_patient_data(patient_df)

    return patient_df

In [7]:
#function to display search results in a html
def on_search_clicked(b):
    """Button callback: run the patient search and render it as an HTML table.

    Parameters
    ----------
    b : object
        The widget that fired the click event (unused).

    The rendered HTML is also stored on ``search_patients.data``.
    """
    # Imported here so the callback does not depend on a later cell having
    # been executed first.
    from IPython.display import HTML, display

    search_patients.data = ''
    results = search_patients()

    # set the dataframe into a HTML table
    search_patients.data = results.to_html()

    display(HTML(search_patients.data))

## Display Patient Details

In [8]:
from ipywidgets import widgets
#display and HTML are public in IPython.display; importing display from
#IPython.core.display is deprecated
from IPython.display import display, HTML

#show the instructions, then a button that runs the patient search when clicked
display(HTML('<h4>Click on "Display Patients" to show all the deceased patients.</h4>'))
search_btn=widgets.Button(description="Display Patients",button_style="success")
display(search_btn)
search_btn.on_click(on_search_clicked)



Button(button_style='success', description='Display Patients', style=ButtonStyle())

## Data Ingestion - Transfer Patient details to SQLite3 database

In [9]:
#read the patient csv file hosted on GitHub and convert it into a dataframe
#NOTE: needs network access; patients_df is the frame ingested into SQLite3 further down
url='https://raw.githubusercontent.com/nthammadi-uncc/DataAcquisition/main/Data/patient_data.csv'
patients_df=pd.read_csv(url)
#preview the first rows of the loaded data
patients_df.head()

Unnamed: 0,patient_id,given_name,family_name,gender,date_of_birth,date_of_death,city,state,phone,death_by_natural_cause,alcohol_intake,nicotine_intake,race,bmi,health_issues
0,7949473,Gaynor,Titta,Female,6/4/46,10/22/16 15:42,West Palm Beach,Florida,561-553-2724,0,Never,Never,Asian,Normal,1
1,9662622,Oona,Pusill,Female,11/26/65,5/17/16 22:33,Hollywood,Florida,954-178-3132,0,Often,Often,Alaska Native,Over weight,1
2,6716847,Yetta,Taysbil,Male,9/14/47,11/16/07 10:39,Jersey City,New Jersey,201-765-7538,0,Never,Never,African American,Under weight,1
3,8349189,Farrah,Bartles,Male,12/8/97,2/18/00 1:18,Murfreesboro,Tennessee,615-130-9597,0,Often,Often,Alaska Native,Over weight,0
4,3790901,Richardo,Hedau,Bigender,7/11/77,12/24/93 11:05,Reston,Virginia,571-153-4428,1,Never,Occasionally,Native Hawaiian,Normal,1


In [10]:
#dimensions of the loaded data as (rows, columns)
patients_df.shape

(10000, 15)

In [11]:
#column names, non-null counts and dtypes of the loaded data
patients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   patient_id              10000 non-null  int64 
 1   given_name              10000 non-null  object
 2   family_name             10000 non-null  object
 3   gender                  10000 non-null  object
 4   date_of_birth           10000 non-null  object
 5   date_of_death           10000 non-null  object
 6   city                    10000 non-null  object
 7   state                   10000 non-null  object
 8   phone                   10000 non-null  object
 9   death_by_natural_cause  10000 non-null  int64 
 10  alcohol_intake          10000 non-null  object
 11  nicotine_intake         10000 non-null  object
 12  race                    10000 non-null  object
 13  bmi                     10000 non-null  object
 14  health_issues           10000 non-null  int64 
dtypes: 

## Connect and Insert into SQLite3

In [12]:
def _quote_identifier(name):
    """Return ``name`` as a double-quoted SQLite identifier (embedded quotes doubled)."""
    return '"' + str(name).replace('"', '""') + '"'


def create_new_table(df, table_name, db_name):
    """Write a dataframe to a SQLite table, replacing the table if it exists.

    Arguments:
        df: Pandas data frame that will be inserted as a table
        table_name: name of the table
        db_name: database (file path) to connect to

    The table and column names are quoted, so names containing spaces or
    punctuation are stored as given. Note that a dotted ``table_name`` is
    therefore treated as one literal name, not as ``schema.table``.
    The connection is always closed, even when a statement fails.
    """
    import logging
    from contextlib import closing

    # Step 1: Setup local logging
    logging.basicConfig(level=logging.INFO,
                        format='%(asctime)s %(levelname)s: %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

    # Step 2: Find columns in the dataframe
    cols = list(df.columns)
    table_sql = _quote_identifier(table_name)
    cols_string = ','.join(_quote_identifier(col) for col in cols)
    val_wildcard_string = ','.join(['?'] * len(cols))

    # Step 3: Connect to a DB file if it exists, else create a new schema
    with closing(sqlite3.connect(db_name)) as dbconnection:
        logging.info(f'SQL DB {db_name} created')

        # Step 6 happens here: the connection context manager commits on
        # success and rolls back the pending transaction on an exception.
        with dbconnection:
            cursor = dbconnection.cursor()

            # Step 4: Create Table - first drop it if it exists
            cursor.execute(f"""DROP TABLE IF EXISTS {table_sql};""")
            cursor.execute(f"""CREATE TABLE {table_sql} ({cols_string});""")
            logging.info(f'SQL Table {table_name} created with {len(cols)} columns')

            # Step 5: Upload the dataframe
            rows_to_upload = df.to_dict(orient='split')['data']
            sql_string = f"""INSERT INTO {table_sql} ({cols_string}) VALUES ({val_wildcard_string});"""
            print(sql_string)
            cursor.executemany(sql_string, rows_to_upload)
            logging.info(f'{len(rows_to_upload)} rows uploaded to {table_name}')

In [13]:
#write patients_df to a 'patients' table in the 'patient_db' SQLite database file
create_new_table(patients_df,table_name='patients',db_name='patient_db')

2022-03-30 14:06:18 INFO: SQL DB patient_db created
2022-03-30 14:06:18 INFO: SQL Table patients created with 15 columns
2022-03-30 14:06:18 INFO: 10000 rows uploaded to patients


INSERT INTO patients (patient_id,given_name,family_name,gender,date_of_birth,date_of_death,city,state,phone,death_by_natural_cause,alcohol_intake,nicotine_intake,race,bmi,health_issues) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?);


## Retrieve from SQLite3

In [14]:
def query_from_table(query_string, db_name, params=()):
    """Run a SQL query against a SQLite database and return the rows.

    Arguments:
        query_string: SQL statement to execute
        db_name: database (file path) to connect to
        params: optional values bound to ``?`` placeholders in query_string

    Returns:
        pd.DataFrame: results from query_string put into a dataframe; an
        empty dataframe when the statement produces no result set.
    """
    from contextlib import closing

    # Step 1: Connect to the SQL DB (closed again even if the query fails)
    with closing(sqlite3.connect(db_name)) as con:
        # Step 2: Execute the SQL query
        cursor = con.execute(query_string, params)

        # Step 3: Fetch the data and column names
        result_data = cursor.fetchall()
        print(cursor.description)
        # description is None for statements that return no rows
        cols = [description[0] for description in (cursor.description or ())]

    # Step 4/5: connection is closed; return as a dataframe
    return pd.DataFrame(result_data, columns=cols)

In [15]:
#query all female patients who lived in Florida, sorted by family name
#runs against the 'patients' table in the 'patient_db' database file
query_string="""SELECT patient_id, given_name, family_name, date_of_birth, date_of_death, death_by_natural_cause, health_issues 
                FROM patients
                WHERE gender='Female' AND state = 'Florida'
                ORDER BY family_name"""
result_df = query_from_table(query_string, db_name='patient_db')


(('patient_id', None, None, None, None, None, None), ('given_name', None, None, None, None, None, None), ('family_name', None, None, None, None, None, None), ('date_of_birth', None, None, None, None, None, None), ('date_of_death', None, None, None, None, None, None), ('death_by_natural_cause', None, None, None, None, None, None), ('health_issues', None, None, None, None, None, None))


In [16]:
#show up to the first 50 rows of the query result
result_df.head(50)

Unnamed: 0,patient_id,given_name,family_name,date_of_birth,date_of_death,death_by_natural_cause,health_issues
0,5655793,Nannette,Aaronson,7/2/41,11/6/96 6:28,0,1
1,9111089,Ibby,Advani,9/28/78,10/28/08 0:09,0,1
2,4053689,Guinevere,Alldred,9/29/05,8/24/97 23:24,1,1
3,1885338,Genni,Amott,7/22/18,7/27/03 15:08,1,0
4,1143681,Siffre,Andrichak,7/29/47,4/17/06 17:07,1,1
5,4864170,Adamo,Arnaud,1/20/34,3/5/13 18:13,0,1
6,6654269,Ashbey,Ashbolt,2/13/33,11/4/21 18:42,1,0
7,8224045,Ilyse,Aveyard,11/15/51,4/28/00 6:17,1,1
8,1936877,Lazaro,Balf,11/30/31,5/3/00 5:44,1,1
9,6148253,Neville,Bamling,9/9/34,3/6/11 17:56,0,0


### Assignment:

<ol>
    <li>Calculate the age of each patient and add it as a new column in the patients_df DataFrame</li>
    <li>Create a table in SQLite3 from the new patient DataFrame that includes the patient age</li>
    <li>Retrieve any columns (of your choice) of patient data from SQLite3</li>
    <li>Create any visualizations in Python for the patient data</li>
    <li>Optional: Run a linear regression to predict either death by natural cause or health issues</li>
</ol>