In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore
import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore
import plotly.express as px # type: ignore

In [2]:
# -------------- Data Assessing (understood data more deeply before implementing methods to clean it)
'''  
2 Type of unclean data:
    - Dirty data (Data with quality issue)
        - duplicate data
        - missing data
        - corrupt data
        - inaccurate data
    
    - Messy data (data with tidiness issues)
        - each variable form a column
        - each observation form a row
        - each observational unit form a table
'''

'  \n2 Type of unclean data:\n    - Dirty data (Data with quality issue)\n        - duplicate data\n        - missing data\n        - corrupt data\n        - inaccurate data\n    \n    - Messy data (data with tidiness issues)\n        - each variable form a column\n        - each observation form a row\n        - each observational unit form a table\n'

In [3]:
# data of clinical trial 

patients = pd.read_csv('./patients.csv')
treatments = pd.read_csv('./treatments.csv')
adverse_reactions = pd.read_csv('./adverse_reactions.csv')
treatments_cut = pd.read_csv('./treatments_cut.csv')

In [4]:
patients.head()

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,951-719-9170ZoeWellish@superrito.com,7/10/1976,121.7,66,19.6
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,PamelaSHill@cuvox.de+1 (217) 569-3204,4/3/1967,118.8,66,19.2
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,402-363-6804JaeMDebord@gustr.com,2/19/1980,177.8,71,24.8
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,PhanBaLiem@jourrapide.com+1 (732) 636-8246,7/26/1951,220.9,70,31.7
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [5]:
treatments.head()

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.2,
1,elliot,richardson,-,40u - 45u,7.56,7.09,0.97
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
3,skye,gormanston,33u - 36u,-,7.97,7.62,0.35
4,alissa,montez,-,33u - 29u,7.78,7.46,0.32


In [6]:
treatments_cut.head()

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,jožka,resanovič,22u - 30u,-,7.56,7.22,0.34
1,inunnguaq,heilmann,57u - 67u,-,7.85,7.45,
2,alwin,svensson,36u - 39u,-,7.78,7.34,
3,thể,lương,-,61u - 64u,7.64,7.22,0.92
4,amanda,ribeiro,36u - 44u,-,7.85,7.47,0.38


In [7]:
adverse_reactions.head()

Unnamed: 0,given_name,surname,adverse_reaction
0,berta,napolitani,injection site discomfort
1,lena,baer,hypoglycemia
2,joseph,day,hypoglycemia
3,flavia,fiorentino,cough
4,manouck,wubbels,throat irritation


In [8]:
''' 
1. Write summary of data
2. Column description 
3. Add any additional information
'''

' \n1. Write summary of data\n2. Column description \n3. Add any additional information\n'

In [9]:
''' 
Type of assessment
    - Manual (looking through the data)
    - Programmatic (info(), describe(), sample())
'''

' \nType of assessment\n    - Manual (looking through the data)\n    - Programmatic (info(), describe(), sample())\n'

In [10]:
# export data for manual assessment

with pd.ExcelWriter('clinical_trials.xlsx') as writer:
    patients.to_excel(writer, sheet_name = 'patients')
    treatments.to_excel(writer, sheet_name = 'treatments')
    treatments_cut.to_excel(writer, sheet_name = 'treatments_cut')
    adverse_reactions.to_excel(writer, sheet_name = 'adverse_reactions')

In [11]:
''' 
Issue with data 
1. Dirty data
    ---patients---
    - patient id = 9 has misspelled name Dsvid = David (`accuracy`)
    - state col sometimes contains full name sometimes short name (`consistency`)
    - zip code must be 5 number (`validity`)
    - 12 data missing in address, city, state, zipcode, country, contact (`completion`)
    - incorrect datatype assigned to sex, zipcode, birthday (`validity`)
    - duplicate entry by the name "John Doe" (``accuracy`)
    - one patients has 48 pound this is wrong (`accuracy`)
    - one patients height = 27 inch not possible (`accuracy`)

    ---treatments & treatments_cut---
    - given name, surname col is in all lower case (`consistency`)
    - remove u from auralin and novodra col (`validity`)
    - "-" in auralin and novodra col treated as NaN (`validity`)
    - missing value in hba1c_change col (`completion`)
    - one duplicate entry by the name "joseph day" (`accuracy`)
    - in "hba1c_change" 9 instead of 4 -> .98 == .48 (`accuracy`)

    ---adverse_reaction---
    - given name and surname is in lower case (`consistency`)

2. Messy data
    ---patients---
    - contact col contains both email and number

    ---treatments & treatments_cut---
    - auralin, novodra col should be split into 2 col -> start and end dose
    - merge both the table
    
    ---adverse_reaction---
    - this table should not exist independently

'''

' \nIssue with data \n1. Dirty data\n    ---patients---\n    - patient id = 9 has misspelled name Dsvid = David (`accuracy`)\n    - state col sometimes contains full name sometimes short name (`consistency`)\n    - zip code must be 5 number (`validity`)\n    - 12 data missing in address, city, state, zipcode, country, contact (`completion`)\n    - incorrect datatype assigned to sex, zipcode, birthday (`validity`)\n    - duplicate entry by the name "John Doe" (``accuracy`)\n    - one patients has 48 pound this is wrong (`accuracy`)\n    - one patients height = 27 inch not possible (`accuracy`)\n\n    ---treatments & treatments_cut---\n    - given name, surname col is in all lower case (`consistency`)\n    - remove u from auralin and novodra col (`validity`)\n    - "-" in auralin and novodra col treated as NaN (`validity`)\n    - missing value in hba1c_change col (`completion`)\n    - one duplicate entry by the name "joseph day" (`accuracy`)\n    - in "hba1c_change" 9 instead of 4 -> .98

In [12]:
''' 
Automatic assessment
    - head and tail
    - sample
    - info
    - isnull
    - duplicated
    - describe
'''

' \nAutomatic assessment\n    - head and tail\n    - sample\n    - info\n    - isnull\n    - duplicated\n    - describe\n'

In [13]:
patients.head()
patients.tail()
patients.sample(5)
patients.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   patient_id    503 non-null    int64  
 1   assigned_sex  503 non-null    object 
 2   given_name    503 non-null    object 
 3   surname       503 non-null    object 
 4   address       491 non-null    object 
 5   city          491 non-null    object 
 6   state         491 non-null    object 
 7   zip_code      491 non-null    float64
 8   country       491 non-null    object 
 9   contact       491 non-null    object 
 10  birthdate     503 non-null    object 
 11  weight        503 non-null    float64
 12  height        503 non-null    int64  
 13  bmi           503 non-null    float64
dtypes: float64(3), int64(2), object(9)
memory usage: 55.1+ KB


In [14]:
patients[patients['address'].isnull()]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
209,210,female,Lalita,Eldarkhanov,,,,,,,8/14/1950,143.4,62,26.2
219,220,male,Mỹ,Quynh,,,,,,,4/9/1978,237.8,69,35.1
230,231,female,Elisabeth,Knudsen,,,,,,,9/23/1976,165.9,63,29.4
234,235,female,Martina,Tománková,,,,,,,4/7/1936,199.5,65,33.2
242,243,male,John,O'Brian,,,,,,,2/25/1957,205.3,74,26.4
249,250,male,Benjamin,Mehler,,,,,,,10/30/1951,146.5,69,21.6
257,258,male,Jin,Kung,,,,,,,5/17/1995,231.7,69,34.2
264,265,female,Wafiyyah,Asfour,,,,,,,11/3/1989,158.6,63,28.1
269,270,female,Flavia,Fiorentino,,,,,,,10/9/1937,175.2,61,33.1
278,279,female,Generosa,Cabán,,,,,,,12/16/1962,124.3,69,18.4


In [15]:
treatments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  171 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [16]:
treatments_cut.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70 entries, 0 to 69
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    70 non-null     object 
 1   surname       70 non-null     object 
 2   auralin       70 non-null     object 
 3   novodra       70 non-null     object 
 4   hba1c_start   70 non-null     float64
 5   hba1c_end     70 non-null     float64
 6   hba1c_change  42 non-null     float64
dtypes: float64(3), object(4)
memory usage: 4.0+ KB


In [17]:
adverse_reactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   given_name        34 non-null     object
 1   surname           34 non-null     object
 2   adverse_reaction  34 non-null     object
dtypes: object(3)
memory usage: 948.0+ bytes


In [18]:
# is there any duplicate
patients.duplicated().sum()
patients['patient_id'].duplicated().sum()
patients[patients.duplicated(subset = ['given_name', 'surname'])]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
229,230,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
237,238,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
244,245,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
251,252,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4
277,278,male,John,Doe,123 Main Street,New York,NY,12345.0,United States,johndoe@email.com1234567890,1/1/1975,180.0,72,24.4


In [19]:
treatments[treatments.duplicated()]
treatments[treatments.duplicated(subset=['given_name', 'surname'])]

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
136,joseph,day,29u - 36u,-,7.7,7.19,


In [20]:
treatments_cut[treatments_cut.duplicated()]
treatments_cut[treatments_cut.duplicated(subset=['given_name', 'surname'])]

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change


In [21]:
adverse_reactions.duplicated().sum()

np.int64(0)

In [22]:
patients.describe()

Unnamed: 0,patient_id,zip_code,weight,height,bmi
count,503.0,491.0,503.0,503.0,503.0
mean,252.0,49084.118126,173.43499,66.634195,27.483897
std,145.347859,30265.807442,33.916741,4.411297,5.276438
min,1.0,1002.0,48.8,27.0,17.1
25%,126.5,21920.5,149.3,63.0,23.3
50%,252.0,48057.0,175.3,67.0,27.2
75%,377.5,75679.0,199.5,70.0,31.75
max,503.0,99701.0,255.9,79.0,37.7


In [23]:
patients[patients['weight'] == 48.8]
patients[patients['height'] == 27]

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,contact,birthdate,weight,height,bmi
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,334-515-7487TimNeudorf@cuvox.de,2/18/1928,192.3,27,26.1


In [24]:
treatments.describe()

Unnamed: 0,hba1c_start,hba1c_end,hba1c_change
count,280.0,280.0,171.0
mean,7.985929,7.589286,0.546023
std,0.568638,0.569672,0.279555
min,7.5,7.01,0.2
25%,7.66,7.27,0.34
50%,7.8,7.42,0.38
75%,7.97,7.57,0.92
max,9.95,9.58,0.99


In [25]:
treatments.sort_values('hba1c_start')
treatments.sort_values('hba1c_end')
treatments.sort_values('hba1c_change', na_position='first')

Unnamed: 0,given_name,surname,auralin,novodra,hba1c_start,hba1c_end,hba1c_change
0,veronika,jindrová,41u - 48u,-,7.63,7.20,
2,yukitaka,takenaka,-,39u - 36u,7.68,7.25,
8,saber,ménard,-,54u - 54u,8.08,7.70,
9,asia,woźniak,30u - 36u,-,7.76,7.37,
10,joseph,day,29u - 36u,-,7.70,7.19,
...,...,...,...,...,...,...,...
49,jackson,addison,-,42u - 42u,7.99,7.51,0.98
17,gina,cain,-,36u - 36u,7.88,7.40,0.98
138,giovana,rocha,-,23u - 21u,7.87,7.38,0.99
32,laura,ehrlichmann,-,43u - 40u,7.95,7.46,0.99


In [26]:
treatments_cut.describe()

Unnamed: 0,hba1c_start,hba1c_end,hba1c_change
count,70.0,70.0,42.0
mean,7.838,7.443143,0.51881
std,0.423007,0.418706,0.270719
min,7.51,7.02,0.28
25%,7.64,7.2325,0.34
50%,7.73,7.345,0.37
75%,7.86,7.4675,0.9075
max,9.91,9.46,0.97


In [27]:
adverse_reactions.describe()

Unnamed: 0,given_name,surname,adverse_reaction
count,34,34,34
unique,34,33,6
top,berta,johnson,hypoglycemia
freq,1,2,19


In [28]:
''' 
Labeling for dirty data
Data Quality Dimensions
    - Completeness -> is data missing
    - Validity -> is data valid (negative height, duplicate patient id)
    - Accuracy -> data is valid but not accurate
    - Consistency -> both valid and accurate but write it differently

Order of danger
    completeness >>> validity >>> accuracy >>> consistency


Data cleaning order
    1. Quality -> Completeness
    2. Tidiness
    3. Quality -> Validity
    4. Quality -> Accuracy
    5. Quality -> Consistency


Step for data cleaning
    - Define (solution)
    - Code
    - Test
'''

' \nLabeling for dirty data\nData Quality Dimensions\n    - Completeness -> is data missing\n    - Validity -> is data valid (negative height, duplicate patient id)\n    - Accuracy -> data is valid but not accurate\n    - Consistency -> both valid and accurate but write it differently\n\nOrder of danger\n    completeness >>> validity >>> accuracy >>> consistency\n\n\nData cleaning order\n    1. Quality -> Completeness\n    2. Tidiness\n    3. Quality -> Validity\n    4. Quality -> Accuracy\n    5. Quality -> Consistency\n\n\nStep for data cleaning\n    - Define (solution)\n    - Code\n    - Test\n'

In [29]:
# --------------- Before cleaning make copy of pandas dataframe 

patients_df = patients.copy()
treatments_df = treatments.copy()
treatments_cut_df = treatments_cut.copy()
adverse_reactions_df = adverse_reactions.copy()


In [30]:
# - 12 data missing in address, city, state, zipcode, country, contact (`completion`) --> replace all missing value with "No data"

patients_df['address'].isnull().sum()
patients_df[patients_df['address'].isnull()]
patients_df.fillna("No data", inplace = True)
patients_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503 entries, 0 to 502
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   patient_id    503 non-null    int64  
 1   assigned_sex  503 non-null    object 
 2   given_name    503 non-null    object 
 3   surname       503 non-null    object 
 4   address       503 non-null    object 
 5   city          503 non-null    object 
 6   state         503 non-null    object 
 7   zip_code      503 non-null    object 
 8   country       503 non-null    object 
 9   contact       503 non-null    object 
 10  birthdate     503 non-null    object 
 11  weight        503 non-null    float64
 12  height        503 non-null    int64  
 13  bmi           503 non-null    float64
dtypes: float64(2), int64(2), object(10)
memory usage: 55.1+ KB


  patients_df.fillna("No data", inplace = True)


In [31]:
# - missing value in hba1c_change col (`completion`) --> hba1c_start - hba1c_end

treatments.head()
treatments_df['hba1c_change'] = treatments_df['hba1c_start'] - treatments_df['hba1c_end']
treatments_cut_df['hba1c_change'] = treatments_df['hba1c_start'] - treatments_df['hba1c_end']
treatments_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 280 entries, 0 to 279
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    280 non-null    object 
 1   surname       280 non-null    object 
 2   auralin       280 non-null    object 
 3   novodra       280 non-null    object 
 4   hba1c_start   280 non-null    float64
 5   hba1c_end     280 non-null    float64
 6   hba1c_change  280 non-null    float64
dtypes: float64(3), object(4)
memory usage: 15.4+ KB


In [32]:
#  contact col contains both email and number -> use regex to separate email and phone

import re


data = []

for item in patients['contact']:
    
    item = str(item)

    phone_match = re.search(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', item)
    phone = phone_match.group(0) if phone_match else None

    item = re.sub(r'(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})', '', item)

    email_match = re.search(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)', item)
    email = email_match.group(0) if email_match else None

    data.append({'phone': phone, 'email': email})

patients_df[['phone', 'email']] = pd.DataFrame(data, columns=['phone', 'email'])

patients_df.drop(columns = 'contact', inplace = True)

In [33]:
patients_df

Unnamed: 0,patient_id,assigned_sex,given_name,surname,address,city,state,zip_code,country,birthdate,weight,height,bmi,phone,email
0,1,female,Zoe,Wellish,576 Brown Bear Drive,Rancho California,California,92390.0,United States,7/10/1976,121.7,66,19.6,951-719-9170,ZoeWellish@superrito.com
1,2,female,Pamela,Hill,2370 University Hill Road,Armstrong,Illinois,61812.0,United States,4/3/1967,118.8,66,19.2,(217) 569-3204,PamelaSHill@cuvox.de
2,3,male,Jae,Debord,1493 Poling Farm Road,York,Nebraska,68467.0,United States,2/19/1980,177.8,71,24.8,402-363-6804,JaeMDebord@gustr.com
3,4,male,Liêm,Phan,2335 Webster Street,Woodbridge,NJ,7095.0,United States,7/26/1951,220.9,70,31.7,(732) 636-8246,PhanBaLiem@jourrapide.com
4,5,male,Tim,Neudorf,1428 Turkey Pen Lane,Dothan,AL,36303.0,United States,2/18/1928,192.3,27,26.1,334-515-7487,TimNeudorf@cuvox.de
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,499,male,Mustafa,Lindström,2530 Victoria Court,Milton Mills,ME,3852.0,United States,4/10/1959,181.1,72,24.6,207-477-0579,MustafaLindstrom@jourrapide.com
499,500,male,Ruman,Bisliev,494 Clarksburg Park Road,Sedona,AZ,86341.0,United States,3/26/1948,239.6,70,34.4,928-284-4492,RumanBisliev@gustr.com
500,501,female,Jinke,de Keizer,649 Nutter Street,Overland Park,MO,64110.0,United States,1/13/1971,171.2,67,26.8,816-223-6007,JinkedeKeizer@teleworm.us
501,502,female,Chidalu,Onyekaozulu,3652 Boone Crockett Lane,Seattle,WA,98109.0,United States,2/13/1952,176.9,67,27.7,360 443 2060,ChidaluOnyekaozulu@jourrapide.com1


In [34]:
# merge treatments and treatments_cut table

treatments_df.shape
treatments_cut_df.shape

treatments_df = pd.concat([treatments_df, treatments_cut_df])

In [35]:
treatments_df.shape

(350, 7)

In [36]:
treatments_df = treatments_df.melt(id_vars = ['given_name', 'surname', 'hba1c_start', 'hba1c_end', 'hba1c_change'], var_name = 'Type', value_name = 'dosage_range')

In [37]:
treatments_df = treatments_df[treatments_df['dosage_range'] != '-']

In [38]:
treatments_df['dosage_start'] = treatments_df['dosage_range'].str.split('-').str.get(0)
treatments_df['dosage_end'] = treatments_df['dosage_range'].str.split('-').str.get(1)

In [39]:
treatments_df = treatments_df.drop(columns = 'dosage_range')

In [40]:
treatments_df['dosage_start'] = treatments_df['dosage_start'].str.replace('u', '')
treatments_df['dosage_end'] = treatments_df['dosage_end'].str.replace('u', '')

In [41]:
treatments_df['dosage_start'] = treatments_df['dosage_start'].astype(np.int16)
treatments_df['dosage_end'] = treatments_df['dosage_end'].astype(np.int16)

In [42]:
treatments_df

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,Type,dosage_start,dosage_end
0,veronika,jindrová,7.63,7.20,0.43,auralin,41,48
3,skye,gormanston,7.97,7.62,0.35,auralin,33,36
6,sophia,haugen,7.65,7.27,0.38,auralin,37,42
7,eddie,archer,7.89,7.55,0.34,auralin,31,38
9,asia,woźniak,7.76,7.37,0.39,auralin,30,36
...,...,...,...,...,...,...,...,...
688,christopher,woodward,7.51,7.06,0.40,novodra,55,51
690,maret,sultygov,7.67,7.30,0.46,novodra,26,23
694,lixue,hsueh,9.21,8.80,0.38,novodra,22,23
696,jakob,jakobsen,7.96,7.51,0.31,novodra,28,26


In [43]:
treatments_df = treatments_df.merge(adverse_reactions_df, how = 'left', on = ['given_name', 'surname'])

In [44]:
treatments_df

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,Type,dosage_start,dosage_end,adverse_reaction
0,veronika,jindrová,7.63,7.20,0.43,auralin,41,48,
1,skye,gormanston,7.97,7.62,0.35,auralin,33,36,
2,sophia,haugen,7.65,7.27,0.38,auralin,37,42,
3,eddie,archer,7.89,7.55,0.34,auralin,31,38,
4,asia,woźniak,7.76,7.37,0.39,auralin,30,36,
...,...,...,...,...,...,...,...,...,...
345,christopher,woodward,7.51,7.06,0.40,novodra,55,51,nausea
346,maret,sultygov,7.67,7.30,0.46,novodra,26,23,
347,lixue,hsueh,9.21,8.80,0.38,novodra,22,23,injection site discomfort
348,jakob,jakobsen,7.96,7.51,0.31,novodra,28,26,hypoglycemia
