# Part II: Transform Non-Image Data into OMOP CDM

This part can be skipped if your non-imaging data already exist in OMOP CDM. In this case, you would need to go to the `Part II: Tranform Image Data` notebook.

### Prerequisites
* Download ODBC Driver 18 from web <https://learn.microsoft.com/en-us/sql/connect/odbc/linux-mac/install-microsoft-odbc-driver-sql-server-macos?view=sql-server-ver16>
* OMOP CDM instance with the connection string and authentication information. If your database access limits IP addresses, make sure to add your IP address before running the connection strings.
* Install SQL processing package, i.e., pyodbc.
* Files or database that non-image data is stored: demographic, clinical assessments, labs, visits, etc.

In [2]:
import socket

def get_network_ip():
    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        # doesn't have to be reachable
        s.connect(('10.254.254.254', 1))
        ip_address = s.getsockname()[0]
    except Exception:
        ip_address = '127.0.0.1'
    finally:
        s.close()
    return ip_address

ip_address = get_network_ip()
print(f"My IP address is: {ip_address}")


My IP address is: 192.168.1.153


In [1]:
# !!! redact before publishing !!!

import psycopg2

conn = psycopg2.connect(
    database="adni",
    user="dbadmin",
    password="hopkinsx93ewD",
    host="ohdsicdmdb.postgres.database.azure.com",
    port="5432",
    connect_timeout = 6000
)

cursor = conn.cursor()

### Patient demographic data
- Downloaded from ADNI demographic file (Subject Demographics) and saved in my local folder.
- This will be transformed and loaded to the Person table

In [21]:
import pandas as pd
# Load the file
patient_demo = pd.read_csv('./files/PTDEMOG_28Mar2024.csv')
# Subbset required fields from the file
patient_demo_staging = patient_demo[['PTID', 'PTGENDER', 'PTDOB', 'PTDOBYY', 'PTRACCAT', 'PTETHCAT']].copy() 

In [22]:
# Standard omop gender concepts includes female/male
patient_demo_staging = patient_demo_staging[patient_demo_staging['PTGENDER'].isin([1,2])]

# Assign an integer value for each unique PatientID
patient_demo_staging['person_id'], _ = pd.factorize(patient_demo_staging['PTID'])
patient_demo_staging['person_id'] = patient_demo_staging['person_id'] + 1 

# Codify Gender - replace 'F' with 8532 and 'M' with 8507
gender_codification = {2: 8532, 1: 8507}
patient_demo_staging['gender_concept_id'] = patient_demo_staging['PTGENDER'].replace(gender_codification)

# birth year and month
patient_demo_staging['year_of_birth'] = patient_demo_staging['PTDOBYY']
patient_demo_staging['month_of_birth'] = patient_demo_staging['PTDOB'].str.slice(0,2).astype(int)

# race category
race_codification = {1:8657, 2:8515, 3: 8557, 4:8516, 5:8527}
patient_demo_staging['race_concept_id'] = patient_demo_staging['PTRACCAT'].map(lambda x: race_codification.get(x, 0)).astype(int)

# ethnicity
ethnicity_codification = {1:38003563, 2:38003564}
patient_demo_staging['ethnicity_concept_id'] = patient_demo_staging['PTETHCAT'].map(lambda x: ethnicity_codification.get(x, 0)).astype(int)

# drop duplicate if applicable
patient_demo_staging = patient_demo_staging.drop_duplicates(subset='PTID', keep = 'first')

# source name
patient_demo_staging['source'] = 'ADNI'

patient_demo_staging.head()

Unnamed: 0,PTID,PTGENDER,PTDOB,PTDOBYY,PTRACCAT,PTETHCAT,person_id,gender_concept_id,year_of_birth,month_of_birth,race_concept_id,ethnicity_concept_id,source
0,011_S_0002,1.0,04/1931,1931.0,5,2.0,1,8507.0,1931.0,4,0,38003564,ADNI
1,022_S_0001,2.0,12/1944,1944.0,-4,-4.0,2,8532.0,1944.0,12,0,0,ADNI
2,011_S_0003,1.0,05/1924,1924.0,5,2.0,3,8507.0,1924.0,5,0,38003564,ADNI
3,022_S_0004,1.0,01/1938,1938.0,5,1.0,4,8507.0,1938.0,1,0,38003563,ADNI
4,011_S_0005,1.0,12/1931,1931.0,5,2.0,5,8507.0,1931.0,12,0,38003564,ADNI


In [25]:
# Update PERSON
cursor = conn.cursor()

# Update PERSON
sql = '''
    INSERT INTO adni.person (person_id, gender_concept_id, year_of_birth, month_of_birth, race_concept_id, ethnicity_concept_id, gender_source_value) 
    VALUES (%s,%s,%s,%s,%s,%s,%s)
    '''
for index, row in patient_demo_staging.iterrows():
    cursor.execute(sql, (row['person_id'], row['gender_concept_id'], row['year_of_birth'], row['month_of_birth'], row['race_concept_id'], row['ethnicity_concept_id'], row['PTGENDER']))

conn.commit()

In [39]:
# Create the registry_idmap table if it doesn't exist
# This will keep track of which OMOP Person_id belongs to ADNI PTID

ddl_statement = """
CREATE TABLE adni.registry_idmap(
    source_id varchar(250) NOT NULL,
    person_id integer NOT NULL,
    source_name varchar(250)
);
"""

cursor.execute(ddl_statement)
conn.commit()

sql = '''
    INSERT INTO adni.registry_idmap (source_id, person_id, source_name) 
    VALUES (%s,%s,%s)
    '''
for index, row in patient_demo_staging.iterrows():
    cursor.execute(sql, (row['PTID'], row['person_id'], row['source']))

conn.commit()

In [2]:
import pandas as pd
# Check the updates
sql_query = "SELECT * FROM dbo.registry_idmap"
df_registry_idmap = pd.read_sql_query(sql_query, conn)

# sql_query = "SELECT * FROM person"
# df_person = pd.read_sql_query(sql_query, conn)

cursor.close()
conn.close()

df_registry_idmap

  df_registry_idmap = pd.read_sql_query(sql_query, conn)


Unnamed: 0,source_id,person_id,source_name
0,011_S_0002,1,ADNI
1,022_S_0001,2,ADNI
2,011_S_0003,3,ADNI
3,022_S_0004,4,ADNI
4,011_S_0005,5,ADNI
...,...,...,...
4147,037_S_10063,4148,ADNI
4148,941_S_10085,4149,ADNI
4149,037_S_10062,4150,ADNI
4150,007_S_10075,4151,ADNI


### NPI-Q and NPI Scores

In [27]:
import pandas as pd
# Load the file
npi = pd.read_csv('./files/NPI_10Apr2024.csv')
# Subbset required fields from the file
# This should be selected by the researcher(s)
npi_staging = npi[['PTID', 'Phase', 'VISCODE', 'VISCODE2', 'VISDATE', 'EXAMDATE', 'NPIATOT',
                                     'NPIBTOT', 'NPICTOT', 'NPIDTOT', 'NPIETOT', 'NPIFTOT', 'NPIGTOT', 'NPIHTOT',
                                     'NPIITOT', 'NPIJTOT', 'NPIKTOT', 'NPILTOT', 'NPITOTAL']].copy()

In [28]:
# transform the table from wide to long
# add custom concept for NPI as SNOMED only exists for NPI-Q
# add custom concept for each questions
npi_staging.head()

Unnamed: 0,PTID,Phase,VISCODE,VISCODE2,VISDATE,EXAMDATE,NPIATOT,NPIBTOT,NPICTOT,NPIDTOT,NPIETOT,NPIFTOT,NPIGTOT,NPIHTOT,NPIITOT,NPIJTOT,NPIKTOT,NPILTOT,NPITOTAL
0,098_S_0172,ADNI2,v06,m60,2011-03-14,2011-03-14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,098_S_0160,ADNI2,v06,m60,2011-03-15,2011-03-15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,031_S_0294,ADNI2,v06,m60,2011-03-25,2011-03-25,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,8.0
3,941_S_1203,ADNI2,v06,m48,2011-03-23,2011-03-23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,116_S_1271,ADNI2,v06,m48,2011-03-30,2011-03-30,0.0,3.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,8.0,19.0


In [29]:
# Define the columns to keep (not starting with 'NPI')
cols_to_keep = [col for col in npi_staging.columns if not col.startswith('NPI')]

# Define the columns that need to be transformed
npi_columns = [col for col in npi_staging.columns if col.startswith('NPI')]

# Melt the DataFrame
npi_long = npi_staging.melt(id_vars=cols_to_keep, value_vars=npi_columns,
                    var_name='NPI_section', value_name='NPI_value')

# Extract the first letter after 'NPI' for the 'NPI_section'
npi_long['NPI_section'] = npi_long['NPI_section'].apply(lambda x: x[3])

npi_long.head()

Unnamed: 0,PTID,Phase,VISCODE,VISCODE2,VISDATE,EXAMDATE,NPI_section,NPI_value
0,098_S_0172,ADNI2,v06,m60,2011-03-14,2011-03-14,A,0.0
1,098_S_0160,ADNI2,v06,m60,2011-03-15,2011-03-15,A,0.0
2,031_S_0294,ADNI2,v06,m60,2011-03-25,2011-03-25,A,1.0
3,941_S_1203,ADNI2,v06,m48,2011-03-23,2011-03-23,A,0.0
4,116_S_1271,ADNI2,v06,m48,2011-03-30,2011-03-30,A,0.0


In [30]:
npi_long.shape #npi_long.shape

(90324, 8)

In [31]:
npi_long = npi_long.merge(df_registry_idmap[['source_id', 'person_id']], left_on= 'PTID', right_on = 'source_id', how ='left')

In [15]:
# Import custom concepts
sql_query = "SELECT * FROM dbo.concept WHERE concept_id > 2000000000"
df_concept = pd.read_sql_query(sql_query, conn)

# cursor.close()
# conn.close()

  df_concept = pd.read_sql_query(sql_query, conn)


In [16]:
df_concept

Unnamed: 0,concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_date,valid_end_date,invalid_reason
0,2128000010,Length to End,Measurement,DICOM,DICOM Attributes,,00080001,1993-01-01,2099-12-31,
1,2128000011,Specific Character Set,Measurement,DICOM,DICOM Attributes,,00080005,1993-01-01,2099-12-31,
2,2128000012,Image Type,Measurement,DICOM,DICOM Attributes,,00080008,1993-01-01,2099-12-31,
3,2128000013,Instance Creation Date,Measurement,DICOM,DICOM Attributes,,00080012,1993-01-01,2099-12-31,
4,2128000014,Instance Creation Time,Measurement,DICOM,DICOM Attributes,,00080013,1993-01-01,2099-12-31,
...,...,...,...,...,...,...,...,...,...,...
8043,2128011219,Plane through Anterior Extent,Measurement,DICOM,DICOM Value Sets,,128128,1993-01-01,2099-12-31,
8044,2128011220,Plane through Center,Measurement,DICOM,DICOM Value Sets,,128130,1993-01-01,2099-12-31,
8045,2128011221,Plane through Inferior Extent,Measurement,DICOM,DICOM Value Sets,,128121,1993-01-01,2099-12-31,
8046,2128011222,Plane through Superior Extent,Measurement,DICOM,DICOM Value Sets,,128120,1993-01-01,2099-12-31,


In [20]:
41571200 + 2000000000 #Assessment scales (parent node of NPI-Q) + custom concept id convention

2041571200

In [21]:
# Update CONCEPT for NPI
sql = '''
    INSERT INTO dbo.concept (concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code,valid_start_date,valid_end_date) 
    VALUES 
    (2041571200, 'Neuropsychiatric Inventory', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571201, 'Neuropsychiatric Inventory Section A. Delusions: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571202, 'Neuropsychiatric Inventory Section B. Hallucinations: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571203, 'Neuropsychiatric Inventory Section C. Agitation/Aggression: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571204, 'Neuropsychiatric Inventory Section D. Depression/Dysphoria: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571205, 'Neuropsychiatric Inventory Section E. Anxiety: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571206, 'Neuropsychiatric Inventory Section F. Elation/Euphoria: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571207, 'Neuropsychiatric Inventory Section G. Apathy/Indifference: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571208, 'Neuropsychiatric Inventory Section H. Disinhibition: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571209, 'Neuropsychiatric Inventory Section I. Irritability/Lability: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571210, 'Neuropsychiatric Inventory Section J. Aberrant Motor Behavior: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571211, 'Neuropsychiatric Inventory Section K. Sleep: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571212, 'Neuropsychiatric Inventory Section L. Appetite and eating disorders: Item score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571213, 'Neuropsychiatric Inventory Total Score', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31')
    '''
cursor.execute(sql)
conn.commit()

In [22]:
# Update CONCEPT for NPI-Q
sql = '''
    INSERT INTO dbo.concept (concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,concept_code,valid_start_date,valid_end_date) 
    VALUES 
    (2041571214, 'Neuropsychiatric Inventory Q Delusions', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571215, 'Neuropsychiatric Inventory Q Hallucinations', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571216, 'Neuropsychiatric Inventory Q Agitation/Aggression', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571217, 'Neuropsychiatric Inventory Q Depression/Dysphoria', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571218, 'Neuropsychiatric Inventory Q Anxiety', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571219, 'Neuropsychiatric Inventory Q Elation/Euphoria', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571220, 'Neuropsychiatric Inventory Q Apathy/Indifference', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571221, 'Neuropsychiatric Inventory Q Disinhibition', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571222, 'Neuropsychiatric Inventory Q Irritability/Lability', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571223, 'Neuropsychiatric Inventory Q Aberrant Motor Behavior', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571224, 'Neuropsychiatric Inventory Q Sleep', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31'),
    (2041571225, 'Neuropsychiatric Inventory Q Appetite and eating disorders', 'Measurement', '0', '0', '0', '1994-01-01', '2099-12-31')
    '''
cursor.execute(sql)
conn.commit()

In [6]:
# Import measurement table
sql_query = "SELECT * FROM dbo.measurement"
df_measurement = pd.read_sql_query(sql_query, conn)

# cursor.close()
# conn.close()

  df_measurement = pd.read_sql_query(sql_query, conn)


In [23]:
df_measurement.measurement_id.max() #560169

560169

In [32]:
npi_long.NPI_value.unique()

array([ 0.,  1.,  3., nan,  2.,  8.,  4.,  6.,  9., 12., 19., 20., 10.,
       14., 17.,  5.,  7., 13., 16., 18., 23., 26., 32., 28., 47., 29.,
       11., 22., 24., 15., 37., 25., 27., 34., 21., 31., 33., 46., 43.,
       35., 30., 39., 36., 59., 38., 60., 53., 51., 49., 41., 52., 40.,
       79., 44., 88., 45., 65., 67., 61., 55., 57., 48., 58., 78., 56.,
       42., 54., 71.])

In [119]:
# Drop rows with NPI_value NAN
def is_numeric(value):
    try:
        float(value)
        return True
    except ValueError:
        return False

npi_long_new = npi_long.dropna(subset=['NPI_value']).copy()
npi_long_new['NPI_value'] = npi_long_new['NPI_value'].apply(lambda x: round(float(x), 6) if is_numeric(x) else x)
npi_long_new.shape #(89659, 10)

(89659, 10)

In [120]:
# Assign custom concepts per NPI question section
npi_codes = {
    'A': 2041571201,
    'B': 2041571202,
    'C': 2041571203,
    'D': 2041571204,
    'E': 2041571205,
    'F': 2041571206,
    'G': 2041571207,
    'H': 2041571208,
    'I': 2041571209,
    'J': 2041571210,
    'K': 2041571211,
    'L': 2041571212,
    'T': 2041571213
}

npi_long_new['concept_id'] = npi_long_new['NPI_section'].map(npi_codes)

In [121]:
# Create measurement_id 
# max measurement_id in the db as of 5/17/2024 = 560169
npi_long_new['measurement_id'] = range(560169, 560169 + npi_long_new.shape[0])

In [122]:
npi_long_new

Unnamed: 0,PTID,Phase,VISCODE,VISCODE2,VISDATE,EXAMDATE,NPI_section,NPI_value,source_id,person_id,concept_id,measurement_id
0,098_S_0172,ADNI2,v06,m60,2011-03-14,2011-03-14,A,0.0,098_S_0172,197,2041571201,560169
1,098_S_0160,ADNI2,v06,m60,2011-03-15,2011-03-15,A,0.0,098_S_0160,124,2041571201,560170
2,031_S_0294,ADNI2,v06,m60,2011-03-25,2011-03-25,A,1.0,031_S_0294,236,2041571201,560171
3,941_S_1203,ADNI2,v06,m48,2011-03-23,2011-03-23,A,0.0,941_S_1203,1171,2041571201,560172
4,116_S_1271,ADNI2,v06,m48,2011-03-30,2011-03-30,A,0.0,116_S_1271,1258,2041571201,560173
...,...,...,...,...,...,...,...,...,...,...,...,...
90319,020_S_6504,ADNI3,y3,m36,2021-09-14,2021-09-14,T,0.0,020_S_6504,3460,2041571213,649823
90320,126_S_7060,ADNI3,y2,m24,2024-03-05,2024-03-05,T,2.0,126_S_7060,4013,2041571213,649824
90321,123_S_4127,ADNI3,y5,m144,2023-10-19,2023-10-19,T,1.0,123_S_4127,1869,2041571213,649825
90322,123_S_7125,ADNI3,bl,bl,2023-07-25,2023-07-25,T,6.0,123_S_7125,4078,2041571213,649826


In [123]:
npi_long_new['VISDATE'] = pd.to_datetime(npi_long_new['VISDATE'].str.strip(), errors='coerce')

In [124]:
# Convert Exam date and visit date
npi_long_new['EXAMDATE_DT'] = pd.to_datetime(npi_long_new['EXAMDATE'].str.strip(), errors = 'coerce')

In [125]:
# Look through NA exam dates
npi_long_new[npi_long_new['EXAMDATE_DT'].isna()]

Unnamed: 0,PTID,Phase,VISCODE,VISCODE2,VISDATE,EXAMDATE,NPI_section,NPI_value,source_id,person_id,concept_id,measurement_id,EXAMDATE_DT
513,128_S_0770,ADNI2,v06,m60,NaT,0012-02-14,A,0.0,128_S_0770,723,2041571201,560680,NaT
603,037_S_4410,ADNI2,v03,bl,2012-01-04,,A,0.0,037_S_4410,2117,2041571201,560770,NaT
1678,131_S_5148,ADNI2,v03,bl,NaT,0013-05-06,A,0.0,131_S_5148,2819,2041571201,561840,NaT
2594,072_S_2026,ADNI2,v21,m36,NaT,0013-10-28,A,0.0,072_S_2026,1407,2041571201,562749,NaT
2812,053_S_2396,ADNI2,v21,m36,2014-08-05,,A,0.0,053_S_2396,1731,2041571201,562962,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
90219,941_S_4420,ADNI3,y3,,NaT,,T,0.0,941_S_4420,2304,2041571213,649724,NaT
90256,941_S_4036,ADNI3,y5,m144,2023-04-12,,T,0.0,941_S_4036,1775,2041571213,649761,NaT
90277,012_S_6073,ADNI3,y5,m60,2022-12-13,,T,0.0,012_S_6073,3048,2041571213,649782,NaT
90306,941_S_6044,ADNI3,y5,m60,2022-09-19,,T,0.0,941_S_6044,3018,2041571213,649810,NaT


In [126]:
# Look through NA visit dates
npi_long_new[npi_long_new['VISDATE'].isna()]

Unnamed: 0,PTID,Phase,VISCODE,VISCODE2,VISDATE,EXAMDATE,NPI_section,NPI_value,source_id,person_id,concept_id,measurement_id,EXAMDATE_DT
513,128_S_0770,ADNI2,v06,m60,NaT,0012-02-14,A,0.0,128_S_0770,723,2041571201,560680,NaT
1678,131_S_5148,ADNI2,v03,bl,NaT,0013-05-06,A,0.0,131_S_5148,2819,2041571201,561840,NaT
2594,072_S_2026,ADNI2,v21,m36,NaT,0013-10-28,A,0.0,072_S_2026,1407,2041571201,562749,NaT
5129,129_S_6082,ADNI3,y1,,NaT,,A,0.0,129_S_6082,3054,2041571201,565251,NaT
5351,023_S_2068,ADNI3,y1,,NaT,,A,0.0,023_S_2068,1450,2041571201,565473,NaT
...,...,...,...,...,...,...,...,...,...,...,...,...,...
85970,072_S_2026,ADNI2,v21,m36,NaT,0013-10-28,T,0.0,072_S_2026,1407,2041571213,645526,NaT
88505,129_S_6082,ADNI3,y1,,NaT,,T,0.0,129_S_6082,3054,2041571213,648022,NaT
88727,023_S_2068,ADNI3,y1,,NaT,,T,0.0,023_S_2068,1450,2041571213,648242,NaT
88786,135_S_5273,ADNI3,y2,,NaT,,T,0.0,135_S_5273,2952,2041571213,648301,NaT


In [127]:
# To maximize retaining data, we will use visit date as measurement date
# Drop rows where visit date is NAN
npi_long_new = npi_long_new.dropna(subset=['VISDATE'])
npi_long_new.shape #(89568, 13)

(89568, 13)

In [129]:
sql = '''
    INSERT INTO dbo.measurement (measurement_id, person_id, measurement_concept_id, measurement_date, measurement_type_concept_id, value_as_number, measurement_source_concept_id) 
    VALUES (%s,%s,%s,%s,%s,%s,%s)
'''

# Define batch size
batch_size = 20000
batch = []

# Iterate through the DataFrame
for index, row in npi_long_new.iterrows():
    # Append row data to the batch
    batch.append((row['measurement_id'], row['person_id'], row['concept_id'], row['VISDATE'], 32817, row['NPI_value'], row['concept_id']))

    # Check if batch size is reached or if it's the last row
    if len(batch) == batch_size or (index == len(npi_long_new) - 1):
        # Execute the batch
        cursor.executemany(sql, batch)
        conn.commit()  # Commit after each batch
        batch = []  # Clear the batch for the next set of rows

# Close the cursor and connection
cursor.close()
conn.close()

In [4]:
# check if the measurement table is updated with NPI values

sql_query = "SELECT * FROM dbo.measurement where measurement_source_concept_id between 2041571201 and 2041571213"
df_measurement_npi = pd.read_sql_query(sql_query, conn)

  df_measurement_npi = pd.read_sql_query(sql_query, conn)


In [14]:
df_measurement_npi['measurement_source_concept_id'].unique() #81926

array([2128000026, 2128000979, 2128001008, 2128001273, 2128001274,
       2128000014, 2128000991, 2128000025, 2128000011, 2128000012,
       2128000016, 2128000017, 2128000018, 2128000019, 2128000023,
       2128000024, 2128000034, 2128000103, 2128000105, 2128000109,
       2128000341, 2128000342, 2128000343, 2128000344, 2128000345,
       2128000346, 2128000357, 2128000366, 2128000367, 2128000369,
       2128000370, 2128000371, 2128000372, 2128000373, 2128000374,
       2128000376, 2128000377, 2128000378, 2128000379, 2128000491,
       2128000492, 2128000493, 2128000494, 2128000495, 2128000496,
       2128000575, 2128000977, 2128000978, 2128000993, 2128001207,
       2128001209, 2128001215, 2128001216, 2128001219, 2128001239,
       2128001240, 2128001241, 2128001242, 2128001422, 2128001275,
       2128001276, 2128000368, 2128000723, 2128000724, 2128000728,
       2128000745, 2128001000, 2128000418, 2128000419, 2128000420,
       2128000421, 2128000995, 2128001004, 2128000013, 2128001

In [8]:
agg_npi = df_measurement_npi.groupby('person_id')['measurement_date'].agg(['min', 'max']).reset_index()

In [9]:
agg_npi

Unnamed: 0,person_id,min,max
0,1,2011-09-19,2017-10-18
1,8,2011-09-27,2015-10-29
2,14,2011-11-15,2011-11-15
3,23,2011-10-10,2020-03-16
4,27,2011-10-20,2020-11-17
...,...,...,...
1864,4072,2023-01-17,2023-01-17
1865,4074,2023-03-29,2023-03-29
1866,4075,2023-05-18,2023-05-18
1867,4076,2023-04-14,2023-04-14


### Update the Observation_period table

In [15]:
import pandas as pd
sql = 'select * from dbo.observation_period'
df_obs_period = pd.read_sql_query(sql, conn)

  df_obs_period = pd.read_sql_query(sql, conn)


In [16]:
df_obs_period_npi = df_obs_period.merge(agg_npi, on = 'person_id')
df_obs_period_npi.head()

Unnamed: 0,observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id,min,max
0,1,75,2012-12-11,2021-01-21,32817,2012-12-11,2021-01-11
1,2,134,2013-02-22,2018-03-21,32817,2013-02-22,2018-03-21
2,3,1572,2012-12-07,2022-03-30,32817,2012-12-07,2022-03-30
3,4,1580,2011-12-06,2021-10-20,32817,2011-12-06,2021-10-20
4,5,1594,2013-01-21,2022-01-27,32817,2013-01-21,2022-01-27


In [17]:
df_obs_period_npi['older_date'] = df_obs_period_npi[['observation_period_start_date', 'min']].min(axis = 1)
df_obs_period_npi['later_date'] = df_obs_period_npi[['observation_period_end_date', 'max']].max(axis = 1)
df_obs_period_npi

Unnamed: 0,observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id,min,max,older_date,later_date
0,1,75,2012-12-11,2021-01-21,32817,2012-12-11,2021-01-11,2012-12-11,2021-01-21
1,2,134,2013-02-22,2018-03-21,32817,2013-02-22,2018-03-21,2013-02-22,2018-03-21
2,3,1572,2012-12-07,2022-03-30,32817,2012-12-07,2022-03-30,2012-12-07,2022-03-30
3,4,1580,2011-12-06,2021-10-20,32817,2011-12-06,2021-10-20,2011-12-06,2021-10-20
4,5,1594,2013-01-21,2022-01-27,32817,2013-01-21,2022-01-27,2013-01-21,2022-01-27
5,6,1812,2011-07-20,2021-08-11,32817,2011-07-20,2021-08-05,2011-07-20,2021-08-11
6,7,1858,2011-10-06,2022-01-28,32817,2011-10-06,2022-01-21,2011-10-06,2022-01-28
7,8,1885,2011-11-29,2021-12-21,32817,2011-11-29,2021-12-21,2011-11-29,2021-12-21
8,9,1894,2011-08-29,2021-11-05,32817,2011-08-29,2021-11-05,2011-08-29,2021-11-05
9,10,2161,2012-02-16,2022-01-04,32817,2012-02-16,2022-01-04,2012-02-16,2022-01-04


In [14]:
sql_query = '''
    UPDATE dbo.observation_period
    SET observation_period_start_date = %s, observation_period_end_date = %s
    WHERE person_id = %s
'''

for index, row in df_obs_period_npi.iterrows():
    cursor.execute(sql_query, (row['older_date'], row['later_date'], row['person_id']))

conn.commit()

In [18]:
cursor.close()
conn.close()

## Ingesting diagnosis codes

In [6]:
import pandas as pd

dx = pd.read_csv('./files/DXSUM_diagnosis.csv')

In [7]:
dx.head()

Unnamed: 0,PTID,EXAMDATE,DIAGNOSIS
0,011_S_0002,2005-09-29,1.0
1,011_S_0003,2005-09-30,3.0
2,011_S_0005,2005-09-30,1.0
3,011_S_0008,2005-09-30,1.0
4,022_S_0007,2005-10-06,3.0


In [8]:
idmap = pd.read_csv('./files/registry_idmap.csv')
idmap.head()

Unnamed: 0,source_id,person_id,source_name
0,011_S_0002,1,ADNI
1,022_S_0001,2,ADNI
2,011_S_0003,3,ADNI
3,022_S_0004,4,ADNI
4,011_S_0005,5,ADNI


In [9]:
dx = dx.merge(idmap[["source_id", "person_id"]], left_on = 'PTID', right_on = 'source_id', how = 'left')
dx.head()

Unnamed: 0,PTID,EXAMDATE,DIAGNOSIS,source_id,person_id
0,011_S_0002,2005-09-29,1.0,011_S_0002,1.0
1,011_S_0003,2005-09-30,3.0,011_S_0003,3.0
2,011_S_0005,2005-09-30,1.0,011_S_0005,5.0
3,011_S_0008,2005-09-30,1.0,011_S_0008,8.0
4,022_S_0007,2005-10-06,3.0,022_S_0007,6.0


In [17]:
dx_no_control = dx[dx['DIAGNOSIS']>1].copy().reset_index()
dx_no_control['DIAGNOSIS'].unique()

array([3., 2.])

In [18]:
dx_no_control.shape, dx.shape

((8677, 6), (13842, 5))

In [13]:
mapping = {2: 4297400, 3: 4182210}

In [19]:
dx_no_control['id'] = range(1,len(dx_no_control)+1)
dx_no_control['concept_id'] = dx_no_control['DIAGNOSIS'].map(mapping)
dx_no_control['condition_type_concept_id'] = 32817

In [20]:
dx_no_control

Unnamed: 0,index,PTID,EXAMDATE,DIAGNOSIS,source_id,person_id,id,concept_id,condition_type_concept_id
0,1,011_S_0003,2005-09-30,3.0,011_S_0003,3.0,1,4182210,32817
1,4,022_S_0007,2005-10-06,3.0,022_S_0007,6.0,2,4182210,32817
2,6,023_S_0030,2005-10-20,2.0,023_S_0030,26.0,3,4297400,32817
3,13,023_S_0042,2005-11-10,2.0,023_S_0042,35.0,4,4297400,32817
4,14,067_S_0029,2005-11-11,3.0,067_S_0029,25.0,5,4182210,32817
...,...,...,...,...,...,...,...,...,...
8672,13828,016_S_10138,2024-06-12,2.0,,,8673,4297400,32817
8673,13831,014_S_6502,2024-06-14,2.0,014_S_6502,3450.0,8674,4297400,32817
8674,13834,033_S_7079,2024-06-13,2.0,033_S_7079,4032.0,8675,4297400,32817
8675,13835,019_S_10141,2024-06-13,2.0,,,8676,4297400,32817


In [21]:
dx_no_control.to_csv('./files/dx_for_omop.csv')

In [None]:
# Update CONDITION_OCCURRENCE
sql = '''
    INSERT INTO dbo.condition_occurrence (condition_occurrence_id, person_id, condition_concept_id, condition_start_date, condition_type_concept_id) 
    VALUES (%s,%s,%s,%s,%s)
    '''
for index, row in dx_no_control.iterrows():
    cursor.execute(sql, (row['id'], row['person_id'], row['concept_id'], row['EXAMDATE'], row['condition_type_concept_id']))

conn.commit()

### Mini-Mental Evaluation scores

In [4]:
import pandas as pd
mmse = pd.read_csv('./files/MMSE_08Aug2024.csv')
mmse.head()

Unnamed: 0,PHASE,PTID,RID,VISCODE,VISCODE2,VISDATE,DONE,NDREASON,SOURCE,MMDATE,...,MMDRAW,MMSCORE,ID,SITEID,USERDATE,USERDATE2,DD_CRF_VERSION_LABEL,LANGUAGE_CODE,HAS_QC_ERROR,update_stamp
0,ADNI1,011_S_0002,2,sc,sc,2005-08-17,,,,1.0,...,1.0,28.0,10,107,2005-08-17,,,,,2005-08-17 00:00:00.0
1,ADNI1,022_S_0001,1,f,f,2005-08-18,,,,1.0,...,0.0,28.0,12,10,2005-08-18,,,,,2005-08-18 00:00:00.0
2,ADNI1,011_S_0003,3,sc,sc,2005-08-18,,,,0.0,...,1.0,20.0,14,107,2005-08-18,,,,,2005-08-18 00:00:00.0
3,ADNI1,022_S_0004,4,sc,sc,2005-08-18,,,,1.0,...,0.0,27.0,16,10,2005-08-18,,,,,2005-08-18 00:00:00.0
4,ADNI1,011_S_0005,5,sc,sc,2005-08-23,,,,1.0,...,1.0,29.0,18,107,2005-08-23,,,,,2005-08-23 00:00:00.0


In [5]:
mmse.columns

Index(['PHASE', 'PTID', 'RID', 'VISCODE', 'VISCODE2', 'VISDATE', 'DONE',
       'NDREASON', 'SOURCE', 'MMDATE', 'MMYEAR', 'MMMONTH', 'MMDAY',
       'MMSEASON', 'MMHOSPIT', 'MMFLOOR', 'MMCITY', 'MMAREA', 'MMSTATE',
       'WORDLIST', 'WORD1', 'WORD2', 'WORD3', 'MMTRIALS', 'MMD', 'MML', 'MMR',
       'MMO', 'MMW', 'MMLTR1', 'MMLTR2', 'MMLTR3', 'MMLTR4', 'MMLTR5',
       'MMLTR6', 'MMLTR7', 'WORLDSCORE', 'WORD1DL', 'WORD2DL', 'WORD3DL',
       'MMWATCH', 'MMPENCIL', 'MMREPEAT', 'MMHAND', 'MMFOLD', 'MMONFLR',
       'MMREAD', 'MMWRITE', 'MMDRAW', 'MMSCORE', 'ID', 'SITEID', 'USERDATE',
       'USERDATE2', 'DD_CRF_VERSION_LABEL', 'LANGUAGE_CODE', 'HAS_QC_ERROR',
       'update_stamp'],
      dtype='object')

In [6]:
data_dic = pd.read_csv('./files/DATADIC_08Aug2024.csv')
data_dic.head()

Unnamed: 0,PHASE,CRFNAME,TBLNAME,FLDNAME,TEXT,TYPE,LENGTH,DD_CRF_VERSION,CODE,UNITS,STATUS,CODE_CHANGES,MAPPING_NOTES
0,ADNI1,ADAS-Cognitive Behavior,ADAS,PTID,Participant ID,,,,,,,,
1,ADNI1,ADAS-Cognitive Behavior,ADAS,RID,Participant roster ID,N,38 digits,,,,,,
2,ADNI1,ADAS-Cognitive Behavior,ADAS,VISCODE,Visit code,T,20 characters,,,,,,
3,ADNI1,ADAS-Cognitive Behavior,ADAS,EXAMDATE,Examination Date,D,10,,,,,,
4,ADNI1,ADAS-Cognitive Behavior,ADAS,VISDATE,Assessment EXAMDATE when present; otherwise Re...,D,,,,,,,


In [9]:
data_dic[data_dic['TBLNAME']=="MMSE"]['FLDNAME'].unique()

array(['PTID', 'RID', 'VISCODE', 'EXAMDATE', 'VISDATE', 'MMDATE',
       'MMDATECM', 'MMYEAR', 'MMYEARCM', 'MMMONTH', 'MMMNTHCM', 'MMDAY',
       'MMDAYCM', 'MMSEASON', 'MMSESNCM', 'MMHOSPIT', 'MMHOSPCM',
       'MMFLOOR', 'MMFLRCM', 'MMCITY', 'MMCITYCM', 'MMAREA', 'MMAREACM',
       'MMSTATE', 'MMSTCM', 'MMBALL', 'MMFLAG', 'MMTREE', 'MMTRIALS',
       'MMD', 'MMDLTR', 'MML', 'MMLLTR', 'MMR', 'MMRLTR', 'MMO', 'MMOLTR',
       'MMW', 'MMWLTR', 'MMBALLDL', 'MMFLAGDL', 'MMTREEDL', 'MMWATCH',
       'MMPENCIL', 'MMREPEAT', 'MMHAND', 'MMFOLD', 'MMONFLR', 'MMREAD',
       'MMWRITE', 'MMDRAW', 'MMSCORE', 'ID', 'SITEID', 'USERDATE',
       'USERDATE2', 'VISCODE2', 'MMRECALL', 'MMLTR1', 'MMLTR2', 'MMLTR3',
       'MMLTR4', 'MMLTR5', 'MMLTR6', 'MMLTR7', 'WORLDSCORE', 'DONE',
       'NDREASON', 'WORDLIST', 'WORD1', 'WORD2', 'WORD3', 'WORD1DL',
       'WORD2DL', 'WORD3DL', 'DATE', 'SOURCE', 'DD_CRF_VERSION_LABEL',
       'LANGUAGE_CODE', 'HAS_QC_ERROR'], dtype=object)