# 01 Profile of the input tables

In [1]:
import os

from smv import SmvApp
from smv.smvconfig import SmvConfig
import pyspark.sql.functions as F
import pyspark.sql.types as T

# `sc` and `spark` are not defined anywhere in this notebook; they are expected
# to be injected by the kernel (e.g. a PySpark kernel) before this cell runs.
sc.setLogLevel("ERROR")

# Directory of the SMV application to load. It can be overridden with the
# OMOP2GRAPH_APP_DIR environment variable so the notebook is not tied to one
# user's home directory; the default preserves the original location.
SMV_APP_DIR = os.environ.get("OMOP2GRAPH_APP_DIR", "/home/bzhang/OMOP2Graph")

smv_args = [
    "--smv-app-dir",
    SMV_APP_DIR,
]
smvconf = SmvConfig(smv_args)
app = SmvApp.createInstance(smvconf, spark)

# Loading smvshell tools
# To see help:
#   > help(smv.smvshell)
# NOTE(review): this wildcard import is presumably where ls(), fullRun() and
# df() used in later cells come from — confirm against smv.smvshell.
from smv.smvshell import *

In [2]:
# List the modules known to the SMV app (in the recorded run: the `omop`
# input modules such as input.CONCEPT and input.DOMAIN).
ls()


omop:
  (I) input.CONCEPT
  (I) input.CONCEPT_ANCESTOR
  (I) input.CONCEPT_CLASS
  (I) input.CONCEPT_RELATIONSHIP
  (I) input.CONCEPT_SYNONYM
  (I) input.DOMAIN
  (I) input.DRUG_STRENGTH
  (I) input.RELATIONSHIP
  (I) input.VOCABULARY



In [3]:
# Run the input.CONCEPT module and keep the resulting DataFrame for profiling.
concept = fullRun("input.CONCEPT")

02/01/2025 09:30:18 INFO smv: STARTING FORCE AN ACTION FOR DQM on omop.input.CONCEPT
02/01/2025 09:30:48 INFO smv: COMPLETED FORCE AN ACTION FOR DQM: omop.input.CONCEPT
02/01/2025 09:30:48 INFO smv: RunTime: 0:00:29.336465


In [4]:
# Print a single CONCEPT record as "column : type = value" lines to inspect the schema.
concept.peek()

concept_id : string = 38001666
concept_name : string = Level I Photochemotherapy
domain_id : string = Observation
vocabulary_id : string = APC
concept_class_id : string = APC
standard_concept : string = None
concept_code : string = 0001
valid_start_date : date = 2011-01-01
valid_end_date : date = 2014-12-31
invalid_reason : string = D


In [7]:
# Run the input.DOMAIN module and keep the resulting DataFrame.
domain = fullRun("input.DOMAIN")

02/01/2025 09:32:47 INFO smv: STARTING FORCE AN ACTION FOR DQM on omop.input.DOMAIN
02/01/2025 09:32:47 INFO smv: COMPLETED FORCE AN ACTION FOR DQM: omop.input.DOMAIN
02/01/2025 09:32:47 INFO smv: RunTime: 0:00:00.131458
02/01/2025 09:32:47 INFO smv: STARTING GENERATE USER METADATA on omop.input.DOMAIN
02/01/2025 09:32:47 INFO smv: COMPLETED GENERATE USER METADATA: omop.input.DOMAIN
02/01/2025 09:32:47 INFO smv: RunTime: 0:00:00.000003


In [8]:
# Print a single DOMAIN record to inspect column names and types.
domain.peek()

domain_id : string = Cost
domain_name : string = Cost
domain_concept_id : int = 581456


In [11]:
# Show the DOMAIN row(s) whose domain_id is "Observation".
domain.filter(domain["domain_id"] == "Observation").show()

+-----------+-----------+-----------------+
|  domain_id|domain_name|domain_concept_id|
+-----------+-----------+-----------------+
|Observation|Observation|               27|
+-----------+-----------+-----------------+



In [12]:
# Run the CONCEPT_CLASS module (logged as omop.input.CONCEPT_CLASS) and keep its DataFrame.
cclass = fullRun("CONCEPT_CLASS")

02/01/2025 09:35:07 INFO smv: STARTING FORCE AN ACTION FOR DQM on omop.input.CONCEPT_CLASS
02/01/2025 09:35:07 INFO smv: COMPLETED FORCE AN ACTION FOR DQM: omop.input.CONCEPT_CLASS
02/01/2025 09:35:07 INFO smv: RunTime: 0:00:00.124276
02/01/2025 09:35:07 INFO smv: STARTING GENERATE USER METADATA on omop.input.CONCEPT_CLASS
02/01/2025 09:35:07 INFO smv: COMPLETED GENERATE USER METADATA: omop.input.CONCEPT_CLASS
02/01/2025 09:35:07 INFO smv: RunTime: 0:00:00.000002


In [13]:
# Print a single CONCEPT_CLASS record to inspect column names and types.
cclass.peek()

concept_class_id : string = Qualifier Value
concept_class_name : string = Qualifier Value
concept_class_concept_id : string = 44819021


In [14]:
# Row count of DOMAIN; small enough to list the table in full in the next cell.
domain.count()

50

In [15]:
# List up to 50 DOMAIN rows (the whole table in the recorded run).
domain.show(n=50)

+-------------------+--------------------+-----------------+
|          domain_id|         domain_name|domain_concept_id|
+-------------------+--------------------+-----------------+
|               Cost|                Cost|           581456|
|   Condition/Device|    Condition/Device|              235|
|             Gender|              Gender|                2|
|               Race|                Race|                3|
|          Ethnicity|           Ethnicity|                4|
|           Metadata|            Metadata|                7|
|              Visit|               Visit|                8|
|          Procedure|           Procedure|               10|
|           Modifier|            Modifier|               12|
|               Drug|                Drug|               13|
|              Route|Route Of Administ...|               15|
|               Unit|                Unit|               16|
|             Device|              Device|               17|
|          Condition|   

In [16]:
# Inspect one CONCEPT record from the "Drug" domain.
concept.filter(concept["domain_id"] == "Drug").peek()

concept_id : string = 2614714
concept_name : string = Chlorhexidine containing antiseptic, 1 ml
domain_id : string = Drug
vocabulary_id : string = HCPCS
concept_class_id : string = HCPCS
standard_concept : string = None
concept_code : string = A4248
valid_start_date : date = 2004-01-01
valid_end_date : date = 2099-12-31
invalid_reason : string = None


In [17]:
# Run the VOCABULARY module (logged as omop.input.VOCABULARY) and keep its DataFrame.
v = fullRun("VOCABULARY")

02/01/2025 09:38:41 INFO smv: STARTING FORCE AN ACTION FOR DQM on omop.input.VOCABULARY
02/01/2025 09:38:41 INFO smv: COMPLETED FORCE AN ACTION FOR DQM: omop.input.VOCABULARY
02/01/2025 09:38:41 INFO smv: RunTime: 0:00:00.122613
02/01/2025 09:38:41 INFO smv: STARTING GENERATE USER METADATA on omop.input.VOCABULARY
02/01/2025 09:38:41 INFO smv: COMPLETED GENERATE USER METADATA: omop.input.VOCABULARY
02/01/2025 09:38:41 INFO smv: RunTime: 0:00:00.000002


In [18]:
# Row count of VOCABULARY; small enough to list the table in full in the next cell.
v.count()

56

In [19]:
# List up to 56 VOCABULARY rows (the whole table in the recorded run).
v.show(n=56)

+--------------------+--------------------+--------------------+--------------------+---------------------+
|       vocabulary_id|     vocabulary_name|vocabulary_reference|  vocabulary_version|vocabulary_concept_id|
+--------------------+--------------------+--------------------+--------------------+---------------------+
|              ICD9CM|International Cla...|http://www.cms.go...|ICD9CM v32 master...|                 5046|
|                 NDC|National Drug Cod...|http://www.nlm.ni...|        NDC 20240825|             44819105|
|          Visit Type|     OMOP Visit Type|      OMOP generated|                NULL|             44819150|
|           Drug Type|OMOP Drug Exposur...|      OMOP generated|                NULL|             44819126|
|                Plan|    OMOP Health Plan|      OMOP generated|                NULL|                32471|
|      Procedure Type|OMOP Procedure Oc...|      OMOP generated|                NULL|             44819128|
|            ICD9Proc|Intern

In [22]:
# Which vocabularies contribute concepts to the "Procedure" domain?
concept.filter(concept["domain_id"] == "Procedure").smvHist("vocabulary_id")



+-------------+------+
|vocabulary_id| count|
+-------------+------+
|     ICD10PCS|196048|
|       SNOMED| 83542|
|     ICD9Proc|  4657|
|        HCPCS|  1342|
|       ICD9CM|   214|
|        ICD10|   178|
+-------------+------+



                                                                                

In [23]:
# Domain breakdown of the concepts in the "ICD10" vocabulary.
concept.filter(concept["vocabulary_id"] == "ICD10").smvHist("domain_id")



+-----------+-----+
|  domain_id|count|
+-----------+-----+
|  Condition|14113|
|Observation| 2233|
|  Procedure|  178|
|Measurement|  114|
+-----------+-----+



                                                                                

In [24]:
# Domain breakdown of the concepts in the "ICD9CM" vocabulary.
concept.filter(concept["vocabulary_id"] == "ICD9CM").smvHist("domain_id")



+-----------+-----+
|  domain_id|count|
+-----------+-----+
|  Condition|14929|
|Observation| 2204|
|Measurement|  217|
|  Procedure|  214|
+-----------+-----+



                                                                                

In [25]:
# Domain breakdown of the concepts in the "ICD9Proc" vocabulary.
concept.filter(concept["vocabulary_id"] == "ICD9Proc").smvHist("domain_id")



+---------+-----+
|domain_id|count|
+---------+-----+
|Procedure| 4657|
+---------+-----+



                                                                                

In [26]:
# Domain breakdown of the concepts in the "HCPCS" vocabulary.
concept.filter(concept["vocabulary_id"] == "HCPCS").smvHist("domain_id")



+-----------+-----+
|  domain_id|count|
+-----------+-----+
|     Device| 5272|
|Observation| 3309|
|       Drug| 1971|
|  Procedure| 1342|
|Measurement|  184|
|  Condition|    1|
+-----------+-----+



                                                                                

In [10]:
# Run the CONCEPT_ANCESTOR module (logged as omop.input.CONCEPT_ANCESTOR);
# the recorded run took about a minute.
h = fullRun("CONCEPT_ANCESTOR")

02/02/2025 12:38:00 INFO smv: STARTING FORCE AN ACTION FOR DQM on omop.input.CONCEPT_ANCESTOR
02/02/2025 12:38:59 INFO smv: COMPLETED FORCE AN ACTION FOR DQM: omop.input.CONCEPT_ANCESTOR
02/02/2025 12:38:59 INFO smv: RunTime: 0:00:58.909597
02/02/2025 12:38:59 INFO smv: STARTING GENERATE USER METADATA on omop.input.CONCEPT_ANCESTOR
02/02/2025 12:38:59 INFO smv: COMPLETED GENERATE USER METADATA: omop.input.CONCEPT_ANCESTOR
02/02/2025 12:38:59 INFO smv: RunTime: 0:00:00.000003


In [6]:
# Print a single CONCEPT_ANCESTOR record to inspect column names and types.
h.peek()

ancestor_concept_id : string = 37018424
descendant_concept_id : string = 4235769
min_levels_of_separation : int = 4
max_levels_of_separation : int = 4


In [9]:
# Histogram of min_levels_of_separation, with the integer level cast to a
# string before calling smvHist. The alias keeps the original column name.
min_levels_as_text = (
    h["min_levels_of_separation"]
    .cast("string")
    .alias("min_levels_of_separation")
)
h.select(min_levels_as_text).smvHist("min_levels_of_separation")

                                                                                

+------------------------+-------+
|min_levels_of_separation|  count|
+------------------------+-------+
|                       2|7901077|
|                       1|7539680|
|                       3|4719412|
|                       0|4594111|
|                       4|2174494|
|                       5|1559381|
|                       6| 955329|
|                       7| 488244|
|                       8| 257083|
|                       9| 130717|
|                      10|  67723|
|                      11|  32689|
|                      12|  16401|
|                      13|   7795|
|                      14|   3637|
|                      15|   1735|
|                      16|    786|
|                      17|    333|
|                      18|    126|
|                      19|     49|
+------------------------+-------+
only showing top 20 rows



In [11]:
# Total number of ancestor/descendant rows in CONCEPT_ANCESTOR.
h.count()

                                                                                

30450837

In [12]:
# Search for a column combination that uniquely identifies CONCEPT_ANCESTOR rows.
# NOTE(review): the recorded unique_count tops out at 5000 although the table has
# ~30M rows, which suggests the search works on a sample — confirm before
# treating the reported columns as a verified primary key.
h.smvDiscoverPK()

selected_keys: [], unique_count: 0
selected_keys: ['descendant_concept_id'], unique_count: 4178
selected_keys: ['descendant_concept_id', 'ancestor_concept_id'], unique_count: 5000


['descendant_concept_id', 'ancestor_concept_id']

In [57]:
# Run the derived CONCEPT_WITH_CHILDREN module (logged elsewhere in this
# notebook as omop.links.CONCEPT_WITH_CHILDREN) and keep its DataFrame.
t = fullRun("CONCEPT_WITH_CHILDREN")

In [32]:
# Print a single CONCEPT_WITH_CHILDREN record: the CONCEPT columns plus
# ancestor_concept_id and max_levels.
t.peek()

concept_id : string = 1000641
concept_name : string = clotrimazole 10 MG/ML / dexamethasone 0.4 MG/ML Topical Cream
domain_id : string = Drug
vocabulary_id : string = RxNorm
concept_class_id : string = Clinical Drug
standard_concept : string = S
concept_code : string = 246752
valid_start_date : date = 1970-01-01
valid_end_date : date = 2099-12-31
invalid_reason : string = None
ancestor_concept_id : string = 1000641
max_levels : int = 0


In [33]:
# Histogram of max_levels, with the integer level cast to a string before
# calling smvHist. The alias keeps the original column name.
max_levels_as_text = t["max_levels"].cast("string").alias("max_levels")
t.select(max_levels_as_text).smvHist("max_levels")

+----------+-------+
|max_levels|  count|
+----------+-------+
|         0|1513575|
|         1| 836059|
|         2| 491710|
|         3| 174504|
|         4|  17858|
|         5|   5707|
|         6|   1967|
|         7|   1168|
|         8|   1027|
|        10|    733|
|         9|    488|
|        11|    212|
|        12|    131|
|        13|     91|
|        14|     77|
|        15|     52|
|        16|     35|
|        18|     21|
|        17|     19|
|        20|     13|
+----------+-------+
only showing top 20 rows



In [34]:
# Frequency of each domain_id in CONCEPT_WITH_CHILDREN (the recorded output includes a NULL bucket).
t.smvHist('domain_id')

+------------------+-------+
|         domain_id|  count|
+------------------+-------+
|              Drug|2058180|
|         Procedure| 247057|
|       Measurement| 228864|
|         Geography| 203563|
|       Observation| 115959|
|         Condition|  98721|
|Spec Anatomic Site|  36836|
|            Device|  31718|
|              NULL|  14190|
|        Meas Value|   4067|
|          Specimen|   1870|
|          Metadata|   1061|
|              Unit|   1038|
|          Language|    836|
|      Revenue Code|    545|
|      Relationship|    209|
|             Route|    165|
|             Payer|    162|
|             Visit|    101|
|          Provider|     87|
+------------------+-------+
only showing top 20 rows



In [36]:
# Domain breakdown restricted to rows with max_levels of 5 or more.
t.filter(t["max_levels"] >= 5).smvHist("domain_id")

+------------------+-----+
|         domain_id|count|
+------------------+-----+
|         Condition| 3326|
|              Drug| 2342|
|Spec Anatomic Site| 1957|
|         Procedure| 1824|
|       Observation| 1049|
|          Metadata|  863|
|         Geography|  113|
|       Measurement|  109|
|            Device|   93|
|          Specimen|   67|
|              NULL|   21|
|        Meas Value|   10|
|      Relationship|    2|
|          Language|    2|
|             Route|    1|
+------------------+-----+



In [51]:
# Run the STUDY_DOMAIN_DEEPTH module; per the recorded log this also runs and
# persists its upstream CONCEPT_WITH_CHILDREN module.
std1 = fullRun("STUDY_DOMAIN_DEEPTH")

02/02/2025 01:27:32 INFO smv: STARTING RUN & PERSIST OUTPUT on omop.links.CONCEPT_WITH_CHILDREN
02/02/2025 01:27:59 INFO smv: COMPLETED RUN & PERSIST OUTPUT: omop.links.CONCEPT_WITH_CHILDREN
02/02/2025 01:27:59 INFO smv: RunTime: 0:00:26.705802
02/02/2025 01:27:59 INFO smv: STARTING GENERATE USER METADATA on omop.links.CONCEPT_WITH_CHILDREN
02/02/2025 01:27:59 INFO smv: COMPLETED GENERATE USER METADATA: omop.links.CONCEPT_WITH_CHILDREN
02/02/2025 01:27:59 INFO smv: RunTime: 0:00:00.000002
02/02/2025 01:27:59 INFO smv: STARTING RUN & PERSIST OUTPUT on omop.links.STUDY_DOMAIN_DEEPTH
02/02/2025 01:27:59 INFO smv: COMPLETED RUN & PERSIST OUTPUT: omop.links.STUDY_DOMAIN_DEEPTH
02/02/2025 01:27:59 INFO smv: RunTime: 0:00:00.350916
02/02/2025 01:27:59 INFO smv: STARTING GENERATE USER METADATA on omop.links.STUDY_DOMAIN_DEEPTH
02/02/2025 01:27:59 INFO smv: COMPLETED GENERATE USER METADATA: omop.links.STUDY_DOMAIN_DEEPTH
02/02/2025 01:27:59 INFO smv: RunTime: 0:00:00.000003


In [52]:
# Number of (domain_id, max_levels) rows in the study table.
std1.count()

214

In [53]:
# Show the study table ordered by domain and depth; the 300-row limit exceeds
# the 214 rows counted above, so nothing is truncated.
std1.sort("domain_id", "max_levels").show(n=300)

+-------------------+----------+-------+
|          domain_id|max_levels|  count|
+-------------------+----------+-------+
|          Condition|         0| 145217|
|          Condition|         1|  20803|
|          Condition|         2|   9589|
|          Condition|         3|   4574|
|          Condition|         4|   2426|
|          Condition|         5|   1329|
|          Condition|         6|    783|
|          Condition|         7|    459|
|          Condition|         8|    312|
|          Condition|         9|    181|
|          Condition|        10|    117|
|          Condition|        11|     53|
|          Condition|        12|     35|
|          Condition|        13|     23|
|          Condition|        14|     16|
|          Condition|        15|     11|
|          Condition|        16|      4|
|          Condition|        17|      2|
|          Condition|        18|      1|
|   Condition Status|         0|     15|
|   Condition Status|         1|      6|
|   Condition St

In [54]:
# Run the STUDY_VOCABULARY_DEEPTH module and keep its DataFrame.
std2 = fullRun("STUDY_VOCABULARY_DEEPTH")

02/02/2025 01:28:30 INFO smv: STARTING RUN & PERSIST OUTPUT on omop.links.STUDY_VOCABULARY_DEEPTH
02/02/2025 01:28:30 INFO smv: COMPLETED RUN & PERSIST OUTPUT: omop.links.STUDY_VOCABULARY_DEEPTH
02/02/2025 01:28:30 INFO smv: RunTime: 0:00:00.360974
02/02/2025 01:28:31 INFO smv: STARTING GENERATE USER METADATA on omop.links.STUDY_VOCABULARY_DEEPTH
02/02/2025 01:28:31 INFO smv: COMPLETED GENERATE USER METADATA: omop.links.STUDY_VOCABULARY_DEEPTH
02/02/2025 01:28:31 INFO smv: RunTime: 0:00:00.000004


In [55]:
# Number of (vocabulary_id, max_levels) rows in the study table.
std2.count()

146

In [56]:
# Show the study table ordered by vocabulary and depth; the 200-row limit
# exceeds the 146 rows counted above, so nothing is truncated.
std2.sort("vocabulary_id", "max_levels").show(n=200)

+--------------------+----------+-------+
|       vocabulary_id|max_levels|  count|
+--------------------+----------+-------+
|                 APC|         0|   1910|
|                 CDM|         0|      2|
|                 CDM|         1|    103|
|                 CDM|         2|     69|
|                 CDM|         3|      1|
|                 CDM|         4|     22|
|                 CDM|         5|      6|
|                 CDM|         6|     59|
|                 CDM|         7|     35|
|                 CDM|         8|    273|
|                 CDM|         9|     18|
|                 CDM|        10|    432|
|                 CDM|        11|     40|
|CMS Place of Service|         0|     61|
|CMS Place of Service|         1|      1|
|CMS Place of Service|         2|      1|
|       Concept Class|         0|    423|
|    Condition Status|         0|     15|
|    Condition Status|         1|      6|
|    Condition Status|         2|      1|
|      Condition Type|         0| 

In [45]:
# Fetch CONCEPT through df() for the vocabulary histogram and row count below.
c = df("CONCEPT")

In [46]:
# Frequency of each vocabulary_id across all of CONCEPT.
c.smvHist("vocabulary_id")



+----------------+-------+
|   vocabulary_id|  count|
+----------------+-------+
|RxNorm Extension|2146945|
|             NDC|1220572|
|          SNOMED|1084286|
|          RxNorm| 309670|
|    OMOP Genomic| 289889|
|             OSM| 203339|
|        ICD10PCS| 196224|
|          ICD9CM|  17564|
|           ICD10|  16638|
|           HCPCS|  12079|
|        ICD9Proc|   4657|
|             APC|   1910|
|             DRG|   1362|
|            UCUM|   1127|
|             CDM|   1060|
|    Relationship|    718|
|    Revenue Code|    538|
|   Concept Class|    423|
|   UB04 Typ bill|    298|
|            SOPT|    168|
+----------------+-------+
only showing top 20 rows



                                                                                

In [47]:
# Total number of CONCEPT rows, as a baseline for the derived table's count.
c.count()

                                                                                

5510609

In [58]:
# Row count of CONCEPT_WITH_CHILDREN, to compare against the CONCEPT count above.
t.count()

5510609

In [59]:
# Print a single CONCEPT_WITH_CHILDREN record again to re-check its columns.
t.peek()

concept_id : string = 1000599
concept_name : string = ondansetron 2 MG/ML
domain_id : string = Drug
vocabulary_id : string = RxNorm
concept_class_id : string = Clinical Drug Comp
standard_concept : string = S
concept_code : string = 328448
valid_start_date : date = 1970-01-01
valid_end_date : date = 2099-12-31
invalid_reason : string = None
ancestor_concept_id : string = 1000599
max_levels : int = 3


In [50]:
# Inspect one row that has no vocabulary_id.
# NOTE(review): in the recorded run every CONCEPT column of that row is None while
# ancestor_concept_id is populated — looks like an ancestor id with no matching
# CONCEPT record; confirm against the CONCEPT_WITH_CHILDREN definition.
t.filter(t["vocabulary_id"].isNull()).peek()

concept_id : string = None
concept_name : string = None
domain_id : string = None
vocabulary_id : string = None
concept_class_id : string = None
standard_concept : string = None
concept_code : string = None
valid_start_date : date = None
valid_end_date : date = None
invalid_reason : string = None
ancestor_concept_id : string = 1389536
max_levels : int = 0


In [60]:
# Fetch the STUDY_DOMAIN_VOCABULARY_DEEPTH module's DataFrame through df().
x = df("STUDY_DOMAIN_VOCABULARY_DEEPTH")

In [61]:
# Number of (domain_id, vocabulary_id, max_levels) rows in the study table.
x.count()

303

In [62]:
# Print a single record to inspect the study table's columns and types.
x.peek()

domain_id : string = Procedure
vocabulary_id : string = SNOMED
max_levels : int = 6
count : bigint = 374


In [63]:
# Show the study table ordered by domain, vocabulary and depth; the 400-row
# limit exceeds the 303 rows counted above, so nothing is truncated.
x.sort("domain_id", "vocabulary_id", "max_levels").show(n=400)

+-------------------+--------------------+----------+-------+
|          domain_id|       vocabulary_id|max_levels|  count|
+-------------------+--------------------+----------+-------+
|          Condition|               HCPCS|         0|      1|
|          Condition|               ICD10|         0|  14113|
|          Condition|              ICD9CM|         0|  14929|
|          Condition|              SNOMED|         0| 116174|
|          Condition|              SNOMED|         1|  20803|
|          Condition|              SNOMED|         2|   9589|
|          Condition|              SNOMED|         3|   4574|
|          Condition|              SNOMED|         4|   2426|
|          Condition|              SNOMED|         5|   1329|
|          Condition|              SNOMED|         6|    783|
|          Condition|              SNOMED|         7|    459|
|          Condition|              SNOMED|         8|    312|
|          Condition|              SNOMED|         9|    181|
|       