# Now, bringing it all together 

Now that we have demonstrated loading the data (XPT) and the metadata (APIs), how do we link it all together?

First off, what questions are we looking to answer?
* Can I work out what version of SDTM was used to build the SDTM dataset?
* Can I get the metadata for a column in my SDTM dataset?
* Can I confirm all the coded terms in my dataset?


In [None]:
# First off - imports
import pandas as pd
from pandas import DataFrame

# Don't repeat yourself.....
from utils import load_cdiscpilot_dataset

In [None]:
# Can I get the metadata for a column in my SDTM dataset?

# Step one: read the DM dataset through the shared loader.
# The DataFrame annotation is optional - it only documents the expected type.
dm: DataFrame = load_cdiscpilot_dataset('DM')

# Keep the dataset's column names as a plain list for the comparisons below
published_columns = list(dm.columns)

In [None]:
# Show the column names captured from the loaded DM dataset; these are the
# names that get compared against the published variables further down
print(published_columns)

In [None]:
# now, to iterate over versions
from dotenv import load_dotenv
load_dotenv()
import os
from client import LibraryClient

# the API token is taken from the CDISC_LIBRARY_API_TOKEN environment variable
client = LibraryClient(os.getenv('CDISC_LIBRARY_API_TOKEN'))

# DM variable names keyed by version id, built up for the comparison step
variables_by_version = {}

sdtm_ig_versions = client.get_sdtmig_versions()
for ig_version in sdtm_ig_versions:
    # the version id is the final path segment of the link
    version_id = ig_version.get('href').rsplit("/", 1)[-1]
    if not version_id[0].isdigit():
        # ids that do not start with a digit belong to the
        # associated persons / devices IGs, so skip them
        continue
    # fetch the DM definition for this version and pull out the variable names
    dm_definition = client.get_ig_dataset_by_version(version_id, "DM")
    variable_names = [entry.get('name') for entry in dm_definition.get('datasetVariables')]
    if variable_names:
        # setdefault acts like an upsert; a key is only created when there is
        # at least one variable name to record
        variables_by_version.setdefault(version_id, []).extend(variable_names)



In [None]:
# Compare the variables published for each version with the dataset's columns
dataset_columns = set(published_columns)
for version, version_variables in variables_by_version.items():
    print(f"Checking {version}")
    expected = set(version_variables)
    if expected == dataset_columns:
        print(f"Version {version} is a candidate")
        continue
    print(f"Version {version} is not a candidate")
    # in the published version but absent from the dataset
    missing = expected - dataset_columns
    if missing:
        print(f"Variables missing from Dataset: {missing}")
    # in the dataset but absent from the published version
    unexpected = dataset_columns - expected
    if unexpected:
        print(f"Variables unexpected in Dataset: {unexpected}")

# The equivalent check against the SDTM model is left as an exercise for the reader.  There may even be a helper on the client!
        

So, that's a little weird!  The Define.xml shows that the version of SDTM is 3-1-2, but there are columns present in the dataset from a more recent version.  The missing BRTHDTC and INV-- columns could be explained as an artifact of the dataset anonymisation.

In [None]:
# The define told us 3-1-2, so look up each dataset column in that version
import pprint
dm_upstream = client.get_ig_dataset_by_version("3-1-2", "DM")

# index the published variable definitions by variable name
specifications = {
    dataset_variable.get('name'): dataset_variable
    for dataset_variable in dm_upstream.get('datasetVariables')
}

# one metadata dict per dataset column that has a published definition
coalesced = []
for column in published_columns:
    metadata = {}
    if column not in specifications:
        print(f"Column {column} not found")
        continue
    print(f"** {column}")
    for key, value in specifications[column].items():
        if key != '_links':
            # ordinary attributes are copied across unchanged
            metadata[key] = value
            print(f"{key}: {value}")
        elif 'codelist' in value:
            # the codelist is bound as a link (as it should be) rather than a
            # simple attribute; keep only the last path segment of its href
            codelist_id = value.get('codelist')[0].get('href').split('/')[-1]
            print(f"codelist: {codelist_id}")
            metadata["codelist"] = codelist_id
    coalesced.append(metadata)
    

In [None]:
# if you don't want to process the JSON you can convert it to a dataframe
from pandas import DataFrame

# flatten the list of per-column metadata dicts into one row per column
df: DataFrame = pd.json_normalize(coalesced)
# set_index returns a new DataFrame instead of modifying df in place, so the
# result must be assigned back for the index to appear in the exported HTML
df = df.set_index("name")
df.to_html('temp.html')


In [None]:
# Checking the dataset's values against a codelist

# AGEU is used as the example: find its definition among the 3-1-2 DM variables
ageu = [variable for variable in dm_upstream.get('datasetVariables') if variable.get('name') == 'AGEU'][0]

# client.get_link follows a link object, which saves building URLs by hand
age_codelist = client.get_link(ageu.get('_links').get('codelist')[0])
codelist_versions = age_codelist.get('_links').get('versions')
print(f"There are {len(codelist_versions)} versions of the AGEU codelist")

# the distinct AGEU values present in the dataset
set_values = dm["AGEU"].unique()

# follow the final entry in the versions list and collect its submission values
ageu_last = client.get_link(codelist_versions[-1])
submission_values = [term.get('submissionValue') for term in ageu_last.get('terms')]

# report every dataset value that the codelist does not contain
unmatched_values = [value for value in set_values if value not in submission_values]
for dataset_ageu_value in unmatched_values:
    print(f"Codelist value {dataset_ageu_value} not found")


In [None]:
# Same codelist check, driven by a variable name instead of a hard-coded one
VAR = "RACE"

# find the definition of VAR among the 3-1-2 DM variables
codedterm = [variable for variable in dm_upstream.get('datasetVariables') if variable.get('name') == VAR][0]

# client.get_link follows a link object, which saves building URLs by hand
codelist = client.get_link(codedterm.get('_links').get('codelist')[0])
codelist_versions = codelist.get('_links').get('versions')
print(f"There are {len(codelist_versions)} versions of the {VAR} codelist")

# the distinct values of VAR present in the dataset
set_values = dm[VAR].unique()

# follow the final entry in the versions list and collect its submission values
_last = client.get_link(codelist_versions[-1])
_values = [term.get('submissionValue') for term in _last.get('terms')]

# report every dataset value that the codelist does not contain
unmatched_values = [value for value in set_values if value not in _values]
for dataset_codelist_value in unmatched_values:
    print(f"Codelist value {dataset_codelist_value} not found")

# all of this can be rolled into a function

# What's next
[Wrap Up](./05-Wrap-it-up.ipynb)

