In [1]:
import json
from functools import reduce
from statistics import mean

In [2]:
with open('repoToSteps.json') as repoToSteps:
    repoToStepsCWL = {
        repo: list(filter(lambda step: '.cwl' in step, steps))
        for repo, steps in json.loads(repoToSteps.read()).items()
    }

In [3]:
with open('phenotypeGroups.json') as phenotypeGroups:
    phenotypeGroups = json.loads(phenotypeGroups.read())

In [4]:
with open('intersections.json') as intersections:
    intersections = json.loads(intersections.read())

In [5]:
print('\nTotal definitions: ' + str(len(repoToStepsCWL.keys())))


Total definitions: 1171


In [6]:
conditionGroups = phenotypeGroups.keys()
print('\nCondition groups: ' + str(len(conditionGroups)))
groupedConditions = list(phenotypeGroups.keys()) + [
    json.dumps(item) for sublist in phenotypeGroups.values() for item in sublist
]
print('\nGrouped conditions: ' + str(len(groupedConditions)))
ungroupedConditions = [
    repo for repo in repoToStepsCWL.keys() if repo not in groupedConditions
]
print('\nUngrouped conditions: ' + str(len(ungroupedConditions)))
print('\nUnique conditions: ' + str(len(conditionGroups) + len(ungroupedConditions)))


Condition groups: 180

Grouped conditions: 706

Ungrouped conditions: 542

Unique conditions: 722


In [7]:
print(
    '\nProportion of conditions grouped: '
    + str(round(len(groupedConditions) / len(repoToStepsCWL.keys()) * 100, 2))
    + '%'
)
print(
    '\nDefinitions per condition: '
    + str(
        len(repoToStepsCWL.keys()) / (len(conditionGroups) + len(ungroupedConditions))
    )
)


Proportion of conditions grouped: 60.29%

Definitions per condition: 1.6218836565096952


In [8]:
largestConditionGroup = max(
    [
        (leadCondition, len(otherConditions) + 1)
        for leadCondition, otherConditions in phenotypeGroups.items()
    ],
    key=lambda conditionGroupEntry: conditionGroupEntry[1],
)
print('\nLargest condition group: ' + str(largestConditionGroup))
print(phenotypeGroups[largestConditionGroup[0]])


Largest condition group: ('{"\\"name\\"": "Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16", "\\"about\\"": "Diabetes - PH8"}', 45)
[{'"name"': 'Diabetes---4ce31400-16fa-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH24'}, {'"name"': 'Diabetes---a1f20850-1779-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH152'}, {'"name"': 'Diabetes-Medication---3541e3f0-1847-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes Medication - PH374'}, {'"name"': 'Diabetes---87522990-184a-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH375'}, {'"name"': 'Diabetes---c10cdf70-19eb-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH518'}, {'"name"': 'Diabetes---f3206ee0-19ec-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH519'}, {'"name"': 'Diabetes---b8c00ec0-19ee-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH520'}, {'"name"': 'Diabetes---e53eb430-1a37-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH580'}, {'"name"': 'Diabetes---1d8a7370-1aaa-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH618'},

In [9]:
overlappingConditions = set(
    map(
        lambda repo: repo['"name"'],
        reduce(
            lambda allConditionsWithIntersection, conditionsWithIntersection: allConditionsWithIntersection
            + json.loads(conditionsWithIntersection),
            list(intersections[largestConditionGroup[0]].keys()),
            [],
        ),
    )
)
print(
    '\nIntersecting conditions in largest condition group: '
    + str(overlappingConditions)
)
print(
    '\nNumber of intersection conditions in largest condition group: '
    + str(len(overlappingConditions))
)


Intersecting conditions in largest condition group: {'Diabetes---bfa45a20-3979-11ef-918f-350181f4a5db', 'Insulin---29ff8250-1d1e-11ef-94c0-09c4aef33dd3', 'Glp-1Receptoragonists---20f140a0-1c14-11ef-bdee-f10829e63eeb', 'Anti-Diabetics---89b99330-1d59-11ef-94c0-09c4aef33dd3', 'Diabetes---cd970470-1ad4-11ef-9de4-4d4ea830ad16', 'Diabetes-Elixhauser-primary-care---398956f0-1e00-11ef-94c0-09c4aef33dd3', 'Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16', 'Metformin---6da8b740-1cd5-11ef-bdee-f10829e63eeb', 'Diabetes---4ce31400-16fa-11ef-9de4-4d4ea830ad16', 'Antidiabetic-Medications---c8208cc0-1bbd-11ef-bdee-f10829e63eeb', 'Sglt-2Inhibitors---922c6c80-1c24-11ef-bdee-f10829e63eeb', 'Diabetes---96e3b550-1d1c-11ef-94c0-09c4aef33dd3', 'Insulin---04bad9e0-1c15-11ef-bdee-f10829e63eeb', 'Metformin---3d721d70-1a1c-11ef-9de4-4d4ea830ad16', 'Thiazolidinediones---b8c49280-1c26-11ef-bdee-f10829e63eeb', 'Sulfonylureas---76fa76a0-1a26-11ef-9de4-4d4ea830ad16', 'Diabetes-Charlson-primary-care---c4782000-1df3-

In [10]:
from util import filterFirstOccurences

overlapNumbers = list(
    map(
        lambda conditionsToIntersectionSteps: {
            'conditions': json.loads(conditionsToIntersectionSteps[0]),
            'overlapping': len(filterFirstOccurences(conditionsToIntersectionSteps[1])),
        },
        intersections[largestConditionGroup[0]].items(),
    )
)

In [11]:
mostIntersectingSteps = max(
    overlapNumbers,
    key=lambda overlapNumbersEntry: overlapNumbersEntry['overlapping'],
)

In [12]:
print(
    '\nDefinition pair with most intersecting steps in largest condition group + number of overlapping (+ number of steps (% of overlapping) in respective workflows): '
    + str(mostIntersectingSteps)
    + ' -- '
    + str(len(repoToStepsCWL[json.dumps(mostIntersectingSteps['conditions'][0])]))
    + ' ('
    + str(
        round(
            mostIntersectingSteps['overlapping']
            / len(repoToStepsCWL[json.dumps(mostIntersectingSteps['conditions'][0])])
            * 100,
            2,
        )
    )
    + '%) '
    + str(len(repoToStepsCWL[json.dumps(mostIntersectingSteps['conditions'][1])]))
    + ' ('
    + str(
        round(
            mostIntersectingSteps['overlapping']
            / len(repoToStepsCWL[json.dumps(mostIntersectingSteps['conditions'][1])])
            * 100,
            2,
        )
    )
    + '%)'
)


Definition pair with most intersecting steps in largest condition group + number of overlapping (+ number of steps (% of overlapping) in respective workflows): {'conditions': [{'"name"': 'CCU000-Diabetes---01d2fd40-1db7-11ef-94c0-09c4aef33dd3', '"about"': 'CCU000 Diabetes - PH945'}, {'"name"': 'CCU002_01-Diabetes-and-diabates-medication---a80de070-2204-11ef-ba9f-3d1e4076db47', '"about"': 'CCU002_01 Diabetes and diabates medication - PH965'}], 'overlapping': 86} -- 95 (90.53%) 95 (90.53%)


In [13]:
print('\nOverlapping steps:')
for line in intersections[largestConditionGroup[0]][
    json.dumps(mostIntersectingSteps['conditions'])
]:
    print(line)


Overlapping steps:
['retinal-ccu000-diabetes---secondary.cwl', 'renal-ccu002_01-diabetes-and-diabates-medication---primary.cwl']
['neurological-ccu000-diabetes---secondary.cwl', 'neurologic-ccu002_01-diabetes-and-diabates-medication---primary.cwl']
['ccu000-diabetes-leprechaunism---secondary.cwl', 'ccu002_01-diabetes-and-diabates-medication-leprechaunism---primary.cwl']
['stable-ccu000-diabetes---secondary.cwl', 'stable-ccu002_01-diabetes-and-diabates-medication---primary.cwl']
['ccu000-diabetes-vessel---secondary.cwl', 'ccu002_01-diabetes-and-diabates-medication-vessel---primary.cwl']
['ccu000-diabetes-education---secondary.cwl', 'ccu002_01-diabetes-and-diabates-medication-education---primary.cwl']
['neuropathic-ccu000-diabetes---secondary.cwl', 'neuropathic-ccu002_01-diabetes-and-diabates-medication---primary.cwl']
['peripheral-ccu000-diabetes---secondary.cwl', 'peripheral-ccu002_01-diabetes-and-diabates-medication---primary.cwl']
['ccu000-diabetes-treated---secondary.cwl', 'ccu002_

In [14]:
leastIntersectingSteps = min(
    overlapNumbers,
    key=lambda overlapNumbersEntry: overlapNumbersEntry['overlapping'],
)

In [15]:
print('\nLeast intersecting steps: ' + str(leastIntersectingSteps))


Least intersecting steps: {'conditions': [{'"name"': 'Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH8'}, {'"name"': 'Diabetes---a1f20850-1779-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH152'}], 'overlapping': 1}


In [16]:
averageIntersectingSteps = round(
    mean(list(map(lambda overlap: overlap['overlapping'], overlapNumbers))), 2
)

In [17]:
stepLengthsInLargestGroup = [
    (
        len(repoToStepsCWL[largestGroupPhenotype])
        if isinstance(largestGroupPhenotype, str)
        else len(repoToStepsCWL[json.dumps(largestGroupPhenotype)])
    )
    for largestGroupPhenotype in (
        [largestConditionGroup[0]] + phenotypeGroups[largestConditionGroup[0]]
    )
]
print('\nStep lengths in largest group: ' + str(sorted(stepLengthsInLargestGroup)))
print(
    '\nAverage intersecting steps (+ average number of steps (% of overlapping)): '
    + str(int(averageIntersectingSteps))
    + ' ('
    + str(int(mean(stepLengthsInLargestGroup)))
    + ' ('
    + str(
        round(
            int(averageIntersectingSteps) / int(mean(stepLengthsInLargestGroup)) * 100,
            2,
        )
    )
    + '%)'
    + ')'
)


Step lengths in largest group: [1, 2, 2, 3, 3, 4, 4, 4, 6, 8, 11, 15, 19, 20, 22, 25, 28, 28, 29, 30, 31, 32, 32, 33, 38, 41, 41, 41, 42, 45, 57, 62, 68, 75, 79, 81, 86, 86, 89, 90, 95, 95, 95, 95, 124]

Average intersecting steps (+ average number of steps (% of overlapping)): 7 (42 (16.67%))
