In [155]:
import json
from functools import reduce
from statistics import mean

In [156]:
with open('repoToSteps.json') as repoToSteps:
    repoToStepsCWL = {
        repo: list(filter(lambda step: '.cwl' in step, steps))
        for repo, steps in json.loads(repoToSteps.read()).items()
    }

In [157]:
with open('phenotypeGroups.json') as phenotypeGroups:
    phenotypeGroups = json.loads(phenotypeGroups.read())

In [158]:
with open('intersections.json') as intersections:
    intersections = json.loads(intersections.read())

In [159]:
print('\nTotal definitions: ' + str(len(repoToStepsCWL.keys())))


Total definitions: 1171


In [160]:
conditionGroups = phenotypeGroups.keys()
print('\nCondition groups: ' + str(len(conditionGroups)))
groupedConditions = list(phenotypeGroups.keys()) + [
    json.dumps(item) for sublist in phenotypeGroups.values() for item in sublist
]
print('\nGrouped conditions: ' + str(len(groupedConditions)))
ungroupedConditions = [
    repo for repo in repoToStepsCWL.keys() if repo not in groupedConditions
]
print('\nUngrouped conditions: ' + str(len(ungroupedConditions)))
print('\nUnique conditions: ' + str(len(conditionGroups) + len(ungroupedConditions)))


Condition groups: 184

Grouped conditions: 702

Ungrouped conditions: 543

Unique conditions: 727


In [161]:
print(
    '\nProportion of conditions grouped: '
    + str(round(len(groupedConditions) / len(repoToStepsCWL.keys()) * 100, 2))
    + '%'
)
print(
    '\nDefinitions per condition: '
    + str(
        len(repoToStepsCWL.keys()) / (len(conditionGroups) + len(ungroupedConditions))
    )
)


Proportion of conditions grouped: 59.95%

Definitions per condition: 1.610729023383769


In [162]:
largestConditionGroup = max(
    [
        (leadCondition, len(otherConditions) + 1)
        for leadCondition, otherConditions in phenotypeGroups.items()
    ],
    key=lambda conditionGroupEntry: conditionGroupEntry[1],
)
print('\nLargest condition group: ' + str(largestConditionGroup))
print(phenotypeGroups[largestConditionGroup[0]])


Largest condition group: ('{"\\"name\\"": "Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16", "\\"about\\"": "Diabetes - PH8"}', 27)
[{'"name"': 'Diabetes---4ce31400-16fa-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH24'}, {'"name"': 'Diabetes---a1f20850-1779-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH152'}, {'"name"': 'Diabetes-mellitus-of-various-forms---a6b97c60-1832-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes mellitus, of various forms - PH349'}, {'"name"': 'Diabetes-Medication---3541e3f0-1847-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes Medication - PH374'}, {'"name"': 'Diabetes---87522990-184a-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH375'}, {'"name"': 'Diabetes---c10cdf70-19eb-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH518'}, {'"name"': 'Diabetes---f3206ee0-19ec-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH519'}, {'"name"': 'Diabetes---b8c00ec0-19ee-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH520'}, {'"name"': 'Diabetes-Mellitus-Type-II---

In [163]:
overlappingConditions = set(
    map(
        lambda repo: repo['"name"'],
        reduce(
            lambda allConditionsWithIntersection, conditionsWithIntersection: allConditionsWithIntersection
            + json.loads(conditionsWithIntersection),
            list(intersections[largestConditionGroup[0]].keys()),
            [],
        ),
    )
)
print(
    '\nIntersecting conditions in largest condition group: '
    + str(overlappingConditions)
)
print(
    '\nNumber of intersection conditions in largest condition group: '
    + str(len(overlappingConditions))
)


Intersecting conditions in largest condition group: {'Diabetes---e53eb430-1a37-11ef-9de4-4d4ea830ad16', 'Diabetes---4d9cec30-1d9f-11ef-94c0-09c4aef33dd3', 'Diabetes---74eb7c70-1d4a-11ef-94c0-09c4aef33dd3', 'Diabetes-With-Complications---a1ac26d0-1d39-11ef-94c0-09c4aef33dd3', 'Diabetes---cd970470-1ad4-11ef-9de4-4d4ea830ad16', 'Diabetes-with-end-organ-damage-Elixhauser-primary-care---363d7560-1e02-11ef-94c0-09c4aef33dd3', 'Diabetes-Medication---3541e3f0-1847-11ef-9de4-4d4ea830ad16', 'Diabetes---87522990-184a-11ef-9de4-4d4ea830ad16', 'Diabetes---94b07310-1d47-11ef-94c0-09c4aef33dd3', 'Diabetes-mellitus-of-various-forms---a6b97c60-1832-11ef-9de4-4d4ea830ad16', 'Diabetes-Mellitus-Type-II---55276580-19f2-11ef-9de4-4d4ea830ad16', 'Diabetes---96e3b550-1d1c-11ef-94c0-09c4aef33dd3', 'Diabetes---b8c00ec0-19ee-11ef-9de4-4d4ea830ad16', 'Diabetes---5845e8e0-1d3a-11ef-94c0-09c4aef33dd3', 'Diabetes-Mellitus---872f75f0-20c7-11ef-ba9f-3d1e4076db47', 'Diabetes---bfa45a20-3979-11ef-918f-350181f4a5db', 'D

In [164]:
overlapNumbers = list(
    map(
        lambda conditionsToIntersectionSteps: {
            'conditions': json.loads(conditionsToIntersectionSteps[0]),
            'overlapping': len(conditionsToIntersectionSteps[1]),
        },
        intersections[largestConditionGroup[0]].items(),
    )
)
mostIntersectionpingSteps = max(
    overlapNumbers,
    key=lambda overlapNumbersEntry: overlapNumbersEntry['overlapping'],
)
leastIntersectionpingSteps = min(
    overlapNumbers,
    key=lambda overlapNumbersEntry: overlapNumbersEntry['overlapping'],
)
averageIntersectionpingSteps = round(
    mean(list(map(lambda overlap: overlap['overlapping'], overlapNumbers))), 2
)
print(
    '\nMost overlapping steps (+ number of steps (% of overlapping) in respective workflows): '
    + str(mostIntersectionpingSteps)
    + ' -- '
    + str(len(repoToStepsCWL[json.dumps(mostIntersectionpingSteps['conditions'][0])]))
    + ' ('
    + str(
        round(
            mostIntersectionpingSteps['overlapping']
            / len(
                repoToStepsCWL[json.dumps(mostIntersectionpingSteps['conditions'][0])]
            )
            * 100,
            2,
        )
    )
    + '%) '
    + str(len(repoToStepsCWL[json.dumps(mostIntersectionpingSteps['conditions'][1])]))
    + ' ('
    + str(
        round(
            mostIntersectionpingSteps['overlapping']
            / len(
                repoToStepsCWL[json.dumps(mostIntersectionpingSteps['conditions'][1])]
            )
            * 100,
            2,
        )
    )
    + '%)'
)


Most overlapping steps (+ number of steps (% of overlapping) in respective workflows): {'conditions': [{'"name"': 'Diabetes---b8c00ec0-19ee-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH520'}, {'"name"': 'Diabetes-Mellitus---872f75f0-20c7-11ef-ba9f-3d1e4076db47', '"about"': 'Diabetes Mellitus - PH419'}], 'overlapping': 36} -- 90 (40.0%) 89 (40.45%)


In [165]:
print('\nLeast overlapping steps: ' + str(leastIntersectionpingSteps))


Least overlapping steps: {'conditions': [{'"name"': 'Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH8'}, {'"name"': 'Diabetes---c10cdf70-19eb-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH518'}], 'overlapping': 1}


In [166]:
print(
    '\nAverage overlapping steps (+ average number of steps (% of overlapping)): '
    + str(int(averageIntersectionpingSteps))
    + ' ('
    + str(int(mean([len(values) for values in repoToStepsCWL.values()])))
    + ' ('
    + str(
        round(
            int(averageIntersectionpingSteps)
            / int(mean([len(values) for values in repoToStepsCWL.values()]))
            * 100,
            2,
        )
    )
    + '%)'
    + ')'
)


Average overlapping steps (+ average number of steps (% of overlapping)): 9 (28 (32.14%))
