In [1]:
import json
from functools import reduce
from statistics import mean

In [2]:
# Map every repository to the subset of its steps whose name contains '.cwl'.
with open('repoToSteps.json') as repoToSteps:
    repoToStepsCWL = {
        repo: [step for step in steps if '.cwl' in step]
        for repo, steps in json.load(repoToSteps).items()
    }

In [3]:
# Load the phenotype groups mapping (a key per group, with a sized collection
# as its value) from disk.
with open('phenotypeGroups.json') as phenotypeGroupsFile:
    phenotypeGroups = json.load(phenotypeGroupsFile)

In [4]:
# Load the workflow intersections, a nested mapping whose outer keys are the
# same strings used as keys of the phenotype groups mapping.
with open('workflowIntersections.json') as workflowIntersectionsFile:
    workflowIntersections = json.load(workflowIntersectionsFile)

In [5]:
# One entry per repository in the step mapping.
print(f'\nTotal conditions: {len(repoToStepsCWL)}')


Total conditions: 1090


In [6]:
# One entry per group in the phenotype groups mapping.
print(f'\nUnique conditions: {len(phenotypeGroups)}')


Unique conditions: 135


In [7]:
# Find the group key with the most members. The reported size is the number of
# listed conditions plus one (presumably counting the lead condition itself).
# On ties the first group in file order wins.
largestLead = max(phenotypeGroups, key=lambda lead: len(phenotypeGroups[lead]))
largestConditionGroup = (largestLead, len(phenotypeGroups[largestLead]) + 1)
print(f'\nLargest condition group: {largestConditionGroup}')


Largest condition group: ('{"\\"name\\"": "Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16", "\\"about\\"": "Diabetes - PH8"}', 25)


In [8]:
# Each key of the largest group's intersection map is a JSON-encoded list of
# condition records; collect the distinct '"name"' of every record that
# appears in at least one key.
overlappingSiblings = {
    condition['"name"']
    for encodedConditions in workflowIntersections[largestConditionGroup[0]]
    for condition in json.loads(encodedConditions)
}
print(f'\nIntersecting siblings in largest condition group: {overlappingSiblings}')


Intersecting siblings in largest condition group: {'Type-2-Diabetes---dfc54970-1a27-11ef-9de4-4d4ea830ad16', 'Type-1-Diabetes---2d8a10e0-1c27-11ef-bdee-f10829e63eeb', 'Diabetes---e53eb430-1a37-11ef-9de4-4d4ea830ad16', 'Diabetes---cd970470-1ad4-11ef-9de4-4d4ea830ad16', 'Diabetes---c10cdf70-19eb-11ef-9de4-4d4ea830ad16', 'Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16', 'Type-1-Diabetes---bcced3f0-1bd5-11ef-bdee-f10829e63eeb', 'Diabetes---bfa45a20-3979-11ef-918f-350181f4a5db', 'Diabetes---96e3b550-1d1c-11ef-94c0-09c4aef33dd3', 'Diabetes---f3206ee0-19ec-11ef-9de4-4d4ea830ad16', 'Type-2-Diabetes---b30a8860-19bd-11ef-9de4-4d4ea830ad16', 'Diabetes---1d8a7370-1aaa-11ef-9de4-4d4ea830ad16', 'Diabetes---a1f20850-1779-11ef-9de4-4d4ea830ad16', 'Type-2-Diabetes---7fcbd1f0-1bd6-11ef-bdee-f10829e63eeb', 'Diabetes---4d9cec30-1d9f-11ef-94c0-09c4aef33dd3', 'Type-2-Diabetes---1c10b020-1c28-11ef-bdee-f10829e63eeb', 'Diabetes---5845e8e0-1d3a-11ef-94c0-09c4aef33dd3', 'Diabetes---87522990-184a-11ef-9de4-4d4

In [9]:
# Size of the set of distinct sibling names gathered above.
siblingCount = len(overlappingSiblings)
print(f'\nNumber of overlapping siblings in largest condition group: {siblingCount}')


Number of overlapping siblings in largest condition group: 22


In [10]:
# For every entry of the largest group's intersection map, decode the
# JSON-encoded conditions from the key and record how many items the value
# holds (reported below as the number of overlapping steps).
overlapNumbers = [
    {
        'conditions': json.loads(encodedConditions),
        'overlapping': len(intersectionSteps),
    }
    for encodedConditions, intersectionSteps in workflowIntersections[
        largestConditionGroup[0]
    ].items()
]


def _overlapCount(overlapEntry):
    """Return the overlap size stored in one overlapNumbers entry."""
    return overlapEntry['overlapping']


def _stepsWithShare(condition, overlapping):
    """Format '<steps> (<overlap as % of steps>%)' for one condition's workflow.

    The condition record is re-encoded with json.dumps to look up its list of
    CWL steps in repoToStepsCWL.
    """
    stepCount = len(repoToStepsCWL[json.dumps(condition)])
    return f'{stepCount} ({round(overlapping / stepCount * 100, 2)}%)'


mostIntersectionpingSteps = max(overlapNumbers, key=_overlapCount)
leastIntersectionpingSteps = min(overlapNumbers, key=_overlapCount)
averageIntersectionpingSteps = round(
    mean([_overlapCount(overlapEntry) for overlapEntry in overlapNumbers]), 2
)

# Report the largest overlap next to the size of the first two workflows
# involved and the share of each workflow that the overlap represents.
firstCondition = mostIntersectionpingSteps['conditions'][0]
secondCondition = mostIntersectionpingSteps['conditions'][1]
mostOverlapping = mostIntersectionpingSteps['overlapping']
print(
    '\nMost overlapping steps (+ number of steps (% of overlapping) in respective workflows): '
    + f'{mostIntersectionpingSteps} -- '
    + _stepsWithShare(firstCondition, mostOverlapping)
    + ' '
    + _stepsWithShare(secondCondition, mostOverlapping)
)


Most overlapping steps (+ number of steps (% of overlapping) in respective workflows): {'conditions': [{'"name"': 'Type-2-Diabetes---b30a8860-19bd-11ef-9de4-4d4ea830ad16', '"about"': 'Type 2 Diabetes - PH490'}, {'"name"': 'Diabetes---b8c00ec0-19ee-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH520'}], 'overlapping': 32} -- 111 (28.83%) 90 (35.56%)


In [11]:
# Entry with the smallest overlap in the largest condition group.
print(f'\nLeast overlapping steps: {leastIntersectionpingSteps}')


Least overlapping steps: {'conditions': [{'"name"': 'Diabetes---d88be170-16f1-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH8'}, {'"name"': 'Diabetes---c10cdf70-19eb-11ef-9de4-4d4ea830ad16', '"about"': 'Diabetes - PH518'}], 'overlapping': 1}


In [12]:
# Average overlap and average workflow length, both truncated to whole steps,
# plus the first as a percentage of the second.
# NOTE(review): the ratio is computed from the truncated integers, and the
# step average runs over every repository in repoToStepsCWL while the overlap
# average covers only the largest condition group - confirm this is intended.
averageOverlapSteps = int(averageIntersectionpingSteps)
averageWorkflowSteps = int(mean([len(steps) for steps in repoToStepsCWL.values()]))
overlapShare = round(averageOverlapSteps / averageWorkflowSteps * 100, 2)
print(
    '\nAverage overlapping steps (+ average number of steps (% of overlapping)): '
    + f'{averageOverlapSteps} ({averageWorkflowSteps} ({overlapShare}%))'
)


Average overlapping steps (+ average number of steps (% of overlapping)): 9 (29 (31.03%))
