# Data preprocessing

This script makes sure all pipelines in the database have the same boundary conditions:

1. They have to start with "Software", so if this is missing we add it
2. They have to end with "Graph_measures". If "Graph_measures" is followed by a final "Result_aggregation" step, we remove the aggregation step.
3. After applying the first two steps, only 10 out of 220 pipelines have different ending points. For now we exclude them, but we should check them manually later.

In [None]:
import os
import pandas as pd

wd = os.getcwd()
pipeline_df = pd.read_excel(f'{wd}/data/Database.xlsx', sheet_name='Coding_steps')

# Turn every spreadsheet row into the list of its non-empty cells.
# iterrows() is used instead of DataFrame.apply(axis=1) so that an empty
# sheet simply yields an empty list instead of failing.
rows = [row.dropna().tolist() for _, row in pipeline_df.iterrows()]

# Skip rows without any entry (e.g. blank lines in the sheet): they have no
# first cell to split off and would raise an IndexError below.
rows = [row for row in rows if row]

# The first remaining cell of a row is taken as the author label, everything
# after it as the pipeline steps.
# NOTE(review): because empty cells are dropped first, a row whose author cell
# is empty would have its first step treated as the author - confirm the sheet
# always has the author filled in.
authors = [row[0] for row in rows]
pipelines = [row[1:] for row in rows]

# Ensure every pipeline starts with "Software"; a pipeline without any step
# becomes just ["Software"]. Each pipeline is copied so that the clean-up
# below does not modify the raw lists in `pipelines`.
modified_pipelines = [
    list(pipeline) if pipeline and pipeline[0] == "Software" else ["Software"] + pipeline
    for pipeline in pipelines
]

# Remove a trailing "Result_aggregation" if the step before it is "Graph_measures"
for pipeline in modified_pipelines:
    if pipeline[-2:] == ["Graph_measures", "Result_aggregation"]:
        pipeline.pop()


Save the cleaned data for use in the main script:

In [2]:
# Keep only the pipelines that start with "Software" and end with
# "Graph_measures". Authors and pipelines are filtered together in a single
# pass so the two resulting lists cannot get out of step.
matching = [
    (author, pipeline)
    for author, pipeline in zip(authors, modified_pipelines)
    if pipeline and pipeline[0] == "Software" and pipeline[-1] == "Graph_measures"
]
matching_authors = [author for author, _ in matching]
matching_pipelines = [pipeline for _, pipeline in matching]

# Create a DataFrame with each pipeline step as a separate column; shorter
# pipelines are padded with None on the right. `default=0` keeps this working
# when no pipeline matches (max() of an empty sequence would raise ValueError).
max_steps = max((len(pipeline) for pipeline in matching_pipelines), default=0)
matching_pipelines_df = pd.DataFrame(
    [pipeline + [None] * (max_steps - len(pipeline)) for pipeline in matching_pipelines]
)
matching_pipelines_df.insert(0, 'Author', matching_authors)

# Write next to the input file. The input is read from '<cwd>/data/', so the
# same lower-case directory is used here; the previous 'Data/' spelling only
# resolved on case-insensitive filesystems.
matching_pipelines_df.to_excel(f'{wd}/data/Database_clean.xlsx', index=False)