# Data Cleaning and Exploration

In [10]:
import ast

import numpy as np
import pandas as pd

### Load Dataset

In [11]:
def safe_parse_all_columns(
    df,
    columns=('notes', 'chords', 'velocities', 'durations', 'offsets', 'ordered_events'),
):
    """
    Turn stringified Python literals in the given columns back into objects.

    Every cell that is a ``str`` is passed through ``ast.literal_eval``;
    cells of any other type (already-parsed lists, NaN, ...) are kept as
    they are. "Safe" refers to ``ast.literal_eval``, which only evaluates
    literal syntax and never executes arbitrary code.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to parse. It is modified in place.
    columns : iterable of str, optional
        Names of the columns to parse, processed in the given order.
        Defaults to the six event columns used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the selected
        columns replaced by their parsed versions.

    Raises
    ------
    KeyError
        If one of ``columns`` is not present in ``df``.
    ValueError, SyntaxError
        If a string cell is not a valid Python literal.
    """
    def _parse_cell(cell):
        # Only strings need decoding; anything else is passed through untouched.
        return ast.literal_eval(cell) if isinstance(cell, str) else cell

    for column in columns:
        df[column] = df[column].apply(_parse_cell)
    return df

def load_dataframe_from_two_csvs(file1, file2):
    """
    Read two CSV files, stack them, and parse their literal-valued columns.

    The rows of ``file1`` come first, followed by the rows of ``file2``;
    the combined frame gets a fresh 0..n-1 index and is then handed to
    ``safe_parse_all_columns`` before being returned.
    """
    parts = [pd.read_csv(path) for path in (file1, file2)]
    combined = pd.concat(parts, ignore_index=True)
    return safe_parse_all_columns(combined)

def save_dataframe_to_two_csvs(df, file1, file2):
    """
    Write the first half of ``df`` to ``file1`` and the remainder to ``file2``.

    The split point is ``len(df) // 2``, so with an odd number of rows the
    second file receives the extra row. The index is not written to either file.
    """
    split_at = len(df) // 2
    first_part, second_part = df.iloc[:split_at], df.iloc[split_at:]
    first_part.to_csv(file1, index=False)
    second_part.to_csv(file2, index=False)

In [12]:
# The processed dataset is stored as two CSV parts under this directory.
root = 'data_processed/'
file1, file2 = (root + part for part in ('data_part1.csv', 'data_part2.csv'))

# Read both parts back into one DataFrame.
df = load_dataframe_from_two_csvs(file1, file2)

## Data cleaning

We first check that the per-row sequences are consistent with each other. For every row:
- The number of notes plus the number of chords should equal the number of durations, and the durations, velocities, offsets and ordered events should all have the same length.

In [13]:
def check_dataset_consistency(df):
    """
    Check that the sequence columns of every row have matching lengths.

    A row passes when ``len(notes) + len(chords) == len(durations)`` and
    ``durations``, ``velocities``, ``offsets`` and ``ordered_events`` all
    have the same length. If reading or measuring a row raises, the error
    is printed and the row is counted as failed.

    A pass/fail summary is printed, followed by the failing positions when
    there are any.

    Returns
    -------
    list of int
        Positional (``iloc``) indices of the rows that failed.
    """
    failed_rows = []

    for pos in range(len(df)):
        record = df.iloc[pos]
        try:
            # Fetch all six fields first so that a missing column surfaces
            # before any length is taken.
            note_seq, chord_seq, duration_seq, velocity_seq, offset_seq, event_seq = (
                record[col]
                for col in ('notes', 'chords', 'durations', 'velocities', 'offsets', 'ordered_events')
            )

            counts_add_up = len(note_seq) + len(chord_seq) == len(duration_seq)
            lengths_agree = len(duration_seq) == len(velocity_seq) == len(offset_seq) == len(event_seq)
            consistent = counts_add_up and lengths_agree
        except Exception as err:
            print(f"Error parsing row {pos}: {err}")
            consistent = False

        if not consistent:
            failed_rows.append(pos)

    total = len(df)
    failed = len(failed_rows)
    passed = total - failed

    print(f"\n Passed: {passed}/{total}")
    print(f" Failed: {failed}/{total}")
    if failed > 0:
        print(f"Indices of failed rows: {failed_rows}")

    return failed_rows

# Run the consistency check on the loaded dataset; the returned list of
# failing row positions is shown as the cell output.
check_dataset_consistency(df)


 Passed: 2775/2775
 Failed: 0/2775


[]