# Validate Test Data

In [2]:
import json
import pandas as pd
from uuid import uuid4

# Read JSON file
def read_json_file(json_path):
    """Load and return the parsed contents of a JSON file.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    locale-dependent default, so non-ASCII text (accents, en dashes, ...)
    loads identically on every machine.

    Args:
        json_path: Path of the JSON file to read.

    Returns:
        The deserialized JSON value, or None when the file cannot be opened,
        decoded or parsed. In that case an error message is printed instead
        of an exception being raised.
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data
    except Exception as e:
        # Deliberately best-effort: any failure is reported and signalled to
        # the caller through the None return value.
        print(f"Error reading JSON file: {e}")
        return None

# Read CSV file
def read_csv_file(csv_path):
    """Read a CSV file into a pandas DataFrame.

    Args:
        csv_path: Path of the CSV file to load.

    Returns:
        The DataFrame produced by ``pd.read_csv``, or None if reading failed
        (the error is printed rather than raised).
    """
    try:
        frame = pd.read_csv(csv_path)
    except Exception as err:
        print(f"Error reading CSV file: {err}")
        return None
    else:
        return frame

# Compare JSON and CSV, find matching rows
def find_matching_rows(json_data, csv_df):
    """Recompute the CSV row numbers of every ground-truth event.

    For each event, the CSV rows whose 'Title' equals the event title and
    whose 'Date and Time' equals "<date> <time>" are looked up. Their 1-based
    row numbers replace the event's 'matching_csv_rows' in the returned data.
    Events whose recomputed rows differ (as sets) from the rows recorded in
    the JSON are also reported.

    The input ``json_data`` is left untouched: corrected events are shallow
    copies of the originals with only 'matching_csv_rows' replaced.

    Args:
        json_data: Iterable of dicts with a 'query' key and a 'ground_truth'
            list. Each ground-truth event needs the keys 'event', 'date',
            'time' and 'matching_csv_rows'.
        csv_df: DataFrame with 'Title' and 'Date and Time' columns.

    Returns:
        A ``(results, differences)`` tuple. ``results`` is a list of
        ``{'query', 'ground_truth'}`` dicts holding the corrected events.
        ``differences`` is a list of dicts with the keys 'query', 'event',
        'date_time', 'json_rows', 'csv_rows' and 'difference' (the sorted
        symmetric difference of the two row sets).

    Raises:
        KeyError: If a required key or column is missing.
    """
    results = []
    differences = []

    for query_item in json_data:
        query = query_item['query']
        corrected_events = []

        for event in query_item['ground_truth']:
            event_title = event['event']
            event_datetime = f"{event['date']} {event['time']}"
            expected_rows = event['matching_csv_rows']

            # Exact string match on both columns.
            is_match = (
                (csv_df['Title'] == event_title) &
                (csv_df['Date and Time'] == event_datetime)
            )

            # Use row positions rather than index labels so the numbering is
            # right even if the frame was filtered or re-indexed, then shift
            # to the 1-based numbering used in the JSON.
            positions = is_match.to_numpy().nonzero()[0].tolist()
            matching_rows = [position + 1 for position in positions]

            if set(matching_rows) != set(expected_rows):
                differences.append({
                    'query': query,
                    'event': event_title,
                    'date_time': event_datetime,
                    'json_rows': expected_rows,
                    'csv_rows': matching_rows,
                    # Sorted so the report is deterministic; set iteration
                    # order is arbitrary.
                    'difference': sorted(
                        set(expected_rows).symmetric_difference(matching_rows)
                    ),
                })

            # NOTE(review): an event with no CSV match gets an empty list
            # here, discarding the rows recorded in the JSON - confirm that
            # this is the intended "correction".
            corrected_events.append(
                {**event, 'matching_csv_rows': matching_rows}
            )

        results.append({
            'query': query,
            'ground_truth': corrected_events,
        })

    return results, differences

# Create DataFrame to highlight differences
def create_diff_dataframe(differences):
    """Turn the list of difference records into a display DataFrame.

    Args:
        differences: List of dicts carrying the keys 'query', 'event',
            'date_time', 'json_rows', 'csv_rows' and 'difference'.

    Returns:
        A DataFrame with one row per record and the columns 'Query', 'Event',
        'Date and Time', 'JSON Rows', 'CSV Rows' and 'Difference'.
    """
    # (output column, source key) pairs, in display order.
    column_sources = (
        ('Query', 'query'),
        ('Event', 'event'),
        ('Date and Time', 'date_time'),
        ('JSON Rows', 'json_rows'),
        ('CSV Rows', 'csv_rows'),
        ('Difference', 'difference'),
    )
    records = [
        {column: entry[key] for column, key in column_sources}
        for entry in differences
    ]
    return pd.DataFrame(records)

# Save corrected JSON
def save_corrected_json(data, output_path):
    """Write ``data`` to ``output_path`` as JSON indented by two spaces.

    Success is confirmed with a printed message. Any failure is printed
    instead of being raised.

    Args:
        data: JSON-serializable object to store.
        output_path: Destination file path; an existing file is overwritten.
    """
    try:
        with open(output_path, 'w') as out_file:
            json.dump(data, out_file, indent=2)
    except Exception as err:
        print(f"Error saving JSON file: {err}")
    else:
        print(f"Corrected JSON saved to {output_path}")

# Main execution
def main(json_path, csv_path, output_json_path):
    """Validate the JSON ground truth against the CSV and save a corrected copy.

    Reads both files, recomputes the matching CSV rows for every ground-truth
    event, prints a report of the differences, and writes the corrected JSON
    to ``output_json_path``.

    Args:
        json_path: Path of the JSON file with queries and ground truth.
        csv_path: Path of the CSV file the ground truth refers to.
        output_json_path: Path the corrected JSON is written to.

    Returns:
        The differences DataFrame, which is empty when JSON and CSV agree.
        None is returned if either input file could not be read.
    """
    # Read files
    json_data = read_json_file(json_path)
    csv_df = read_csv_file(csv_path)

    if json_data is None or csv_df is None:
        return None

    # Find matches and differences
    corrected_data, differences = find_matching_rows(json_data, csv_df)

    # Build the frame unconditionally so the return value is always bound,
    # including when there are no differences to report.
    diff_df = create_diff_dataframe(differences)
    if differences:
        print("\nDifferences found between JSON and CSV:")
        print(diff_df)
    else:
        print("\nNo differences found between JSON and CSV")

    # Save corrected JSON
    save_corrected_json(corrected_data, output_json_path)

    return diff_df

# Example usage in Colab
if __name__ == "__main__":
    # Colab locations: the files are expected under /content, e.g. after an
    # upload or when accessed via Google Drive.
    json_path = "/content/test_questions_and_answers.json"
    csv_path = "/content/events.csv"
    output_json_path = "/content/test_questions_and_answers_corrected.json"

    # Keep the result in `df` for inspection in the cells below.
    df = main(
        json_path=json_path,
        csv_path=csv_path,
        output_json_path=output_json_path,
    )


Differences found between JSON and CSV:
                                                 Query  \
0    What events are related to mental health aware...   
1    Is there a weekly meeting for Red Essex on the...   
2    When is the International Cafe for Mexican & S...   
3        What are some creative writing events in May?   
4        What are some creative writing events in May?   
..                                                 ...   
129  What event is focused on IT and arts & crafts ...   
130    What event features a popular quiz with prizes?   
131    What event features a popular quiz with prizes?   
132    What event features a popular quiz with prizes?   
133    What event features a popular quiz with prizes?   

                                            Event  \
0    Mental Health Awareness Week - Sip and Paint   
1                        Red Essex weekly meeting   
2     International Cafe: Mexican & Spanish Night   
3                      Community Creative Writing 

In [5]:
# Display the differences DataFrame assigned from main() above
df

Unnamed: 0,Query,Event,Date and Time,JSON Rows,CSV Rows,Difference
0,What events are related to mental health aware...,Mental Health Awareness Week - Sip and Paint,12th May 6pm - 8:30pm,[2],[],[2]
1,Is there a weekly meeting for Red Essex on the...,Red Essex weekly meeting,19th May 6pm - 8pm,[36],[47],"[36, 47]"
2,When is the International Cafe for Mexican & S...,International Cafe: Mexican & Spanish Night,15th May 7:30pm - 9pm,[25],[27],"[25, 27]"
3,What are some creative writing events in May?,Community Creative Writing,21st May 12:30pm - 11th June 2pm,[44],[54],"[44, 54]"
4,What are some creative writing events in May?,Community Creative Writing,28th May 12:30pm - 18th June 2pm,[56],[79],"[56, 79]"
...,...,...,...,...,...,...
129,What event is focused on IT and arts & crafts ...,Community Learning Project,24th May 10:15am - 2pm,[56],[69],"[56, 69]"
130,What event features a popular quiz with prizes?,Rollover Quiz,15th May 7pm - 9pm,[23],[26],"[26, 23]"
131,What event features a popular quiz with prizes?,Rollover Quiz,22nd May 7pm - 9pm,[49],[62],"[49, 62]"
132,What event features a popular quiz with prizes?,Rollover Quiz,29th May 7pm - 9pm,[71],[85],"[85, 71]"


In [6]:
# Keep only the differences for which no CSV row matched at all,
# i.e. whose 'CSV Rows' entry is an empty list
empty_csv_rows = df[
    df['CSV Rows'].apply(lambda matched_rows: matched_rows == [])
]

# Show the selection as the cell output
empty_csv_rows

Unnamed: 0,Query,Event,Date and Time,JSON Rows,CSV Rows,Difference
0,What events are related to mental health aware...,Mental Health Awareness Week - Sip and Paint,12th May 6pm - 8:30pm,[2],[],[2]
5,What events are there for students who want to...,Mental Health Awareness Week - Sip and Paint,12th May 6pm - 8:30pm,[1],[],[1]
88,What events are focused on mental health or we...,Mental Health Awareness Week - Sip and Paint,12th May 6pm - 8:30pm,[1],[],[1]
112,What is the next major event related to sports?,Champions League Final - Football Team Takeover,31st May - 1st June 8pm - 10pm,[99],[],[99]
119,Is there a talent show or performance event in...,Launch Festival 2025,13th May - 16th May noon - 6pm,[6],[],[6]
