In [1]:
import pandas as pd
import json
import os
import re 

In [2]:
# Directory the wikiHow*.json input files are read from (relative to the
# notebook's working directory).
data_dir = "taskjson"
# Directory the generated wikiHow*.csv files are written to.
output_dir = "taskcsv"
# Create the output directory up front; exist_ok=True keeps this cell
# re-runnable when the directory already exists.
os.makedirs(output_dir, exist_ok=True)

In [3]:
def extract_headlines(data):
    """Collect every value stored under a 'headline' key in nested JSON data.

    Lists and dicts are walked depth-first in document order. A dict key
    matches when it equals 'headline' case-insensitively; the matching value
    is collected as-is (whatever its type) and is not searched any further.
    Anything that is neither a list nor a dict contributes nothing.

    Args:
        data: Arbitrarily nested combination of lists, dicts and scalars.

    Returns:
        A list of the matching values, in traversal order.
    """
    def _walk(node):
        if isinstance(node, list):
            for element in node:
                yield from _walk(element)
        elif isinstance(node, dict):
            for name, child in node.items():
                if name.lower() != 'headline':
                    # Not a headline itself: keep searching below this key.
                    yield from _walk(child)
                else:
                    yield child

    return list(_walk(data))

In [4]:
# Number of input files: wikiHow0.json ... wikiHow26.json.
NUM_FILES = 27


def _entry_to_row(entry):
    """Build one ``[Input, Output]`` CSV row from a single JSON entry.

    Input is the entry's 'MainTask' with every character other than ASCII
    letters, digits and whitespace removed; Output is all headlines found
    anywhere in the entry, joined with ', '.

    A missing or null 'MainTask' (or an entry that is not a dict) falls back
    to 'N/A', which the sanitising regex then reduces to 'NA' -- the same
    value the original code produced for a missing key.
    """
    main_task = entry.get('MainTask') if isinstance(entry, dict) else None
    if main_task is None:
        main_task = 'N/A'
    # str() so a numeric task title cannot make re.sub raise TypeError.
    main_task = re.sub(r'[^A-Za-z0-9\s]', '', str(main_task))  # Remove special characters

    # Headline values come straight from the JSON and may be null or
    # non-string; skip nulls and stringify the rest so join() cannot fail.
    headlines = extract_headlines(entry)
    combined_headlines = ', '.join(str(h) for h in headlines if h is not None)

    return [main_task, combined_headlines]


# Convert each wikiHow<i>.json in data_dir into a two-column CSV in output_dir.
# A missing, unreadable or unwritable file is reported and skipped so that the
# remaining files are still processed.
for i in range(NUM_FILES):
    json_filename = f"wikiHow{i}.json"
    json_file_path = os.path.join(data_dir, json_filename)

    if not os.path.exists(json_file_path):
        print(f"File not found: {json_file_path}")
        continue

    try:
        with open(json_file_path, 'r', encoding='utf-8') as file:
            json_data = json.load(file)
        print(f"Loaded data from {json_file_path}")
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading {json_file_path}: {e}")
        continue

    # A file holding a single object is treated as one entry instead of
    # iterating over that object's keys.
    entries = json_data if isinstance(json_data, list) else [json_data]
    rows = [_entry_to_row(entry) for entry in entries]

    if not rows:
        print(f"No headlines found for {json_filename}, skipping CSV creation.")
        continue

    # Build the frame directly with its final column names.
    df = pd.DataFrame(rows, columns=['Input', 'Output'])

    csv_filename = f"wikiHow{i}.csv"
    csv_file_path = os.path.join(output_dir, csv_filename)

    try:
        df.to_csv(csv_file_path, index=False)
        print(f"Saved {csv_file_path.replace(os.sep, '/')}")
    except (OSError, ValueError) as e:
        print(f"Error saving {csv_file_path}: {e}")

Loaded data from taskjson\wikiHow0.json
Saved taskcsv/wikiHow0.csv
Loaded data from taskjson\wikiHow1.json
Saved taskcsv/wikiHow1.csv
Loaded data from taskjson\wikiHow2.json
Saved taskcsv/wikiHow2.csv
Loaded data from taskjson\wikiHow3.json
Saved taskcsv/wikiHow3.csv
Loaded data from taskjson\wikiHow4.json
Saved taskcsv/wikiHow4.csv
Loaded data from taskjson\wikiHow5.json
Saved taskcsv/wikiHow5.csv
Loaded data from taskjson\wikiHow6.json
Saved taskcsv/wikiHow6.csv
Loaded data from taskjson\wikiHow7.json
Saved taskcsv/wikiHow7.csv
Loaded data from taskjson\wikiHow8.json
Saved taskcsv/wikiHow8.csv
Loaded data from taskjson\wikiHow9.json
Saved taskcsv/wikiHow9.csv
Loaded data from taskjson\wikiHow10.json
Saved taskcsv/wikiHow10.csv
Loaded data from taskjson\wikiHow11.json
Saved taskcsv/wikiHow11.csv
Loaded data from taskjson\wikiHow12.json
Saved taskcsv/wikiHow12.csv
Loaded data from taskjson\wikiHow13.json
Saved taskcsv/wikiHow13.csv
Loaded data from taskjson\wikiHow14.json
Saved taskcsv