In [7]:
import json
import os
import pandas as pd

# Convert the idealista Parquet exports into a single JSON file.
#
# Every subfolder of `folder_path` that contains exactly one '.parquet' file
# becomes one document of the form {"_id": <id>, "value": [<row dicts>]}.
# All documents are collected in `documents` and written once, at the end,
# to '<output_folder>/idealista.json'.

# Set the path to the 'idealista' folder
folder_path = 'P2_data/idealista/'

# Set the path to the output folder
output_folder = 'P2_data/idealista_json'

# Define the output JSON file path up front so it is always bound, no matter
# which branch the loop below takes.
json_file = os.path.join(output_folder, 'idealista.json')

# Get a list of subfolders inside the 'idealista' folder.
# os.listdir() returns entries in arbitrary order, so sort them to make the
# order of the documents in the output reproducible between runs.
subfolders = sorted(
    f for f in os.listdir(folder_path)
    if os.path.isdir(os.path.join(folder_path, f))
)
documents = []

# Iterate over the subfolders
for subfolder in subfolders:
    subfolder_path = os.path.join(folder_path, subfolder)

    # Find the Parquet file within the subfolder
    parquet_files = [f for f in os.listdir(subfolder_path) if f.endswith('.parquet')]

    # Only folders with exactly one Parquet file are converted; anything else
    # is reported and skipped.
    if len(parquet_files) != 1:
        print(f"Skipping folder {subfolder}: No or multiple Parquet files found.")
        continue

    parquet_file = os.path.join(subfolder_path, parquet_files[0])

    # Read the Parquet file into a DataFrame
    df = pd.read_parquet(parquet_file)

    # Convert the DataFrame to a list of row dictionaries
    json_records = df.to_dict(orient='records')

    # The document id is the subfolder name without its last 10 characters.
    # NOTE(review): presumably this leaves the date prefix of the folder name
    # (e.g. a trailing '_idealista' is dropped) -- confirm against the data.
    date = subfolder[:-10]

    # Create a dictionary for the final document
    document = {
        "_id": date,
        "value": json_records
    }

    documents.append(document)

# Make sure the output folder exists; open() does not create missing
# directories.
os.makedirs(output_folder, exist_ok=True)

# Write all documents to the output JSON file in one pass, after the loop.
# NOTE(review): 'utf-8-sig' prefixes the file with a UTF-8 BOM; it is kept as
# in the original, so readers may need to open the file with the same
# encoding -- confirm that downstream consumers expect this.
with open(json_file, 'w', encoding='utf-8-sig') as file:
    json.dump(documents, file, ensure_ascii=False)