In [18]:
import datetime
import hashlib
import json
from urllib.parse import urlencode

import pandas as pd
import requests

def fetch_metadata(url, timeout=30):
    """Fetch JSON metadata from an API endpoint.

    Args:
        url: Full URL of the endpoint to query with an HTTP GET.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, so a stalled connection
            would hang the notebook.

    Returns:
        The response body decoded from JSON.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
        ValueError: If the response body is not valid JSON.
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of parsing an error page.
    response.raise_for_status()
    return response.json()

def generate_hash(data):
    """Return the SHA-256 hex digest of ``data`` serialized as JSON.

    Keys are sorted during serialization, so two mappings with the same
    content produce the same digest regardless of key insertion order.
    """
    canonical_json = json.dumps(data, sort_keys=True)
    digest = hashlib.sha256()
    digest.update(canonical_json.encode())
    return digest.hexdigest()

def merge_metadata(dataframes):
    """Merge multiple metadata DataFrames while preserving all columns.

    Args:
        dataframes: Iterable (or mapping) of DataFrames to stack row-wise.

    Returns:
        One DataFrame holding the rows of every input under a fresh
        0..n-1 index. Columns are the union of the inputs' columns, kept
        unsorted; cells for columns a frame lacks are filled with NaN.
        An empty DataFrame is returned when no frames are supplied.
    """
    # Materialise generators so emptiness can be checked; mappings are passed
    # through untouched because pd.concat handles them itself.
    frames = dataframes if isinstance(dataframes, dict) else list(dataframes)
    if not frames:
        # pd.concat raises ValueError("No objects to concatenate") here.
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True, sort=False)

def save_to_csv(df, filename="merged_metadata.csv"):
    """Write ``df`` to ``filename`` as CSV and print a confirmation.

    The DataFrame index is not written to the file.
    """
    df.to_csv(path_or_buf=filename, index=False)
    confirmation = f"Saved merged metadata to {filename}"
    print(confirmation)

if __name__ == "__main__":
    # Upper bound on how many datasets are queried in one run.
    SAMPLE_SIZE = 10

    # Read dataset names from opendata_swiss_datasets.csv
    # (the names are expected in the 'Dataset_Name' column).
    datasets_df = pd.read_csv("opendata_swiss_datasets.csv")
    dataset_names = datasets_df['Dataset_Name'].dropna()

    # Pick up to SAMPLE_SIZE random datasets. Series.sample raises ValueError
    # when n exceeds the number of rows, so n is capped at what is available.
    # random_state pins the selection so re-runs query the same datasets.
    random_datasets = dataset_names.sample(
        n=min(SAMPLE_SIZE, len(dataset_names)), random_state=42
    ).tolist()

    # One normalized DataFrame per successfully fetched dataset.
    dataframes = []
    # Names whose request failed, reported once the loop is done.
    skipped_datasets = []

    for dataset in random_datasets:
        # URL-encode the identifier so characters such as '&', '#' or spaces
        # cannot corrupt the query string.
        query = urlencode({"id": dataset})
        url = f"https://ckan.opendata.swiss/api/3/action/package_show?{query}"
        # Timezone-aware UTC timestamp: a naive local time is ambiguous once
        # the CSV leaves this machine.
        timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()

        # Fetch metadata; a single failing dataset must not discard the
        # responses that were already collected.
        try:
            data = fetch_metadata(url)
        except requests.RequestException as exc:
            print(f"Skipping {dataset}: {exc}")
            skipped_datasets.append(dataset)
            continue

        # Hash of the full API response, stored next to the flattened data.
        dataset_hash = generate_hash(data)

        # Flatten nested JSON keys into dotted column names and add the
        # bookkeeping columns.
        df = pd.json_normalize(data)
        df['Dataset_Name'] = dataset
        df['Request_Timestamp'] = timestamp
        df['Metadata_Hash'] = dataset_hash

        dataframes.append(df)

    if skipped_datasets:
        print(f"{len(skipped_datasets)} of {len(random_datasets)} datasets could not be fetched")

    if dataframes:
        # Merge all the dataframes and save the result to CSV.
        merged_df = merge_metadata(dataframes)
        save_to_csv(merged_df)
    else:
        print("No metadata could be fetched; nothing was saved.")


Saved merged metadata to merged_metadata.csv


### Explode `result.resources` into one row per resource

In [21]:
import pandas as pd
import ast

# Read the merged metadata CSV file.
df_merged = pd.read_csv("merged_metadata.csv")


def _parse_resources(raw):
    """Turn one CSV cell of 'result.resources' back into a list.

    String cells are parsed with ast.literal_eval, which evaluates Python
    literals only. Non-string cells (e.g. NaN where a dataset had no value
    in this column) become an empty list instead of raising.
    """
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return []


# Convert 'result.resources' from its string representation to a list of dictionaries.
df_merged['result.resources'] = df_merged['result.resources'].apply(_parse_resources)

# One row per resource, keeping only the owning dataset's name alongside it.
# A dataset with an empty resource list yields a single row whose resource
# entry is NaN, so it stays visible in the output.
df_exploded = (
    df_merged[['Dataset_Name', 'result.resources']]
    .explode('result.resources')
    .reset_index(drop=True)
)

# Expand the dictionaries into columns with a single DataFrame construction.
# apply(pd.Series) builds one Series per row and would surface NaN entries as
# an extra column labelled 0; non-dict entries are mapped to {} so they become
# all-NaN rows instead.
resource_records = [
    entry if isinstance(entry, dict) else {}
    for entry in df_exploded['result.resources']
]
resource_columns = pd.DataFrame(resource_records, index=df_exploded.index)
df_resources = df_exploded[['Dataset_Name']].join(resource_columns)

# Optionally, save the extracted resource details to a CSV file
df_resources.to_csv("resources_metadata.csv", index=False)
print("Saved extracted resources to resources_metadata.csv")


Saved extracted resources to resources_metadata.csv
