In [30]:
import pandas as pd
import numpy as np 
import json
import geopandas as gpd
from pprint import pprint
import os

In [40]:
directory = "./processed_data/"

# Load every CSV found directly inside `directory` into a plain-Python
# snapshot keyed by filename:
#   {"<file>.csv": {"columns": [col, ...], "values": [[cell, ...], ...]}}
dataframes = {}
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `dataframes` (and of anything built from it) stable between runs.
for filename in sorted(os.listdir(directory)):
    if filename.endswith(".csv"):
        file_path = os.path.join(directory, filename)
        try:
            df = pd.read_csv(file_path)
            dataframes[filename] = {
                "columns": df.columns.tolist(),
                # Cast to object first: `.values` on an all-numeric frame
                # collapses to one common dtype, which would silently turn
                # integer columns (e.g. ids) into floats.
                "values": df.astype(object).values.tolist()
            }

        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"Could not read {filename} due to: {e}")


In [41]:
# Rebuild one DataFrame per loaded file from its stored column names and row
# values, keyed by the filename with its last four characters (".csv") removed.
dfs = dict(
    (csv_name[:-4], pd.DataFrame(payload['values'], columns=payload['columns']))
    for csv_name, payload in dataframes.items()
)


In [42]:
# Column order of the unified output table.
final_columns = [
    "ID",
    "Name",
    "Type",
    "Latitude",
    "Longitude",
    "Description",
    "Details",
    "Address",
    "Geometry",
    "Contact",
]


In [43]:

# Source columns that already feed a dedicated output field (or are dropped on
# purpose, like "bounds"); every other column is folded into "Details".
RESERVED_COLUMNS = frozenset([
    "id", "name", "latitude", "longitude", "description", "country", "city",
    "street", "postcode", "bounds", "geometry", "email", "phone", "type",
    "housenumber", "website",
])
# Source columns joined (in this order) into "Address" and "Contact".
ADDRESS_COLUMNS = ["country", "city", "street", "postcode", "housenumber"]
CONTACT_COLUMNS = ["email", "phone", "website"]


def _column_or_none(df, column):
    """Return ``df[column]`` if the frame has that column, otherwise ``None``."""
    return df[column] if column in df.columns else None


def _join_rowwise(df, columns, labelled=False):
    """Build one ", "-separated string per row from the non-null cells of ``columns``.

    Args:
        df: Source frame.
        columns: Column names to read, in output order; all must exist in ``df``.
        labelled: When True each cell is rendered as ``"<column>: <value>"``,
            otherwise as ``str(value)``.

    Returns:
        A Series of strings aligned with ``df.index`` (rows with no non-null
        cell yield ``""``), or ``None`` when ``df`` has no rows.
    """
    if len(df) == 0:
        return None

    def render(row):
        if labelled:
            return ", ".join(f"{col}: {row[col]}" for col in columns if pd.notna(row[col]))
        return ", ".join(str(row[col]) for col in columns if pd.notna(row[col]))

    return df.apply(render, axis=1)


def _standardize_frame(df):
    """Map one source frame onto the unified output schema.

    Recognized lower-case source columns are copied to their capitalized
    output field; missing ones become ``None``. Address and contact parts are
    joined into single strings, and all remaining columns are summarized in
    "Details".

    Returns:
        A DataFrame with one row per row of ``df`` and the ten unified columns.
    """
    # Resolve column membership once per frame instead of once per row.
    detail_columns = [col for col in df.columns if col not in RESERVED_COLUMNS]
    address_columns = [col for col in ADDRESS_COLUMNS if col in df.columns]
    contact_columns = [col for col in CONTACT_COLUMNS if col in df.columns]

    standardized = {
        "ID": _column_or_none(df, "id"),
        "Name": _column_or_none(df, "name"),
        "Type": _column_or_none(df, "type"),
        "Latitude": _column_or_none(df, "latitude"),
        "Longitude": _column_or_none(df, "longitude"),
        "Description": _column_or_none(df, "description"),
        "Details": _join_rowwise(df, detail_columns, labelled=True),
        "Address": _join_rowwise(df, address_columns),
        "Geometry": _column_or_none(df, "geometry"),
        "Contact": _join_rowwise(df, contact_columns),
    }

    # Passing the index explicitly lets the constructor broadcast the None
    # placeholders even when *every* value is None (an empty frame with no
    # recognized columns); without it pandas raises "If using all scalar
    # values, you must pass an index".
    return pd.DataFrame(standardized, index=df.index)


# Standardize every loaded frame; frames are looked up by filename minus ".csv".
final_rows = [_standardize_frame(dfs[df_name[:-4]]) for df_name in dataframes]




In [44]:
# Stack the per-file standardized frames under a fresh 0..n-1 index, then
# select the columns in the order given by `final_columns`.
final_df = pd.concat(final_rows, ignore_index=True)[final_columns]



In [45]:
# Write the combined table to disk, omitting the index column.
final_df.to_csv(path_or_buf='./final_processed_data.csv', index=False)