In [3]:
# Merge data of CSV files we are going to use
import pandas as pd
from os.path import exists
from os import makedirs

# Define data path
dataPath = '../data/birthPlaceRegion/'
mergedPath = '../data/merged_data/'
if not exists(mergedPath):
    makedirs(mergedPath)


# Define file paths for each year
def merge_csvs_explicit():
    years = range(1997, 2026)  # Only up to 2024 (existing data)
    
    dfs = []
    for year in years:
        file_path = f"{dataPath}{year}_birthPlaceRegion_sex.csv"
        try:
            df = pd.read_csv(file_path, delimiter=',')
            # Replace '..' with 0 in column 'Value' and coerce to numeric
            if 'Value' in df.columns:
                df['Value'] = df['Value'].replace('..', 0)
                df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
            dfs.append(df)
            print(f"Successfully read {year}")
        except FileNotFoundError:
            print(f"Warning: File for year {year} not found")
    
    if dfs:
        df_all_years = pd.concat(dfs, ignore_index=True)
        print(f"Successfully merged {len(dfs)} files")
        return df_all_years
    else:
        print("No files were found to merge")
        return None


# Concatenate all DataFrames into a single DataFrame
df_combined = merge_csvs_explicit()  # or merge_csvs_explicit()
# Save the combined DataFrame to a new CSV file
df_combined.to_csv(f"{mergedPath}merged_birthPlaceRegion_sex.csv", index=False)
print("Merged data saved to 'merged_birthPlaceRegion_sex.csv'")



Successfully read 1997
Successfully read 1998
Successfully read 1999
Successfully read 2000
Successfully read 2001
Successfully read 2002
Successfully read 2003
Successfully read 2004
Successfully read 2005
Successfully read 2006
Successfully read 2007
Successfully read 2008
Successfully read 2009
Successfully read 2010
Successfully read 2011
Successfully read 2012
Successfully read 2013
Successfully read 2014
Successfully read 2015
Successfully read 2016
Successfully read 2017
Successfully read 2018
Successfully read 2019
Successfully read 2020
Successfully read 2021
Successfully read 2022
Successfully read 2023
Successfully read 2024
Successfully read 2025
Successfully merged 29 files
Merged data saved to 'merged_birthPlaceRegion_sex.csv'


In [4]:
# Merge data of CSV files Data_Lloc_naix_(esp_vs_fuera)
#Define data path
dataPath = '../data/birthPlace_spain_v_outside/'
mergedPath = '../data/merged_data/'
# Define file paths for each year
def merge_csvs_explicit_esp():
    years = range(1997, 2026)  # Only up to 2024 (existing data)
    
    dfs = []
    for year in years:
        file_path = f"{dataPath}{year}_birthPlace_spain_v_outside.csv"
        try:
            df = pd.read_csv(file_path, delimiter=',')
            # Replace '..' with 0 in column 'Value' and coerce to numeric
            if 'Value' in df.columns:
                df['Value'] = df['Value'].replace('..', 0)
                df['Value'] = pd.to_numeric(df['Value'], errors='coerce').fillna(0)
            dfs.append(df)
            print(f"Successfully read {year}")
        except FileNotFoundError:
            print(f"Warning: File for year {year} not found")
    
    if dfs:
        df_all_years = pd.concat(dfs, ignore_index=True)
        print(f"Successfully merged {len(dfs)} files")
        return df_all_years
    else:
        print("No files were found to merge")
        return None


# Concatenate all DataFrames into a single DataFrame
df_combined = merge_csvs_explicit_esp()  # or merge_csvs_explicit()
# Save the combined DataFrame to a new CSV file
df_combined.to_csv(f"{mergedPath}merged_birthPlace_spain_v_outside.csv", index=False)
print("Merged data saved to 'merged_birthPlace_spain_v_outside.csv'")


Successfully read 1997
Successfully read 1998
Successfully read 1999
Successfully read 2000
Successfully read 2001
Successfully read 2002
Successfully read 2003
Successfully read 2004
Successfully read 2005
Successfully read 2006
Successfully read 2007
Successfully read 2008
Successfully read 2009
Successfully read 2010
Successfully read 2011
Successfully read 2012
Successfully read 2013
Successfully read 2014
Successfully read 2015
Successfully read 2016
Successfully read 2017
Successfully read 2018
Successfully read 2019
Successfully read 2020
Successfully read 2021
Successfully read 2022
Successfully read 2023
Successfully read 2024
Successfully read 2025
Successfully merged 29 files
Merged data saved to 'merged_birthPlace_spain_v_outside.csv'


## Data Adoptions

In [5]:
# Path to adoption folders
from ntpath import join
from os.path import exists
from os import makedirs
import re
import pandas as pd

path_adoption_tot = '../data_original/adoption/Total_adoption/'
path_adoption_inq = '../data_original/adoption/Adoption_inquiries/'
mergedPath = '../data/merged_data/'
if not exists(mergedPath):
    makedirs(mergedPath)
# Merge adoption total data
# ...existing code...
def merge_adoption_data(path, out_name='merged_adoption_total.csv'):
    import re
    from io import StringIO
    from os.path import join

    files = [
        't15833202300-201200.csv',
        't15833201100.csv',
        't15833201000.csv',
        't15833200900.csv',
        't15833200800-199800.csv'
    ]

    dfs = []
    for file in files:
        fp = join(path, file)
        try:
            with open(fp, 'r', encoding='utf-8') as f:
                lines = f.readlines()
        except FileNotFoundError:
            print(f"Warning: {file} not found in {path}")
            continue
        except Exception as e:
            print(f"Error opening {file}: {e}")
            continue

        # find the header line that starts with a comma followed by a 4-digit year (handles single- and multi-year files)
        header_idx = None
        for i, line in enumerate(lines):
            if re.match(r'^\s*,\s*\d{4}', line):
                header_idx = i
                break
        # fallback: any line that contains a 4-digit year
        if header_idx is None:
            for i, line in enumerate(lines):
                if re.search(r'\d{4}', line):
                    header_idx = i
                    break
        if header_idx is None:
            print(f"Warning: year-header row not found in {file}, skipping")
            continue

        csv_text = ''.join(lines[header_idx:])  # header + data portion
        try:
            tmp = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str, keep_default_na=False)
        except Exception as e:
            print(f"Error reading CSV portion of {file}: {e}")
            continue

        # Ensure a column name for the first column and forward-fill broken rows
        first_col = tmp.columns[0]
        if first_col == '' or first_col is None:
            tmp = tmp.rename(columns={tmp.columns[0]: 'Category'})
            first_col = 'Category'
        tmp[first_col] = tmp[first_col].replace('', pd.NA).ffill()

        # melt to long format: Category | Year | Value
        long = tmp.melt(id_vars=[first_col], var_name='Year', value_name='Value')

        # cleanup Year and Value
        long['Year'] = long['Year'].astype(str).str.extract(r'(\d{4})')[0]
        long = long.dropna(subset=['Year'])
        long['Year'] = long['Year'].astype(int)

        long['Value'] = long['Value'].astype(str).str.strip()
        long['Value'] = long['Value'].replace({'': None, '..': None})
        long['Value'] = pd.to_numeric(long['Value'], errors='coerce').fillna(0).astype(int)

        long = long.rename(columns={first_col: 'Category'})

        dfs.append(long)
        print(f"Processed {file}")

    if not dfs:
        print("No adoption files merged")
        return None

    df_all = pd.concat(dfs, ignore_index=True)
    if not exists(mergedPath):
        makedirs(mergedPath)
    df_all.to_csv(join(mergedPath, out_name), index=False)
    print(f"Saved {out_name} with {len(df_all)} rows")
    return df_all
# ...existing code...

# Merge adoption total data
merge_adoption_data(path_adoption_tot, out_name='merged_adoption_total.csv')


Processed t15833202300-201200.csv
Processed t15833201100.csv
Processed t15833201000.csv
Processed t15833200900.csv
Processed t15833200800-199800.csv
Saved merged_adoption_total.csv with 1678 rows


Unnamed: 0,Category,Year,Value
0,Niños adoptados,2023,64
1,Total Europa,2023,1
2,Bosnia y Herzegovina,2023,0
3,Bulgaria,2023,0
4,Croacia,2023,0
...,...,...,...
1673,Tailandia,1998,0
1674,Vietnam,1998,0
1675,China,1998,28
1676,Resto de Asia,1998,0


#### Data Clean for Adoption Inquiries

In [7]:
# Clean adoption inquiries data files by stripping extraneous headers
def strip_header_and_save(in_fp, out_fp):
    import re
    from io import StringIO
    import pandas as pd

    with open(in_fp, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Prefer header lines like ",2009" or ",2023,2022,..."
    header_idx = None
    for i, line in enumerate(lines):
        if re.match(r'^\s*,\s*\d{4}', line):
            header_idx = i
            break

    # fallback: find first data-like line (contains digits and commas) and take previous line as header
    if header_idx is None:
        for i, line in enumerate(lines):
            if ',' in line and re.search(r'\d', line):
                header_idx = max(0, i - 1)
                break

    if header_idx is None:
        raise RuntimeError(f"Header not found in {in_fp}")

    csv_text = ''.join(lines[header_idx:])
    df = pd.read_csv(StringIO(csv_text), sep=',', header=0, dtype=str, keep_default_na=False)
    df.to_csv(out_fp, index=False)
    print(f"Saved cleaned CSV: {out_fp}")

# Example: clean one file
strip_header_and_save(
    "../data_original/adoption/Adoption_inquiries/t15832c1.csv")
# Batch: process all CSVs in a folder
import os, glob
folder = "../data_original/adoption/Adoption_inquiries/"
for fp in glob.glob(os.path.join(folder, "*.csv")):
    out = fp.replace(".csv", "_clean.csv")
    try:
        strip_header_and_save(fp, out)
    except Exception as e:
        print("Skipped", fp, "->", e)


TypeError: strip_header_and_save() missing 1 required positional argument: 'out_fp'