# Data Loading 

This notebook serves to ingest all raw datasets, to clean them, harmonize & merge them if needed and save the cleaned CSVs. 



## 0. Import Dependencies

In [396]:
# import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path
import kagglehub
import zipfile
import glob
import shutil
import re
import sys
#!{sys.executable} -m pip install pycountry
import pycountry
#!{sys.executable} -m pip install fuzzywuzzy
from fuzzywuzzy import fuzz

In [78]:
# folder path 
DATA_RAW = os.path.join("..", "data", "raw")
DATA_CLEAN = os.path.join("..", "data", "clean")
DATA_PROCESSED = os.path.join("..", "data", "processed")

In [3]:
# Seed for reproducibility
SEED = 42
np.random.seed(SEED)

## 1. File Paths

In [80]:
# folder path 
BASE_DIR = Path(r"C:/Users/black/Documents/Ironhack/final_project")
RAW_DIR = BASE_DIR / "data" / "raw"
CLEAN_DIR = BASE_DIR / "data" / "clean"
PROCESSED_DIR = BASE_DIR / "data" / "processed"

RAW_DIR.mkdir(parents=True, exist_ok=True)
CLEAN_DIR.mkdir(parents=True, exist_ok=True)
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

## 2. Helper Functions

### 2.1 Loading Dataset from Kaggle

In [41]:
def download_kagglehub_dataset(dataset_name, download_dir):
    """ Download & save dataset from KaggleHub."""
    try:
        print(f"Downloading dataset: {dataset_name}")
        path = kagglehub.dataset_download(dataset_name)
        print("Dataset downloaded to:", path)

        # Move downloaded files into your /data/raw folder
        dest = download_dir / dataset_name.replace("/", "_")
        dest.mkdir(parents=True, exist_ok=True)

        # Copy all downloaded files to the raw directory
        source_path = Path(path)
        for file in source_path.rglob("*"):  
            if file.is_file():
                relative_path = file.relative_to(source_path)
                new_file = dest / relative_path.name  
                
                if not new_file.exists():  
                    print(f"Copying: {file.name}")
                    shutil.copy2(file, new_file)

        print(f"Files moved to: {dest}")
        return dest

    except Exception as e:
        print(f"Error downloading dataset {dataset_name}: {e}")
        return None

def unzip_files_in_folder(folder_path):
    """Extract all ZIP files inside a folder."""
    zip_files = list(folder_path.glob("*.zip"))

    if not zip_files:
        return

    for z in zip_files:
        print(f"Extracting: {z.name}")
        try:
            with zipfile.ZipFile(z, "r") as zip_ref:
                zip_ref.extractall(folder_path)
        except Exception as e:
            print(f"Error unzipping {z.name}: {e}")



def load_csv_from_folder(folder_path):
    """Find and load the first CSV file in the folder with encoding handling."""
    csv_files = list(folder_path.glob("*.csv"))

    if not csv_files:
        print(f"No CSV files found in: {folder_path}")
        return None

    if len(csv_files) > 1:
        print(f"Multiple CSV files found, loading the first one:\n{csv_files}")

    file_to_load = csv_files[0]
    print(f"Loading CSV: {file_to_load.name}")

    encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            print(f"  Trying {encoding}...")
            df = pd.read_csv(file_to_load, encoding=encoding)
            print(f"Success with {encoding}!")
            return df
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error: {e}")
            continue
    
    print("  All encodings failed. Loading with error='ignore'...")
    try:
        df = pd.read_csv(file_to_load, encoding='utf-8', encoding_errors='ignore')
        print("Loaded (some characters may be missing)")
        return df
    except Exception as e:
        print(f"Failed completely: {e}")
        return None


### 2.2 Standardization + Quick Clean

In [15]:
def standardize_columns(df):
    """Convert all column names to snake_case."""
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace(r"[^a-zA-Z0-9_]", "", regex=True)
    )
    return df

In [16]:
def quick_clean(df):
    """Light cleaning: strip whitespace, unify types."""
    for col in df.select_dtypes(include="object"):
        df[col] = df[col].str.strip()
    return df

In [None]:
def save_df(df, path):
    """Save DataFrame cleanly."""
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(path, index=False)
    print(f"Saved to: {path}")


def load_kagglehub_csv(dataset_name, raw_dir, auto_save=True):
    """
    High-level helper:
    1. Download dataset from KaggleHub
    2. Load first CSV found
    3. Standardize columns & basic cleaning
    4. Auto-save cleaned version to RAW_DIR
    """
    # Create a clean filename for saving
    clean_filename = dataset_name.replace("/", "_") + "_cleaned.csv"
    save_path = raw_dir / clean_filename
    
    # Check if already exists
    if save_path.exists():
        print(f"Loading existing cleaned file: {clean_filename}")
        df = pd.read_csv(save_path)
        return df
    
    # Otherwise, download and process
    folder = download_kagglehub_dataset(dataset_name, raw_dir)
    if folder is None:
        return None
    
    unzip_files_in_folder(folder)

    df = load_csv_from_folder(folder)
    if df is None:
        return None

    df = standardize_columns(df)
    df = quick_clean(df)
    
    # Auto-save the cleaned version
    if auto_save:
        save_df(df, save_path, f"Saved cleaned data: {clean_filename}")
    
    return df

### 2.3 Loading + Saving

In [397]:
def download_kagglehub_dataset(dataset_name, download_dir):
    """ Download & save dataset from KaggleHub."""
    try:
        print(f"Downloading dataset: {dataset_name}")
        path = kagglehub.dataset_download(dataset_name)
        print("Dataset downloaded to:", path)

        # Move downloaded files into your /data/raw folder
        dest = download_dir / dataset_name.replace("/", "_")
        dest.mkdir(parents=True, exist_ok=True)

        # Copy all downloaded files to the raw directory
        source_path = Path(path)
        for file in source_path.rglob("*"):  
            if file.is_file():
                relative_path = file.relative_to(source_path)
                new_file = dest / relative_path.name  
                
                if not new_file.exists():  
                    print(f"Copying: {file.name}")
                    shutil.copy2(file, new_file)

        print(f"Files moved to: {dest}")
        return dest

    except Exception as e:
        print(f"Error downloading dataset {dataset_name}: {e}")
        return None

def unzip_files_in_folder(folder_path):
    """Extract all ZIP files inside a folder."""
    zip_files = list(folder_path.glob("*.zip"))

    if not zip_files:
        return

    for z in zip_files:
        print(f"Extracting: {z.name}")
        try:
            with zipfile.ZipFile(z, "r") as zip_ref:
                zip_ref.extractall(folder_path)
        except Exception as e:
            print(f"Error unzipping {z.name}: {e}")



def load_csv_from_folder(folder_path):
    """Find and load the first CSV file in the folder with encoding handling."""
    csv_files = list(folder_path.glob("*.csv"))

    if not csv_files:
        print(f"No CSV files found in: {folder_path}")
        return None

    if len(csv_files) > 1:
        print(f"Multiple CSV files found, loading the first one:\n{csv_files}")

    file_to_load = csv_files[0]
    print(f"Loading CSV: {file_to_load.name}")

    encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']
    
    for encoding in encodings:
        try:
            print(f"  Trying {encoding}...")
            df = pd.read_csv(file_to_load, encoding=encoding)
            print(f"Success with {encoding}!")
            return df
        except UnicodeDecodeError:
            continue
        except Exception as e:
            print(f"Error: {e}")
            continue
    
    print("  All encodings failed. Loading with error='ignore'...")
    try:
        df = pd.read_csv(file_to_load, encoding='utf-8', encoding_errors='ignore')
        print("Loaded (some characters may be missing)")
        return df
    except Exception as e:
        print(f"Failed completely: {e}")
        return None
def standardize_columns(df):
    """Convert all column names to snake_case."""
    df.columns = (
        df.columns
        .str.strip()
        .str.lower()
        .str.replace(" ", "_")
        .str.replace("-", "_")
        .str.replace(r"[^a-zA-Z0-9_]", "", regex=True)
    )
    return df
def quick_clean(df):
    """Light cleaning: strip whitespace, unify types."""
    for col in df.select_dtypes(include="object"):
        df[col] = df[col].str.strip()
    return df
def load_kagglehub_csv(dataset_name, raw_dir):
    """
    High-level helper:
    1. Download dataset from KaggleHub
    2. Load first CSV found
    3. Standardize columns & basic cleaning
    """
    folder = download_kagglehub_dataset(dataset_name, raw_dir)
    if folder is None:
        return None
    
    unzip_files_in_folder(folder)

    df = load_csv_from_folder(folder)
    if df is None:
        return None

    df = standardize_columns(df)
    df = quick_clean(df)
    return df
def save_df(df, path):
    """Save DataFrame cleanly."""
    df.to_csv(path, index=False)
    print(f"Saved to: {path}")

### 2.4 ISO Code

In [430]:
def get_iso_code(country_name, method='alpha_3'):
    """
    Get ISO code for a country name using pycountry.
    
    Parameters:
    -----------
    country_name : str
        The country name to look up
    method : str
        'alpha_2' for 2-letter codes (US, GB)
        'alpha_3' for 3-letter codes (USA, GBR) - default
    
    Returns:
    --------
    str or None : ISO code if found, None otherwise
    """
    if pd.isna(country_name) or country_name == '':
        return None
    
    country_name = str(country_name).strip()
    
    # Direct lookup by name
    try:
        country = pycountry.countries.get(name=country_name)
        if country:
            return country.alpha_3 if method == 'alpha_3' else country.alpha_2
    except:
        pass
    
    # Try official name
    try:
        country = pycountry.countries.get(official_name=country_name)
        if country:
            return country.alpha_3 if method == 'alpha_3' else country.alpha_2
    except:
        pass
    
    # Fuzzy matching for close matches
    best_match = None
    best_score = 0
    
    for country in pycountry.countries:
        # Check name
        score = fuzz.ratio(country_name.lower(), country.name.lower())
        if score > best_score:
            best_score = score
            best_match = country
        
        # Check common_name if exists
        if hasattr(country, 'common_name'):
            score = fuzz.ratio(country_name.lower(), country.common_name.lower())
            if score > best_score:
                best_score = score
                best_match = country
        
        # Check official_name if exists
        if hasattr(country, 'official_name'):
            score = fuzz.ratio(country_name.lower(), country.official_name.lower())
            if score > best_score:
                best_score = score
                best_match = country
    
    # Only return if confidence is high enough (>85% match)
    if best_score >= 85 and best_match:
        return best_match.alpha_3 if method == 'alpha_3' else best_match.alpha_2
    
    return None


def create_country_mapping():
    """
    Create a manual mapping for countries that pycountry struggles with.
    This handles common variations and special cases.
    """
    manual_mapping = {
        # Common variations
        'United States': 'USA',
        'USA': 'USA',
        'US': 'USA',
        'United States of America': 'USA',
        'UK': 'GBR',
        'United Kingdom': 'GBR',
        'England': 'GBR',
        'Great Britain': 'GBR',
        'Russia': 'RUS',
        'Russian Federation': 'RUS',
        'South Korea': 'KOR',
        'Korea, Republic of': 'KOR',
        'North Korea': 'PRK',
        "Korea, Democratic People's Republic of": 'PRK',
        'Vietnam': 'VNM',
        'Viet Nam': 'VNM',
        'Syria': 'SYR',
        'Syrian Arab Republic': 'SYR',
        'Iran': 'IRN',
        'Iran, Islamic Republic of': 'IRN',
        'Venezuela': 'VEN',
        'Venezuela, Bolivarian Republic of': 'VEN',
        'Bolivia': 'BOL',
        'Bolivia, Plurinational State of': 'BOL',
        'Tanzania': 'TZA',
        'Tanzania, United Republic of': 'TZA',
        'Moldova': 'MDA',
        'Moldova, Republic of': 'MDA',
        'Laos': 'LAO',
        "Lao People's Democratic Republic": 'LAO',
        'Palestine': 'PSE',
        'Palestinian Territory': 'PSE',
        'Palestinian Territories': 'PSE',
        'State of Palestine': 'PSE',
        'Czechia': 'CZE',
        'Czech Republic': 'CZE',
        'Turkey': 'TUR',
        'Türkiye': 'TUR',
        'Cape Verde': 'CPV',
        'Cabo Verde': 'CPV',
        'Congo': 'COG',
        'Republic of the Congo': 'COG',
        'Congo, Republic of the': 'COG',
        'Democratic Republic of the Congo': 'COD',
        'Congo, Democratic Republic of the': 'COD',
        'DR Congo': 'COD',
        'DRC': 'COD',
        'Ivory Coast': 'CIV',
        "Côte d'Ivoire": 'CIV',
        'Cote d\'Ivoire': 'CIV',
        'Brunei': 'BRN',
        'Brunei Darussalam': 'BRN',
        'Micronesia': 'FSM',
        'Micronesia, Federated States of': 'FSM',
        'Macedonia': 'MKD',
        'North Macedonia': 'MKD',
        'The former Yugoslav Republic of Macedonia': 'MKD',
        'Eswatini': 'SWZ',
        'Swaziland': 'SWZ',
        'East Timor': 'TLS',
        'Timor-Leste': 'TLS',
        'Burma': 'MMR',
        'Myanmar': 'MMR',
        'Gambia': 'GMB',
        'The Gambia': 'GMB',
        'Bahamas': 'BHS',
        'The Bahamas': 'BHS',
        'Congo Democratic Republic': 'COD',
        'Congo, Dem. Rep.': 'COD',
        'Bahamas, The': 'BHS',  
        'Congo, Rep.': 'COG',
        'Egypt, Arab Rep.': 'EGY',
        'Gambia, The': 'GMB',
        'Hong Kong SAR, China': 'HKG',
        'Hong Kong, China': 'HKG',
        'Macau, China': 'MAC',
        'Taiwan, China': 'TWN',
        'Iran, Islamic Rep.': 'IRN',
        'Korea, Rep.': 'KOR',
        'Lao PDR': 'LAO',
        'Venezuela, RB': 'VEN',
        'Yemen, Rep.': 'YEM',
        'Channel Islands': 'GBR',
        'United States Virgin Islands': 'VIR',
        'Netherlands (Kingdom of the)': 'NLD',
    }
    return manual_mapping


def add_iso_codes(df, country_col='country', method='alpha_3', new_col_name='iso'):
    """
    Add ISO codes to a DataFrame with a country column.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing country names
    country_col : str
        Name of the column containing country names (default: 'country')
    method : str
        'alpha_2' for 2-letter codes or 'alpha_3' for 3-letter codes
    new_col_name : str
        Name for the new ISO code column (default: 'iso')
    
    Returns:
    --------
    pd.DataFrame : DataFrame with new ISO code column
    dict : Dictionary of unmatched countries
    """
    if country_col not in df.columns:
        print(f"Warning: Column '{country_col}' not found in DataFrame")
        return df, {}
    
    df = df.copy()
    manual_mapping = create_country_mapping()
    
    # Apply manual mapping first, then pycountry lookup
    def get_code(country_name):
        if pd.isna(country_name):
            return None
        
        country_name_clean = str(country_name).strip()
        
        # Check manual mapping first
        if country_name_clean in manual_mapping:
            return manual_mapping[country_name_clean]
        
        # Otherwise use pycountry
        return get_iso_code(country_name_clean, method=method)
    
    # Add ISO codes
    df[new_col_name] = df[country_col].apply(get_code)
    
    # Identify unmatched countries
    unmatched = df[df[new_col_name].isna()][country_col].unique()
    unmatched_dict = {country: None for country in unmatched if pd.notna(country)}
    
    if unmatched_dict:
        print(f"\nWarning: {len(unmatched_dict)} countries could not be matched:")
        for country in sorted(unmatched_dict.keys()):
            print(f"  - {country}")
        print("\nConsider adding these to the manual mapping.")
    else:
        print(f"All countries successfully matched!")
    
    # Summary statistics
    total = len(df)
    matched = df[new_col_name].notna().sum()
    print(f"\nMatching Summary:")
    print(f"   Total rows: {total}")
    print(f"   Matched: {matched} ({matched/total*100:.1f}%)")
    print(f"   Unmatched: {total - matched} ({(total-matched)/total*100:.1f}%)")
    
    return df, unmatched_dict


def standardize_all_countries(datasets_dict):
    """
    Add ISO codes to all datasets in a dictionary.
    
    Parameters:
    -----------
    datasets_dict : dict
        Dictionary where keys are dataset names and values are DataFrames
        Example: {'domestic': dv_df_pro, 'vawg': vawg_df_pro, ...}
    
    Returns:
    --------
    dict : Dictionary with updated DataFrames
    dict : Dictionary of all unmatched countries by dataset
    """
    updated_datasets = {}
    all_unmatched = {}
    
    # Define which column contains country names for each dataset
    country_columns = {
        'domestic': None,  
        'vawg': 'country',
        'freedom': 'country',
        'danger': 'country',
        'gdp': 'country',  
        'wage_gap': None,  
        'unemployment': 'country',
        'gii': 'country',
        'legal': 'country',
        'partner_violence': 'country',
        'eq_laws': 'continent', 
    }
    
    for dataset_name, df in datasets_dict.items():
        print(f"\n{'='*60}")
        print(f"Processing: {dataset_name.upper()}")
        print(f"{'='*60}")
        
        country_col = country_columns.get(dataset_name)
        
        # Skip if no country column
        if country_col is None:
            print(f"Skipping (no country column or already has ISO)")
            updated_datasets[dataset_name] = df
            continue
        
        # Skip if already has ISO column
        if 'iso' in df.columns:
            print(f"Already has 'iso' column")
            updated_datasets[dataset_name] = df
            continue
        
        # Add ISO codes
        updated_df, unmatched = add_iso_codes(
            df, 
            country_col=country_col,
            method='alpha_3',
            new_col_name='iso'
        )
        
        updated_datasets[dataset_name] = updated_df
        if unmatched:
            all_unmatched[dataset_name] = unmatched
    
    # Final summary
    print(f"\n{'='*60}")
    print("FINAL SUMMARY")
    print(f"{'='*60}")
    if all_unmatched:
        print("\nDatasets with unmatched countries:")
        for dataset, countries in all_unmatched.items():
            print(f"\n  {dataset.upper()}: {len(countries)} unmatched")
            for country in sorted(countries.keys())[:11]:  
                print(f"    - {country}")
            if len(countries) > 11:
                print(f"    ... and {len(countries) - 11} more")
    else:
        print("\nAll countries matched across all datasets!")
    
    return updated_datasets, all_unmatched


## 2. Load all Data

### 2.1 Load & Quick Clean

In [85]:
dv_df_clean = load_kagglehub_csv("fahmidachowdhury/domestic-violence-against-women", CLEAN_DIR)


Downloading dataset: fahmidachowdhury/domestic-violence-against-women
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\fahmidachowdhury\domestic-violence-against-women\versions\1
Copying: Domestic violence.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\fahmidachowdhury_domestic-violence-against-women
Loading CSV: Domestic violence.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: fahmidachowdhury_domestic-violence-against-women_cleaned.csv


In [86]:
vawg_df_clean = load_kagglehub_csv("whenamancodes/violence-against-women-girls", CLEAN_DIR)

Downloading dataset: whenamancodes/violence-against-women-girls
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\whenamancodes\violence-against-women-girls\versions\1
Copying: Violence Against Women  Girls Data.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\whenamancodes_violence-against-women-girls
Loading CSV: Violence Against Women  Girls Data.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: whenamancodes_violence-against-women-girls_cleaned.csv


In [88]:
hfi_df_clean = load_kagglehub_csv("gsutters/the-human-freedom-index", CLEAN_DIR)

Loading existing cleaned file: gsutters_the-human-freedom-index_cleaned.csv


In [97]:
mdc_df_clean = load_kagglehub_csv("arpitsinghaiml/most-dangerous-countries-for-women-2024", CLEAN_DIR)

Downloading dataset: arpitsinghaiml/most-dangerous-countries-for-women-2024
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\arpitsinghaiml\most-dangerous-countries-for-women-2024\versions\1
Copying: most-dangerous-countries-for-women-2024.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\arpitsinghaiml_most-dangerous-countries-for-women-2024
Loading CSV: most-dangerous-countries-for-women-2024.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: arpitsinghaiml_most-dangerous-countries-for-women-2024_cleaned.csv


In [89]:
gdp_df_clean = load_kagglehub_csv("zgrcemta/world-gdpgdp-gdp-per-capita-and-annual-growths", CLEAN_DIR)

Downloading dataset: zgrcemta/world-gdpgdp-gdp-per-capita-and-annual-growths
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\zgrcemta\world-gdpgdp-gdp-per-capita-and-annual-growths\versions\2
Copying: gdp.csv
Copying: gdp_growth.csv
Copying: gdp_per_capita.csv
Copying: gdp_per_capita_growth.csv
Copying: gdp_ppp.csv
Copying: gdp_ppp_per_capita.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths
Multiple CSV files found, loading the first one:
[WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/clean/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/clean/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp_growth.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/clean/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp_per_capita.csv'), WindowsPath('C:/Use

In [90]:
gwg_df_clean = load_kagglehub_csv("mpwolke/cusersmarildownloadsgapcsv", CLEAN_DIR)

Downloading dataset: mpwolke/cusersmarildownloadsgapcsv
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\mpwolke\cusersmarildownloadsgapcsv\versions\1
Copying: gap.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\mpwolke_cusersmarildownloadsgapcsv
Loading CSV: gap.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: mpwolke_cusersmarildownloadsgapcsv_cleaned.csv


In [91]:
gud_df_clean = load_kagglehub_csv("sazidthe1/global-unemployment-data", CLEAN_DIR)

Downloading dataset: sazidthe1/global-unemployment-data
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\sazidthe1\global-unemployment-data\versions\1
Copying: global_unemployment_data.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\sazidthe1_global-unemployment-data
Loading CSV: global_unemployment_data.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: sazidthe1_global-unemployment-data_cleaned.csv


In [99]:
wed_df_clean = load_kagglehub_csv("nelgiriyewithana/world-educational-data", CLEAN_DIR)

Downloading dataset: nelgiriyewithana/world-educational-data
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\nelgiriyewithana\world-educational-data\versions\1
Copying: Global_Education.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\nelgiriyewithana_world-educational-data
Loading CSV: Global_Education.csv
  Trying utf-8...
  Trying latin-1...
Success with latin-1!
Saved cleaned data: nelgiriyewithana_world-educational-data_cleaned.csv


In [92]:
gii_df_clean = load_kagglehub_csv("gianinamariapetrascu/gender-inequality-index", CLEAN_DIR)

Downloading dataset: gianinamariapetrascu/gender-inequality-index
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\gianinamariapetrascu\gender-inequality-index\versions\2
Copying: Gender_Inequality_Index.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\gianinamariapetrascu_gender-inequality-index
Loading CSV: Gender_Inequality_Index.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: gianinamariapetrascu_gender-inequality-index_cleaned.csv


In [93]:
lf_df_clean = load_kagglehub_csv("willianoliveiragibin/legal-frameworks", CLEAN_DIR)

Downloading dataset: willianoliveiragibin/legal-frameworks
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\willianoliveiragibin\legal-frameworks\versions\1
Copying: legal-frameworks-gender-equality-within-marriage-and-family new.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\willianoliveiragibin_legal-frameworks
Loading CSV: legal-frameworks-gender-equality-within-marriage-and-family new.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: willianoliveiragibin_legal-frameworks_cleaned.csv


In [295]:
aip_df_raw = pd.read_csv(r"data\raw\physical_sexual_abuse_current_or_former_partner.csv")
aip_df_clean = quick_clean(aip_df_raw)
aip_df_clean = standardize_columns(aip_df_clean)

In [95]:
gel_df_clean = load_kagglehub_csv("shreyasur965/global-gender-equality-in-business-laws1970-2023", CLEAN_DIR)

Downloading dataset: shreyasur965/global-gender-equality-in-business-laws1970-2023
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\shreyasur965\global-gender-equality-in-business-laws1970-2023\versions\2
Copying: women-rights.csv
Copying: women_rights_column_descriptors.txt
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\clean\shreyasur965_global-gender-equality-in-business-laws1970-2023
Loading CSV: women-rights.csv
  Trying utf-8...
Success with utf-8!
Saved cleaned data: shreyasur965_global-gender-equality-in-business-laws1970-2023_cleaned.csv


### 2.2 Sanity Check

In [None]:
datasets = {
    "domestic": dv_df_clean,
    "vawg": vawg_df_clean,
    "freedom": hfi_df_clean,
    "danger": mdc_df_clean,
    "gdp": gdp_df_clean,
    "wage_gap": gwg_df_clean,
    "unemployment": gud_df_clean,
    "gii": gii_df_clean,
    "legal": lf_df_clean,
    "partner_violence": aip_df_clean,
    "eq_laws": gel_df_clean
}

for name, df in datasets.items():
    print(f"--- {name.upper()} ---")
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("Missing values:", df.isna().sum().sum())
    print()

--- DOMESTIC ---
Shape: (347, 7)
Columns: ['sl_no', 'age', 'education', 'employment', 'income', 'marital_status', 'violence']
Missing values: 0

--- VAWG ---
Shape: (12600, 8)
Columns: ['recordid', 'country', 'gender', 'demographics_question', 'demographics_response', 'question', 'survey_year', 'value']
Missing values: 1413

--- FREEDOM ---
Shape: (1458, 123)
Columns: ['year', 'iso_code', 'countries', 'region', 'pf_rol_procedural', 'pf_rol_civil', 'pf_rol_criminal', 'pf_rol', 'pf_ss_homicide', 'pf_ss_disappearances_disap', 'pf_ss_disappearances_violent', 'pf_ss_disappearances_organized', 'pf_ss_disappearances_fatalities', 'pf_ss_disappearances_injuries', 'pf_ss_disappearances', 'pf_ss_women_fgm', 'pf_ss_women_missing', 'pf_ss_women_inheritance_widows', 'pf_ss_women_inheritance_daughters', 'pf_ss_women_inheritance', 'pf_ss_women', 'pf_ss', 'pf_movement_domestic', 'pf_movement_foreign', 'pf_movement_women', 'pf_movement', 'pf_religion_estop_establish', 'pf_religion_estop_operate', 'pf_re

## 3. Select Data 

### 3.1 Domestic Violence

- droping case number
- no missing values
- potential further processing:
    income bucktes (no, low, middle, high)

In [431]:
dv_df_pro = dv_df_clean.drop(columns="sl_no")

categorical_columns = ['education', 'employment', 'marital_status', 'violence']
for col in categorical_columns:
    dv_df_pro[col] = dv_df_pro[col].astype('category')



### 3.2 Violence Against Women & Girls

In [432]:
vawg_df_pro = vawg_df_clean.drop(columns="survey_year")

#create demographic group column (text normalized)
def normalize_text(s: str) -> str:
    s = s.lower()
    s = s.replace(" ", "_")
    s = s.replace("-", "_")
    s = s.replace(",", "")
    return s

vawg_df_pro["demographic_group"] = (
    vawg_df_pro["demographics_question"].apply(normalize_text)
    + "_" +
    vawg_df_pro["demographics_response"].apply(normalize_text)
)

In [433]:
# pivot table
def pivot_vawg(df: pd.DataFrame) -> pd.DataFrame:

    pivot_df = df.pivot_table(
        index=["country", "question"],
        columns="demographic_group",
        values="value",
        aggfunc="mean"
    )

    pivot_df = pivot_df.reset_index()

    # Fix column names: remove pivot table formatting
    pivot_df.columns = [str(col) for col in pivot_df.columns]

    return pivot_df


vawg_df_pro = pivot_vawg(vawg_df_pro)




In [434]:
categorical_columns = ['country', 'question']
for col in categorical_columns:
    vawg_df_pro[col] = vawg_df_pro[col].astype('category')

In [435]:
# Impute missing values

def impute_vawg(df: pd.DataFrame) -> pd.DataFrame:
    """
    Impute missing values in VAWG demographic % data.
    Strategy:
        1) If missing → country median for that question
        2) If still missing → global median for that demographic column
    """
    id_cols = ["country", "question"]
    feature_cols = [c for c in df.columns if c not in id_cols]

    for feature in feature_cols:
        df[feature] = df.groupby(["country", "question"])[feature].transform(
            lambda x: x.fillna(x.median())
        )

    for feature in feature_cols:
        if df[feature].isna().sum() > 0:
            df[feature] = df[feature].fillna(df[feature].median())

    return df

# Clean Question

def clean_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """
    Standardize violence-related question text to ensure consistent keys.
    """
    df["question"] = df["question"].str.replace(r"^\.\.\.\s*", "", regex=True)

    # Text normalization: lowercase, strip, unify spaces
    df["question"] = (
        df["question"]
        .str.lower()
        .str.strip()
        .str.replace(r"\s+", " ", regex=True)
    )

    # Unify similar question labels
    replacements = {
        "if she burns food": "if she burns the food",
        "if she burns the food?": "if she burns the food",
        "if she goes out without telling him": "if she goes out without informing him",
    }

    df["question"] = df["question"].replace(replacements)

    return df


# romve columns with high missing values

def drop_sparse_columns(df: pd.DataFrame, threshold: float = 0.5) -> pd.DataFrame:
    """
    Drops columns with more than threshold % missing values.
    threshold=0.5 -> drop columns with >50% missing.
    """

    id_cols = ["country", "question"]
    feature_cols = [c for c in df.columns if c not in id_cols]

    missing_ratio = df[feature_cols].isna().mean()
    drop_cols = missing_ratio[missing_ratio > threshold].index.tolist()

    print(f"Dropping {len(drop_cols)} sparse columns: {drop_cols}")

    df = df.drop(columns=drop_cols)

    return df


# unique key (country + question)

def create_unique_key(df: pd.DataFrame) -> pd.DataFrame:
    """
    for later modeling
    """
    df["key"] = df["country"].astype(str) + " | " + df["question"].astype(str)
    return df


# pipeline

def clean_vawg_full(vawg_df_pro: pd.DataFrame) -> pd.DataFrame:
    df = vawg_df_pro.copy()
    df = clean_question_text(df)
    df = impute_vawg(df)
    df = drop_sparse_columns(df, threshold=0.5)
    df = create_unique_key(df)
    return df.reset_index(drop=True)


# apply

vawg_df_pro = clean_vawg_full(vawg_df_pro)
vawg_df_pro.head()

  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].transform(
  df[feature] = df.groupby(["country", "question"])[feature].tra

Dropping 0 sparse columns: []


  df[feature] = df.groupby(["country", "question"])[feature].transform(


Unnamed: 0,country,question,age_15_24,age_25_34,age_35_49,education_higher,education_no_education,education_primary,education_secondary,employment_employed_for_cash,employment_employed_for_kind,employment_unemployed,marital_status_married_or_living_together,marital_status_never_married,marital_status_widowed_divorced_separated,residence_rural,residence_urban,key
0,Afghanistan,for at least one specific reason,77.1,76.0,76.3,58.3,79.2,73.95,72.8,76.4,80.2,73.75,76.6,15.1,58.2,79.1,66.95,Afghanistan | for at least one specific reason
1,Afghanistan,if she argues with him,50.95,52.3,53.2,34.55,55.2,48.95,49.1,51.65,50.65,50.45,52.5,15.1,43.3,53.8,47.55,Afghanistan | if she argues with him
2,Afghanistan,if she burns the food,13.35,13.2,13.7,7.3,14.85,10.05,10.4,14.6,13.85,11.6,13.4,15.1,10.05,13.9,11.45,Afghanistan | if she burns the food
3,Afghanistan,if she goes out without informing him,65.5,64.45,62.85,42.35,67.2,60.0,58.85,63.8,66.85,60.9,64.2,15.1,47.5,67.45,52.45,Afghanistan | if she goes out without informin...
4,Afghanistan,if she neglects the children,35.65,38.6,36.9,27.15,39.3,36.45,33.7,36.85,39.95,40.2,37.35,15.1,36.2,38.5,33.15,Afghanistan | if she neglects the children


### 3.3 Violence/Sexual Abuse Current or former partner

In [436]:
# only select needed values
aip_df_pro = aip_df_clean[["location", "value"]]

# remove everything in square brackets including the brackets
aip_df_pro['value'] = aip_df_pro['value'].str.replace(r'\s*\[.*?\]', '', regex=True)

# trip any whitespace
aip_df_pro['value'] = aip_df_pro['value'].str.strip()

# rename column
aip_df_pro = aip_df_pro.rename(columns={"location" :  "country"})

# change value type
cat_col_vawg= ['country']
for col in cat_col_vawg:
    aip_df_pro[col] = aip_df_pro[col].astype('category')


num_col_vawg= ['value']
for col in num_col_vawg:
    aip_df_pro[col] = aip_df_pro[col].astype('int')



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aip_df_pro['value'] = aip_df_pro['value'].str.replace(r'\s*\[.*?\]', '', regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  aip_df_pro['value'] = aip_df_pro['value'].str.strip()


### 3.4 Human Freedom Index

In [437]:
hfi_df_clean = pd.read_csv(r"data\clean\gsutters_the-human-freedom-index\hfi_cc_2022.csv")
hfi_df_clean.columns.tolist()

# choose only needed data 
hfi_df_pro = hfi_df_clean[["year", "countries", "region", "hf_score", 'pf_score', 'ef_score']]

# only use latest data (2020) -> all available, better as mean from all years due to NaNs
hfi_df_pro  = hfi_df_pro[hfi_df_pro["year"] == 2020]

# drop year column 
hfi_df_pro = hfi_df_pro.drop(columns=["year"])

# rename columns
hfi_df_pro = hfi_df_pro.rename(columns={"countries" : "country"})

# change value type
cat_col_hfi = ['country', 'region']
for col in cat_col_hfi:
    hfi_df_pro[col] = hfi_df_pro[col].astype('category')



### 3.5 Most Dangerous Countries

In [438]:
mdc_df_clean.head()

# rename columns
mdc_df_pro = mdc_df_clean.rename(columns={
    "mostdangerouscountriesforwomen_womenpeaceandsecurityindex_score_2023" : "wpsi",
    "mostdangerouscountriesforwomen_womensdangerindexwdi_totalscore_2019" : "wdi",
    "mostdangerouscountriesforwomen_wdistreetsafety_2019" : "wss",
    "mostdangerouscountriesforwomen_wdiintentionalhomicide_2019" : "ih",
    "mostdangerouscountriesforwomen_wdinonpartnerviolence_2019" : "npv",
    "mostdangerouscountriesforwomen_wdiintimatepartnerviolence_2019" : "ipv",
    "mostdangerouscountriesforwomen_wdilegaldiscrimination_2019" : "ld",
    "mostdangerouscountriesforwomen_wdiglobalgendergap_2019" : "ggg",
    "mostdangerouscountriesforwomen_wdigenderinequality_2019" : "gi",
    "mostdangerouscountriesforwomen_wdiattitudestowardviolence_2019" : "atv"
    })


# split into two data sets due to missing values
# 1. Women Peace and Security Index 2023 (all countries): WPSI
wpsi_df_pro = mdc_df_pro[["country", "wpsi"]]

# 2. Women Danger index + respective parameters; remove ggg & gi as given in other data
mdc_df_pro = mdc_df_pro.drop(columns=["wpsi", "ggg", "gi"])

# drop missing values
mdc_df_pro = mdc_df_pro.dropna()

# reset index
mdc_df_pro = mdc_df_pro.reset_index(drop=True)


### 3.6 GDP

In [439]:
# keeping relevant columns
gdp_df_pro = gdp_df_clean[['country_name', 'code', '2020']]

# rename columns
gdp_df_pro = gdp_df_pro.rename(columns={'country_name' : "country", 'code' : "iso", '2020' : "gdp"})

# manual fill-in NaN
gdp_df_pro[gdp_df_pro.isna().any(axis=1)]

gdp_df_pro.iloc[0, 2] = 2.482e+9
gdp_df_pro.iloc[6, 2] = 2.891e+9
gdp_df_pro.iloc[38, 2] = 9.44e+9
gdp_df_pro.iloc[69, 2] = 2.084e+9
gdp_df_pro.iloc[78, 2] = 3.272e+9
gdp_df_pro.iloc[84, 2] = 3e+9
gdp_df_pro.iloc[91, 2] = 3.08e+9
gdp_df_pro.iloc[108, 2] = 6.68e+9
gdp_df_pro.iloc[137, 2] = 2.2e+9
gdp_df_pro.iloc[172, 2] = 9.45e+9
gdp_df_pro.iloc[193, 2] = 1.5847e+10
gdp_df_pro.iloc[199, 2] = 5.79e+9
gdp_df_pro.iloc[216, 2] = 5.4e+9
gdp_df_pro.iloc[227, 2] = 1.205e+10
gdp_df_pro.iloc[235, 2] = 4.582e+10
gdp_df_pro.iloc[254, 2] = 8.277e+10
gdp_df_pro.iloc[235, 2] = 4.582e+10
gdp_df_pro.iloc[262, 2] = 2.02e+10

# delete unsignificant rows
gdp_df_pro = gdp_df_pro.drop([110, 147, 164, 212, 225, 255, 256])

# log GDP to reduce skeweness
gdp_df_pro["log_GDP"] = np.log10(gdp_df_pro["gdp"])

# drop non-log GDP
gdp_df_pro = gdp_df_pro.drop(columns=["gdp"])

# change value type
cat_col_gdp= ['country', "iso"]
for col in cat_col_gdp:
    gdp_df_pro[col] = gdp_df_pro[col].astype('category')


### 3.7 Gender Wage Gap

In [None]:
# data into right format
column_names = ['iso', 'indicator', 'subject', 'measure', 'frequency', 'time', 'gwg_%']

gwg_df_pro = gwg_df_clean['locationindicatorsubjectmeasurefrequencytimevalue'].str.split(';', expand=True)

gwg_df_pro.columns = column_names

# only select useful data
gwg_df_pro = gwg_df_pro[gwg_df_pro["time"] == "2018"]  

gwg_df_pro = gwg_df_pro[gwg_df_pro["subject"] != "SELFEMPLOYED"]

gwg_df_pro = gwg_df_pro[["iso", "gwg_%"]]

# change gwg_% format 
gwg_df_pro["gwg_%"] = gwg_df_pro["gwg_%"].astype(str).str.split(".").str[:2].str.join(".").astype(float)

# reset index
gwg_df_pro = gwg_df_pro.reset_index(drop=True)

# change value type
cat_col_gwg= ["iso"]
for col in cat_col_gwg:
    gwg_df_pro[col] = gwg_df_pro[col].astype('category')



[CategoricalDtype(categories=['AUS', 'AUT', 'BEL', 'BGR', 'CAN', 'CHE', 'COL', 'CRI',
                   'CYP', 'CZE', 'DEU', 'DNK', 'ESP', 'EST', 'EU27', 'FIN',
                   'FRA', 'GBR', 'GRC', 'HRV', 'HUN', 'IRL', 'ISL', 'ISR',
                   'ITA', 'JPN', 'KOR', 'LTU', 'LVA', 'MEX', 'MLT', 'NLD',
                   'NOR', 'NZL', 'OECD', 'POL', 'PRT', 'ROU', 'SVK', 'SVN',
                   'SWE', 'TUR', 'USA'],
 , ordered=False, categories_dtype=object),
 dtype('float64')]

### 3.8 Global Unemployment Data

In [441]:
gud_df_clean.head()

# select relevant columns
gud_df_pro = gud_df_clean[["country_name", "sex", "age_group", "2024"]]

# rename columnc 
gud_df_pro = gud_df_pro.rename(columns={"country_name" : "country", "age_group" : "age", "2024" : "percentage_unemployement"})

# handle missing values
gud_df_pro[gud_df_pro.isna().any(axis=1)]

# fill Palestinian Territories with data from 2022
gud_df_pro.iloc[756, 3] = 56.709
gud_df_pro.iloc[757, 3] = 36.385
gud_df_pro.iloc[758, 3] = 40.045
gud_df_pro.iloc[759, 3] = 31.563
gud_df_pro.iloc[760, 3] = 16.772
gud_df_pro.iloc[761, 3] = 20.186

# fill Ukraine with data from 2021
gud_df_pro.iloc[1056, 3] = 20.412
gud_df_pro.iloc[1057, 3] = 9.519
gud_df_pro.iloc[1058, 3] = 10.143
gud_df_pro.iloc[1059, 3] = 18.085
gud_df_pro.iloc[1060, 3] = 8.933
gud_df_pro.iloc[1061, 3] = 9.543

# change value type
cat_col_gud= ["country", "sex", "age"]
for col in cat_col_gud:
    gud_df_pro[col] = gud_df_pro[col].astype('category')

# reset index
gud_df_pro  = gud_df_pro.reset_index(drop=True)


### 3.9 Global Ineguality Index

In [442]:
gii_df_clean[gii_df_clean.isna().any(axis=1)]

# missing values handling open

Unnamed: 0,country,human_development,gii,rank,maternal_mortality,adolescent_birth_rate,seats_parliament,f_secondary_educ,m_secondary_educ,f_labour_force,m_labour_force,iso
3,Hong Kong,Very high,,,,1.6,,77.1,83.4,53.5,65.8,HKG
15,Liechtenstein,Very high,,,,3.0,28.0,,,,,LIE
39,Andorra,Very high,,,,5.9,46.4,70.7,72.4,,,AND
43,San Marino,Very high,,,,3.8,33.3,81.8,84.3,,,SMR
68,Grenada,High,,,25.0,32.7,32.1,,,,,GRD
70,Antigua and Barbuda,High,,,42.0,33.1,31.4,,,,,ATG
71,Seychelles,High,,,53.0,53.4,22.9,,,,,SYC
74,Saint Kitts and Nevis,High,,,,38.2,25.0,,,,,KNA
81,Palau,High,,,,42.5,6.9,96.9,97.3,,,PLW
101,Dominica,High,,,,38.5,34.4,,,,,DMA


### 3.10 Legal Frameworks


In [443]:
# Legal frameworks that promote, enforce and monitor gender equality (percentage of achievement, 0 - 100)

lf_df_pro = lf_df_clean.drop(columns = "unnamed_3")
lf_df_pro = lf_df_pro.rename(columns = {"511___legal_frameworks" : "percent_leg_equ_achiev_marriage"})
lf_df_pro = lf_df_pro[lf_df_pro["year"] == 2022]
lf_df_pro = lf_df_pro.drop(columns="year")

cat_col_lf= ["country"]
for col in cat_col_lf:
    lf_df_pro[col] = lf_df_pro[col].astype('category')

### 3.11 Equality Laws

In [444]:
# only select latest data
gel_df_pro = gel_df_clean[gel_df_clean["year"] == 2023]

# rename columns
gel_df_pro = gel_df_pro.rename(columns={"entity" : "continent"})

# drop unnessecary columns
gel_df_pro = gel_df_pro.drop(columns=["year"])

# reset index
gel_df_pro = gel_df_pro.reset_index(drop=True)

## 4. ISO Country Column

In [None]:
# datasets dictionary
datasets = {
    "domestic": dv_df_pro,
    "vawg": vawg_df_pro,
    "freedom": hfi_df_pro,
    "danger": mdc_df_pro,
    "gdp": gdp_df_pro,
    "wage_gap": gwg_df_pro,
    "unemployment": gud_df_pro,
    "gii": gii_df_clean,
    "legal": lf_df_pro,
    "partner_violence": aip_df_pro,
    "eq_laws": gel_df_pro
}
    
# apply standardization
updated_datasets, unmatched = standardize_all_countries(datasets)
    
# unpack updated datasets back to individual variables
dv_df_pro = updated_datasets['domestic']
vawg_df_pro = updated_datasets['vawg']
hfi_df_pro = updated_datasets['freedom']
mdc_df_pro = updated_datasets['danger']
gdp_df_pro = updated_datasets['gdp']
gwg_df_pro = updated_datasets['wage_gap']
gud_df_pro = updated_datasets['unemployment']
gii_df_clean = updated_datasets['gii']
lf_df_pro = updated_datasets['legal']
aip_df_pro = updated_datasets['partner_violence']
gel_df_pro = updated_datasets['eq_laws']



Processing: DOMESTIC
Skipping (no country column or already has ISO)

Processing: VAWG
All countries successfully matched!

Matching Summary:
   Total rows: 415
   Matched: 415 (100.0%)
   Unmatched: 0 (0.0%)

Processing: FREEDOM
All countries successfully matched!

Matching Summary:
   Total rows: 165
   Matched: 165 (100.0%)
   Unmatched: 0 (0.0%)

Processing: DANGER
All countries successfully matched!

Matching Summary:
   Total rows: 50
   Matched: 50 (100.0%)
   Unmatched: 0 (0.0%)

Processing: GDP
Already has 'iso' column

Processing: WAGE_GAP
Skipping (no country column or already has ISO)

Processing: UNEMPLOYMENT
All countries successfully matched!

Matching Summary:
   Total rows: 1134
   Matched: 1134 (100.0%)
   Unmatched: 0 (0.0%)

Processing: GII
Already has 'iso' column

Processing: LEGAL

  - Central and Southern Asia (UN)
  - Eastern and South-Eastern Asia (UN)
  - Europe and Northern America (UN)
  - Latin America and the Caribbean (UN)
  - Least Developed Countries 

### Sanity Check

In [456]:
datasets = {
    "domestic": dv_df_pro,
    "vawg": vawg_df_pro,
    "freedom": hfi_df_pro,
    "danger": mdc_df_pro,
    "gdp": gdp_df_pro,
    "wage_gap": gwg_df_pro,
    "unemployment": gud_df_pro,
    "gii": gii_df_clean,
    "legal": lf_df_pro,
    "partner_violence": aip_df_pro,
    "eq_laws": gel_df_pro
}

for name, df in datasets.items():
    print(f"--- {name.upper()} ---")
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("Data types:", df.dtypes.tolist())
    print("Missing values:", df.isna().sum().sum())
    print()

--- DOMESTIC ---
Shape: (347, 6)
Columns: ['age', 'education', 'employment', 'income', 'marital_status', 'violence']
Data types: [dtype('int64'), CategoricalDtype(categories=['none', 'primary', 'secondary', 'tertiary'], ordered=False, categories_dtype=object), CategoricalDtype(categories=['employed', 'semi employed', 'unemployed'], ordered=False, categories_dtype=object), dtype('int64'), CategoricalDtype(categories=['married', 'unmarred'], ordered=False, categories_dtype=object), CategoricalDtype(categories=['no', 'yes'], ordered=False, categories_dtype=object)]
Missing values: 0

--- VAWG ---
Shape: (415, 19)
Columns: ['country', 'question', 'age_15_24', 'age_25_34', 'age_35_49', 'education_higher', 'education_no_education', 'education_primary', 'education_secondary', 'employment_employed_for_cash', 'employment_employed_for_kind', 'employment_unemployed', 'marital_status_married_or_living_together', 'marital_status_never_married', 'marital_status_widowed_divorced_separated', 'residenc

## Saving Processed Data

In [459]:
for name, df in datasets.items():
    file_path = PROCESSED_DIR / f"{name}.csv"
    save_df(df, file_path)

Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\domestic.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\vawg.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\freedom.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\danger.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\gdp.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\wage_gap.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\unemployment.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\gii.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\legal.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\partner_violence.csv
Saved to: C:\Users\black\Documents\Ironhack\final_project\data\processed\eq_laws.csv
