# Data Loading 

This notebook ingests all raw datasets, cleans and harmonizes them, merges them where needed, and saves the cleaned CSVs. 



## 0. Import Dependencies

In [26]:
# import libraries
import pandas as pd
import numpy as np
import os
from pathlib import Path
import kagglehub
import zipfile
import glob
import shutil
import re

In [None]:
# Data folders as relative string paths (one level above the working directory)
DATA_RAW, DATA_CLEAN, DATA_INTERIM = (
    os.path.join("..", "data", stage) for stage in ("raw", "clean", "interim")
)

In [3]:
# Fixed seed for NumPy's global random generator, so reruns give the same draws
SEED = 42
np.random.seed(seed=SEED)

## 1. File Paths

In [20]:
# Project folders.
# BASE_DIR is derived from the working directory (one level up) rather than a
# machine-specific absolute path, so the notebook is not tied to one computer.
# Set the PROJECT_DIR environment variable to point somewhere else.
BASE_DIR = Path(os.environ.get("PROJECT_DIR", "..")).resolve()
RAW_DIR = BASE_DIR / "data" / "raw"
CLEAN_DIR = BASE_DIR / "data" / "clean"

# Create both data folders up front; existing folders are left as they are
RAW_DIR.mkdir(parents=True, exist_ok=True)
CLEAN_DIR.mkdir(parents=True, exist_ok=True)

## 2. Helper Functions


In [41]:
def download_kagglehub_dataset(dataset_name, download_dir):
    """Download a KaggleHub dataset and copy its files into a local folder.

    The dataset is fetched with ``kagglehub.dataset_download``. Every file
    found under the returned location (searched recursively) is then copied
    into ``download_dir/<dataset_name with "/" replaced by "_">``. Sub-folders
    are flattened: only the file name is kept. Files that already exist at the
    destination are not copied again.

    Parameters
    ----------
    dataset_name : str
        KaggleHub dataset handle, e.g. ``"owner/dataset"``.
    download_dir : str or pathlib.Path
        Folder under which the per-dataset folder is created.

    Returns
    -------
    pathlib.Path or None
        The destination folder, or ``None`` when any step failed (the error
        is printed, not raised).
    """
    try:
        print(f"Downloading dataset: {dataset_name}")
        path = kagglehub.dataset_download(dataset_name)
        print("Dataset downloaded to:", path)

        # Path(...) lets callers pass a plain string as well as a Path object
        dest = Path(download_dir) / dataset_name.replace("/", "_")
        dest.mkdir(parents=True, exist_ok=True)

        # Copy (not move) every downloaded file into the destination folder
        source_path = Path(path)
        seen_names = set()
        for file in source_path.rglob("*"):
            if not file.is_file():
                continue

            # Because the copy is flat, two files with the same name in
            # different sub-folders would collide; report it instead of
            # silently dropping the second one.
            if file.name in seen_names:
                print(f"Skipping duplicate file name: {file.relative_to(source_path)}")
                continue
            seen_names.add(file.name)

            new_file = dest / file.name
            if not new_file.exists():
                print(f"Copying: {file.name}")
                shutil.copy2(file, new_file)

        print(f"Files moved to: {dest}")
        return dest

    except Exception as e:
        # Best-effort helper: report the problem and let the caller check for None
        print(f"Error downloading dataset {dataset_name}: {e}")
        return None

def unzip_files_in_folder(folder_path):
    """Extract every ZIP archive found directly inside a folder.

    Archives are extracted into the same folder. A failure on one archive is
    printed and does not stop the remaining archives from being processed.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Folder to scan (not recursive) for ``*.zip`` files.

    Returns
    -------
    None
    """
    # Path(...) lets callers pass a plain string as well as a Path object
    folder_path = Path(folder_path)

    # sorted(): glob order depends on the filesystem; a fixed order makes the
    # result repeatable when two archives contain a file with the same name.
    for z in sorted(folder_path.glob("*.zip")):
        print(f"Extracting: {z.name}")
        try:
            with zipfile.ZipFile(z, "r") as zip_ref:
                zip_ref.extractall(folder_path)
        except Exception as e:
            print(f"Error unzipping {z.name}: {e}")



def load_csv_from_folder(folder_path):
    """Load the first CSV file (by sorted file name) found directly in a folder.

    Several text encodings are tried in turn. If none of them works, the file
    is read as UTF-8 with undecodable bytes dropped.

    Parameters
    ----------
    folder_path : str or pathlib.Path
        Folder to scan (not recursive) for ``*.csv`` files.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or ``None`` when the folder has no CSV file or the
        file could not be read at all.
    """
    # sorted(): glob order depends on the filesystem, so "the first CSV" was
    # not well defined. Path(...) also accepts a plain string.
    csv_files = sorted(Path(folder_path).glob("*.csv"))

    if not csv_files:
        print(f"No CSV files found in: {folder_path}")
        return None

    if len(csv_files) > 1:
        print(f"Multiple CSV files found, loading the first one:\n{csv_files}")

    file_to_load = csv_files[0]
    print(f"Loading CSV: {file_to_load.name}")

    # NOTE(review): latin-1 maps every byte value, so it cannot raise a
    # UnicodeDecodeError. The entries after it are only reached when latin-1
    # fails for a non-decoding reason. Order kept as-is so existing files
    # decode exactly as before.
    encodings = ['utf-8', 'latin-1', 'iso-8859-1', 'cp1252']

    for encoding in encodings:
        try:
            print(f"  Trying {encoding}...")
            df = pd.read_csv(file_to_load, encoding=encoding)
            print(f"Success with {encoding}!")
            return df
        except UnicodeDecodeError:
            # Wrong encoding: silently move on to the next candidate
            continue
        except Exception as e:
            print(f"Error: {e}")
            continue

    # Last resort: read as UTF-8 and drop the bytes that cannot be decoded
    print("  All encodings failed. Loading with error='ignore'...")
    try:
        df = pd.read_csv(file_to_load, encoding='utf-8', encoding_errors='ignore')
        print("Loaded (some characters may be missing)")
        return df
    except Exception as e:
        print(f"Failed completely: {e}")
        return None


In [15]:
def standardize_columns(df):
    """Normalize column names: trimmed, lower-case, underscore-separated.

    Spaces and hyphens become underscores and every remaining character other
    than letters, digits and underscores is removed. The frame's columns are
    renamed in place and the same frame is returned.
    """
    names = df.columns.str.strip().str.lower()

    # Turn the usual word separators into underscores
    for separator in (" ", "-"):
        names = names.str.replace(separator, "_")

    # Drop whatever is left that is not a letter, digit or underscore
    names = names.str.replace(r"[^a-zA-Z0-9_]", "", regex=True)

    df.columns = names
    return df

In [16]:
def quick_clean(df):
    """Light cleaning: strip surrounding whitespace from string values.

    Only ``str`` values inside object-dtype columns are changed. Any other
    value in such a column (numbers, ``None``, NaN, ...) is kept as it is.
    The previous ``.str.strip()`` call turned those values into NaN.

    The frame is modified in place and the same frame is returned.
    """
    for col in df.select_dtypes(include="object"):
        stripped = [
            value.strip() if isinstance(value, str) else value
            for value in df[col]
        ]
        # dtype=object keeps the column's dtype; reusing df.index avoids any
        # re-alignment when the column is written back.
        df[col] = pd.Series(stripped, index=df.index, dtype=object)
    return df

In [17]:
def load_kagglehub_csv(dataset_name, raw_dir):
    """
    One-call loader for a KaggleHub dataset:
    1. Download the dataset into its own folder under raw_dir
    2. Extract any ZIP archives in that folder
    3. Load the first CSV found there
    4. Standardize column names and apply the quick cleaning

    Returns the cleaned DataFrame, or None if the download or the CSV
    loading step returned None.
    """
    dataset_dir = download_kagglehub_dataset(dataset_name, raw_dir)
    if dataset_dir is None:
        return None

    unzip_files_in_folder(dataset_dir)

    loaded = load_csv_from_folder(dataset_dir)
    if loaded is None:
        return None

    return quick_clean(standardize_columns(loaded))

In [18]:
def save_df(df, path):
    """Write a DataFrame to CSV (without the index) and print the location.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write.
    path : str, os.PathLike or file-like
        Target passed on to ``DataFrame.to_csv``. For path-like targets the
        parent folder is created first if it does not exist yet.
    """
    # Only filesystem paths have a parent folder; buffers are passed through
    if isinstance(path, (str, os.PathLike)):
        Path(path).parent.mkdir(parents=True, exist_ok=True)

    df.to_csv(path, index=False)
    print(f"Saved to: {path}")

## 2. Load all Data

### 2.1 Load & Quick Clean

In [28]:
# Domestic violence against women (single CSV: "Domestic violence.csv")
dv_df_raw = load_kagglehub_csv(
    dataset_name="fahmidachowdhury/domestic-violence-against-women",
    raw_dir=RAW_DIR,
)

Downloading dataset: fahmidachowdhury/domestic-violence-against-women
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\fahmidachowdhury\domestic-violence-against-women\versions\1
Copying: Domestic violence.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\fahmidachowdhury_domestic-violence-against-women
Loading CSV: Domestic violence.csv


In [29]:
# Violence against women & girls (single CSV)
vawg_df_raw = load_kagglehub_csv(
    dataset_name="whenamancodes/violence-against-women-girls",
    raw_dir=RAW_DIR,
)

Downloading dataset: whenamancodes/violence-against-women-girls
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\whenamancodes\violence-against-women-girls\versions\1
Copying: Violence Against Women  Girls Data.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\whenamancodes_violence-against-women-girls
Loading CSV: Violence Against Women  Girls Data.csv


In [30]:
# Human Freedom Index.
# NOTE: this dataset ships several CSVs (hfi_cc_2018 ... hfi_cc_2022) and
# load_kagglehub_csv only loads the first CSV it finds in the folder.
hfi_df_raw = load_kagglehub_csv(
    dataset_name="gsutters/the-human-freedom-index",
    raw_dir=RAW_DIR,
)

Downloading dataset: gsutters/the-human-freedom-index
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\gsutters\the-human-freedom-index\versions\5
Copying: hfi_cc_2018.csv
Copying: hfi_cc_2019.csv
Copying: hfi_cc_2020.csv
Copying: hfi_cc_2021.csv
Copying: hfi_cc_2022.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\gsutters_the-human-freedom-index
Multiple CSV files found, loading the first one:
[WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/gsutters_the-human-freedom-index/hfi_cc_2018.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/gsutters_the-human-freedom-index/hfi_cc_2019.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/gsutters_the-human-freedom-index/hfi_cc_2020.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/gsutters_the-human-freedom-index/hfi_cc_2021.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw

In [31]:
# Most dangerous countries for women 2024 (single CSV)
mdc_df_raw = load_kagglehub_csv(
    dataset_name="arpitsinghaiml/most-dangerous-countries-for-women-2024",
    raw_dir=RAW_DIR,
)

Downloading dataset: arpitsinghaiml/most-dangerous-countries-for-women-2024
Downloading from https://www.kaggle.com/api/v1/datasets/download/arpitsinghaiml/most-dangerous-countries-for-women-2024?dataset_version_number=1...


100%|██████████| 2.81k/2.81k [00:00<00:00, 1.36MB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\arpitsinghaiml\most-dangerous-countries-for-women-2024\versions\1
Copying: most-dangerous-countries-for-women-2024.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\arpitsinghaiml_most-dangerous-countries-for-women-2024
Loading CSV: most-dangerous-countries-for-women-2024.csv





In [32]:
# World GDP figures.
# NOTE: this dataset ships several CSVs (gdp, gdp_growth, gdp_per_capita, ...)
# and load_kagglehub_csv only loads the first CSV it finds in the folder.
gdp_df_raw = load_kagglehub_csv(
    dataset_name="zgrcemta/world-gdpgdp-gdp-per-capita-and-annual-growths",
    raw_dir=RAW_DIR,
)

Downloading dataset: zgrcemta/world-gdpgdp-gdp-per-capita-and-annual-growths
Downloading from https://www.kaggle.com/api/v1/datasets/download/zgrcemta/world-gdpgdp-gdp-per-capita-and-annual-growths?dataset_version_number=2...


100%|██████████| 556k/556k [00:00<00:00, 954kB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\zgrcemta\world-gdpgdp-gdp-per-capita-and-annual-growths\versions\2
Copying: gdp.csv
Copying: gdp_growth.csv
Copying: gdp_per_capita.csv
Copying: gdp_per_capita_growth.csv
Copying: gdp_ppp.csv
Copying: gdp_ppp_per_capita.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths
Multiple CSV files found, loading the first one:
[WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp_growth.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/zgrcemta_world-gdpgdp-gdp-per-capita-and-annual-growths/gdp_per_capita.csv'), WindowsPath('C:/Users/black/Documents/Ironhack/final_project/data/raw/zgrcemta_world




In [33]:
# Wage gap data (single CSV: "gap.csv")
gwg_df_raw = load_kagglehub_csv(
    dataset_name="mpwolke/cusersmarildownloadsgapcsv",
    raw_dir=RAW_DIR,
)

Downloading dataset: mpwolke/cusersmarildownloadsgapcsv
Downloading from https://www.kaggle.com/api/v1/datasets/download/mpwolke/cusersmarildownloadsgapcsv?dataset_version_number=1...


100%|██████████| 12.8k/12.8k [00:00<00:00, 5.13MB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\mpwolke\cusersmarildownloadsgapcsv\versions\1
Copying: gap.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\mpwolke_cusersmarildownloadsgapcsv
Loading CSV: gap.csv





In [34]:
# Global unemployment data (single CSV)
gud_df_raw = load_kagglehub_csv(
    dataset_name="sazidthe1/global-unemployment-data",
    raw_dir=RAW_DIR,
)

Downloading dataset: sazidthe1/global-unemployment-data
Downloading from https://www.kaggle.com/api/v1/datasets/download/sazidthe1/global-unemployment-data?dataset_version_number=1...


100%|██████████| 40.9k/40.9k [00:00<00:00, 449kB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\sazidthe1\global-unemployment-data\versions\1
Copying: global_unemployment_data.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\sazidthe1_global-unemployment-data
Loading CSV: global_unemployment_data.csv





In [42]:
# World educational data ("Global_Education.csv"; not valid UTF-8, the loader
# falls back to latin-1 for this file)
wed_df_raw = load_kagglehub_csv(
    dataset_name="nelgiriyewithana/world-educational-data",
    raw_dir=RAW_DIR,
)

Downloading dataset: nelgiriyewithana/world-educational-data
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\nelgiriyewithana\world-educational-data\versions\1
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\nelgiriyewithana_world-educational-data
Loading CSV: Global_Education.csv
  Trying utf-8...
  Trying latin-1...
Success with latin-1!


In [36]:
# Gender Inequality Index (single CSV)
gii_df_raw = load_kagglehub_csv(
    dataset_name="gianinamariapetrascu/gender-inequality-index",
    raw_dir=RAW_DIR,
)

Downloading dataset: gianinamariapetrascu/gender-inequality-index
Downloading from https://www.kaggle.com/api/v1/datasets/download/gianinamariapetrascu/gender-inequality-index?dataset_version_number=2...


100%|██████████| 5.10k/5.10k [00:00<00:00, 2.59MB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\gianinamariapetrascu\gender-inequality-index\versions\2
Copying: Gender_Inequality_Index.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\gianinamariapetrascu_gender-inequality-index
Loading CSV: Gender_Inequality_Index.csv





In [38]:
# Legal frameworks on gender equality within marriage and family (single CSV)
lf_df_raw = load_kagglehub_csv(
    dataset_name="willianoliveiragibin/legal-frameworks",
    raw_dir=RAW_DIR,
)

Downloading dataset: willianoliveiragibin/legal-frameworks
Downloading from https://www.kaggle.com/api/v1/datasets/download/willianoliveiragibin/legal-frameworks?dataset_version_number=1...


100%|██████████| 2.17k/2.17k [00:00<00:00, 705kB/s]

Extracting files...
Dataset downloaded to: C:\Users\black\.cache\kagglehub\datasets\willianoliveiragibin\legal-frameworks\versions\1
Copying: legal-frameworks-gender-equality-within-marriage-and-family new.csv
Files moved to: C:\Users\black\Documents\Ironhack\final_project\data\raw\willianoliveiragibin_legal-frameworks
Loading CSV: legal-frameworks-gender-equality-within-marriage-and-family new.csv





### 2.2 Sanity Check

In [43]:
# Every loaded dataset under a short label, for a quick overview
datasets = {
    "domestic": dv_df_raw,
    "vawg": vawg_df_raw,
    "freedom": hfi_df_raw,
    "danger": mdc_df_raw,
    "gdp": gdp_df_raw,
    "wage_gap": gwg_df_raw,
    "unemployment": gud_df_raw,
    "education": wed_df_raw,
    "gii": gii_df_raw,
    "legal": lf_df_raw
}

for name, df in datasets.items():
    print(f"--- {name.upper()} ---")
    if df is None:
        # load_kagglehub_csv returns None when the download or the CSV
        # loading failed; report it instead of crashing on df.shape.
        print("Not loaded (download or CSV loading failed)")
        print()
        continue
    print("Shape:", df.shape)
    print("Columns:", df.columns.tolist())
    print("Missing values:", df.isna().sum().sum())
    print()

--- DOMESTIC ---
Shape: (347, 7)
Columns: ['sl_no', 'age', 'education', 'employment', 'income', 'marital_status', 'violence']
Missing values: 0

--- VAWG ---
Shape: (12600, 8)
Columns: ['recordid', 'country', 'gender', 'demographics_question', 'demographics_response', 'question', 'survey_year', 'value']
Missing values: 1413

--- FREEDOM ---
Shape: (1458, 123)
Columns: ['year', 'iso_code', 'countries', 'region', 'pf_rol_procedural', 'pf_rol_civil', 'pf_rol_criminal', 'pf_rol', 'pf_ss_homicide', 'pf_ss_disappearances_disap', 'pf_ss_disappearances_violent', 'pf_ss_disappearances_organized', 'pf_ss_disappearances_fatalities', 'pf_ss_disappearances_injuries', 'pf_ss_disappearances', 'pf_ss_women_fgm', 'pf_ss_women_missing', 'pf_ss_women_inheritance_widows', 'pf_ss_women_inheritance_daughters', 'pf_ss_women_inheritance', 'pf_ss_women', 'pf_ss', 'pf_movement_domestic', 'pf_movement_foreign', 'pf_movement_women', 'pf_movement', 'pf_religion_estop_establish', 'pf_religion_estop_operate', 'pf_re

## 3. Select & Merge Data