In [11]:
import os
import pandas as pd
import numpy as np

# Set project root to ml_project directory.
# The final path component is compared exactly: a plain suffix test
# (endswith('src')) would also fire for directories such as 'my_src'
# and move the working directory when it should not.
if os.path.basename(os.getcwd()) == 'src':
    os.chdir('..')

project_root = os.getcwd()
print(f"Working directory: {project_root}")

# Load datasets (paths are relative to the working directory set above)
main_df = pd.read_csv("data/learn_dataset.csv")
sport_df = pd.read_csv("data/learn_dataset_sport.csv")
job_df = pd.read_csv("data/learn_dataset_job.csv")
job_security_df = pd.read_csv("data/learn_dataset_JOB_SECURITY.csv")
retired_former_df = pd.read_csv("data/learn_dataset_retired_former.csv")
retired_jobs_df = pd.read_csv("data/learn_dataset_retired_jobs.csv")
retired_pension_df = pd.read_csv("data/learn_dataset_retired_pension.csv")

# Summarise every loaded frame as "<name><shape>" on a single line
loaded_frames = [
    ("main_df", main_df),
    ("sport_df", sport_df),
    ("job_df", job_df),
    ("job_security_df", job_security_df),
    ("retired_former_df", retired_former_df),
    ("retired_jobs_df", retired_jobs_df),
    ("retired_pension_df", retired_pension_df),
]
shape_summary = ", ".join(f"{name}{frame.shape}" for name, frame in loaded_frames)
print(f"✓ Datasets loaded: {shape_summary}")

Working directory: /Users/arthur/Documents/Universite/M2-QEA/Machine Learning/final_project/ml_project
✓ Datasets loaded: main_df(50044, 10), sport_df(6460, 2), job_df(19336, 11), job_security_df(24224, 2), retired_former_df(13176, 4), retired_jobs_df(11226, 11), retired_pension_df(11226, 2)


In [12]:
# Function to merge and combine overlapping columns
def merge_and_combine(left_df, right_df, on='primary_key'):
    """Left-join ``right_df`` onto ``left_df`` and coalesce shared columns.

    Columns that exist in both frames (other than the join key) are not
    duplicated: the value from ``left_df`` is kept where it is non-null,
    and gaps are filled with the matching value from ``right_df``.

    Parameters
    ----------
    left_df : pandas.DataFrame
        Frame whose rows are all preserved (left join).
    right_df : pandas.DataFrame
        Frame providing additional columns and fill-in values.
    on : str, default 'primary_key'
        Name of the join column.

    Returns
    -------
    pandas.DataFrame
        A new merged frame; neither input is modified.
    """
    # Names present on both sides, join key excluded
    shared = [name for name in left_df.columns
              if name in right_df.columns and name != on]

    # Right-hand copies of shared columns get a '_new' suffix
    combined = left_df.merge(right_df, how='left', on=on, suffixes=('', '_new'))

    for name in shared:
        incoming = f'{name}_new'
        if name not in combined.columns or incoming not in combined.columns:
            continue
        # Prefer the existing value; fall back to the right-hand one
        combined[name] = combined[name].fillna(combined[incoming])
        combined = combined.drop(columns=incoming)

    return combined

# Merge all dataframes to main_df using primary_key
print(f"Initial main_df shape: {main_df.shape}")

# Auxiliary tables, joined onto main_df one at a time in this order.
# The label is the variable name echoed in the progress message.
auxiliary_tables = [
    ("sport_df", sport_df),                      # sport data
    ("job_df", job_df),                          # job data
    ("job_security_df", job_security_df),        # job security data
    ("retired_former_df", retired_former_df),    # retired former data
    ("retired_jobs_df", retired_jobs_df),        # retired jobs data
    ("retired_pension_df", retired_pension_df),  # retired pension data
]

for table_label, table in auxiliary_tables:
    main_df = merge_and_combine(main_df, table)
    print(f"After merging {table_label}: {main_df.shape}")

print(f"\n✓ Final merged dataframe shape: {main_df.shape}")
print(f"✓ Total columns: {len(main_df.columns)}")

Initial main_df shape: (50044, 10)
After merging sport_df: (50044, 11)
After merging job_df: (50044, 21)
After merging job_security_df: (50044, 22)
After merging retired_former_df: (50044, 25)
After merging retired_jobs_df: (50044, 26)
After merging retired_pension_df: (50044, 27)

✓ Final merged dataframe shape: (50044, 27)
✓ Total columns: 27


In [None]:
# One-hot encode categorical variables with less than 50 distinct values
categorical_cols = main_df.select_dtypes(include=['object', 'category']).columns

# Distinct-value count per categorical column, computed once
# (nunique() leaves NaN out of the count by default)
distinct_counts = main_df[categorical_cols].nunique()

# Low-cardinality columns are encoded; the rest are removed
cols_to_encode = [name for name in categorical_cols if distinct_counts[name] < 50]
cols_to_drop = [name for name in categorical_cols if distinct_counts[name] >= 50]

print(f"Columns to encode ({len(cols_to_encode)}): {cols_to_encode}")
print(f"Columns to drop ({len(cols_to_drop)}): {cols_to_drop}")

# Drop the high-cardinality columns, then one-hot encode what remains.
# NOTE(review): with drop_first=True and the default dummy_na=False, rows
# holding NaN in an encoded column get all-zero dummies, the same pattern as
# the dropped reference level - confirm this is acceptable for the model.
main_df = pd.get_dummies(
    main_df.drop(columns=cols_to_drop),
    columns=cols_to_encode,
    drop_first=True,
)

AttributeError: 'list' object has no attribute 'tolist'

In [None]:
# Persist the current main_df as a pickle file under data/.
# Pickle files should only ever be loaded back from a trusted location.
main_df.to_pickle("data/merged_learn_dataset.pkl")