In [1]:
# 📚 Imports
import re

import pandas as pd

In [3]:
# 📂 Input and output files for this step (paths are relative to the working directory)
input_path = "../../data/cleaning/step7_revol_util_corrected.parquet"
output_path = "../../data/cleaning/step8_change_title.parquet"

# 📥 Load the input parquet file into a DataFrame
df = pd.read_parquet(input_path)

# 🧠 Job-title classification

# Extracts runs of letters/digits so that short acronyms can be compared
# against whole words rather than arbitrary substrings.
_WORD_RE = re.compile(r"[^\W_]+")

# Ordered rules: (category, substring keywords, whole-word keywords).
# The first rule with a hit wins, so the order of this table is the priority
# order between categories.
# Whole-word keywords are the short acronyms that, as plain substrings, used
# to capture unrelated titles ('it' in "auditor", 'att' in "attorney",
# 'coo' in "cook"/"coordinator", 'ups' in "groups", 'hr' in "three").
_JOB_RULES = (
    ('education',
     ('teacher', 'professor', 'educator', 'instructor', 'principal', 'trainer',
      'paraprofessional', 'librarian'),
     ()),
    ('health',
     ('nurse', 'doctor', 'health', 'medical', 'therapist', 'pharmacist', 'cna',
      'lpn', 'physician', 'dental hygienist', 'paramedic', 'caregiver', 'emt',
      'lvn', 'dentist', 'kaiser permanente', 'phlebotomist', 'specialist'),
     ()),
    ('transport',
     ('driver', 'transport', 'truck', 'transit', 'dispatcher', 'carrier',
      'delivery', 'usps', 'postal', 'conductor', 'courier', 'logistics',
      'shipping'),
     ('ups',)),
    ('tech',
     ('engineer', 'developer', 'software', 'technician', 'tech', 'machinist',
      'maintenance', 'programmer', 'assembler', 'lineman', 'scientist',
      'chemist', 'planner', 'inspector', 'installer', 'estimator'),
     ('it',)),
    ('management',
     ('manager', 'director', 'ceo', 'supervisor', 'executive', 'vp',
      'president', 'foreman', 'superintendent', 'cfo', 'lead', 'partner',
      'management', 'operations'),
     ('coo',)),
    ('public_service',
     ('police', 'fire', 'military', 'army', 'officer', 'correctional',
      'sergeant', 'deputy', 'sheriff', 'usaf', 'navy', 'captain', 'law',
      'department defense', 'lieutenant', 'detective', 'nypd', 'government',
      'state california', 'federal government', 'dept veterans affairs',
      'coast guard', 'social worker', 'air force'),
     ()),
    ('retail',
     ('retail', 'sales', 'store', 'customer service', 'cashier', 'server',
      'associate', 'walmart', 'csr', 'clerk', 'bartender', 'agent', 'buyer',
      'teller', 'real estate', 'realtor', 'dealer', 'target', 'best buy',
      'lowes', 'costco', 'macys', 'home depot', 'the home depot', 'verizon',
      'verizon wireless'),
     ('att',)),
    ('construction',
     ('construction', 'contractor', 'electrician', 'mechanic', 'operator',
      'laborer', 'custodian', 'carpenter', 'assembler', 'painter', 'plumber'),
     ()),
    ('finance',
     ('accountant', 'finance', 'bank', 'analyst', 'auditor', 'paralegal',
      'attorney', 'controller', 'financial advisor', 'bookkeeper',
      'accounting', 'accounts payable', 'accounts receivable',
      'billing specialist', 'loan processor', 'broker', 'morgan chase', 'cpa',
      'chase', 'wells fargo'),
     ()),
    ('administration',
     ('admin', 'assistant', 'administrator', 'secretary', 'receptionist',
      'coordinator', 'clerical', 'staff', 'processor', 'mailhandler',
      'housekeeper', 'housekeeping'),
     ()),
    ('consulting',
     ('consultant', 'business', 'owner', 'counselor', 'advisor', 'consulting',
      'human resources'),
     ('hr',)),
    ('creative',
     ('designer', 'marketing', 'communications', 'producer', 'stylist'),
     ()),
    ('manual_work',
     ('warehouse', 'production', 'inspector', 'cook', 'chef',
      'material handler', 'stocker', 'assembly'),
     ()),
    ('religious', ('pastor',), ()),
    ('student', ('student',), ()),
    ('unemployed', ('unemployed',), ()),
    ('unknown', ('unknown',), ()),
)


def classify_job(title):
    """Map a free-text job title to a coarse job category.

    Matching is case-insensitive. Categories are tried in the order of
    ``_JOB_RULES`` and the first one with a matching keyword is returned.
    Most keywords match anywhere inside the title; the short acronyms
    'it', 'hr', 'att', 'coo' and 'ups' must appear as a standalone word, so
    that e.g. "attorney" reaches 'finance' instead of being caught by 'att'.

    Args:
        title: The job title as a string, or a missing value (None / NaN).

    Returns:
        The category label as a string: 'unknown' for missing or blank
        titles, 'other' when no keyword matches.
    """
    if pd.isnull(title) or title.strip() == "":
        return 'unknown'
    title = title.lower()
    # Set of standalone words, used only for the whole-word acronyms.
    words = set(_WORD_RE.findall(title))

    for category, substrings, whole_words in _JOB_RULES:
        if any(key in title for key in substrings):
            return category
        if not words.isdisjoint(whole_words):
            return category
    return 'other'

# 🧼 Normalise the raw titles (trim + lower-case), then classify each one
normalized_titles = df['emp_title'].str.strip().str.lower()
df['emp_title_clean'] = normalized_titles
df['job_category'] = df['emp_title_clean'].apply(classify_job)

# Integer-encode the category labels via a pandas categorical
job_categories = df['job_category'].astype('category')
df['job_category_encoded'] = job_categories.cat.codes

# 💾 Write the result without the index, then report where it was saved
df.to_parquet(output_path, index=False)
print(f"✅ Fichier sauvegardé dans : {output_path}")

✅ Fichier sauvegardé dans : ../../data/cleaning/step8_change_title.parquet
