In [20]:
import pandas as pd
import numpy as np
from datetime import datetime
import os


os.makedirs('data', exist_ok=True)
os.makedirs('outputs', exist_ok=True)

In [21]:
data_path = 'data/cleaned_data.csv'
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,NACCID,NACCADC,PACKET,FORMVER,VISITMO,VISITDAY,VISITYR,NACCVNUM,NACCAVST,NACCNVST,...,INLIVWTH,NACCFAM,NACCMOM,NACCFADM,NACCAM,NACCFFTD,NACCFM,NACCBMI,NACCUDSD,DEMENTED
0,NACC002909,186,I,3.0,12,28,2022,1,2,2,...,1.0,1.0,0.0,0,9.0,0,9.0,32.4,3,0
1,NACC002909,186,F,3.0,1,23,2024,2,2,2,...,1.0,1.0,0.0,0,9.0,0,9.0,30.7,3,0
2,NACC003487,186,I,3.0,11,15,2023,1,1,1,...,1.0,0.0,0.0,0,9.0,0,9.0,23.7,1,0
3,NACC004352,186,I,3.0,10,5,2021,1,1,1,...,,,,0,,0,,888.8,4,1
4,NACC004687,186,I,3.0,11,14,2022,1,1,1,...,0.0,9.0,0.0,0,9.0,0,9.0,19.0,1,0


Identify Target Variable

In [22]:
TARGET_CANDIDATES = [
    'DEMENTED',     # Binary: 0=Normal, 1=Demented (most direct)
    'NORMCOG',      # Binary: 1=Normal cognition
    'NACCALZD',     # Alzheimer's disease diagnosis
    'NACCALZP',     # Probable AD
    'CDRGLOB',      # CDR Global (can convert: 0=normal, >=0.5=impaired)
    'NACCCOGF',     # Cognitive status
]

TARGET = None
print(" Searching for target variable...\n")

for candidate in TARGET_CANDIDATES:
    if candidate in df.columns:
        print(f"✓ Found: {candidate}")
        print(f"  Value counts: {df[candidate].value_counts().to_dict()}")
        print(f"  Missing: {df[candidate].isna().sum()} ({df[candidate].isna().sum()/len(df)*100:.1f}%)")
        print()
        
        if TARGET is None and candidate == 'DEMENTED':
            TARGET = candidate

# If DEMENTED not found, use first available
if TARGET is None:
    for candidate in TARGET_CANDIDATES:
        if candidate in df.columns:
            TARGET = candidate
            break

# If still not found, try creating from CDRGLOB
if TARGET is None and 'CDRGLOB' in df.columns:
    print("⚙️  Creating binary target from CDRGLOB")
    print("   0 = Normal cognition")
    print("   0.5+ = Cognitive impairment/dementia")
    df['DEMENTIA_BINARY'] = (df['CDRGLOB'] >= 0.5).astype(int)
    TARGET = 'DEMENTIA_BINARY'

if TARGET is None:
    print("  Could not find or create target variable!")
    print("   Available columns that might be targets:")
    possible = [c for c in df.columns if any(x in c.upper() for x in ['DEM', 'COG', 'CDR', 'DIAG'])]
    for p in possible[:20]:
        print(f"     - {p}")
else:
    print(f" Selected target variable: {TARGET}")
    print(f"   Distribution: {df[TARGET].value_counts().to_dict()}")

 Searching for target variable...

✓ Found: DEMENTED
  Value counts: {0: 137606, 1: 57590}
  Missing: 0 (0.0%)

 Selected target variable: DEMENTED
   Distribution: {0: 137606, 1: 57590}


Separate Features and Target

In [23]:
X = df.drop(columns=[TARGET])
y = df[TARGET]

print(f" Target: {TARGET}")
print(f"   Features: {len(X.columns)}")
print(f"   Samples: {len(X):,}")

 Target: DEMENTED
   Features: 43
   Samples: 195,196


Create Age-Based Features

In [24]:
if 'NACCAGE' in X.columns:

    X['age_squared'] = X['NACCAGE'] ** 2
    X['age_cubed'] = X['NACCAGE'] ** 3
    

    X['age_group_65'] = (X['NACCAGE'] >= 65).astype(int)
    X['age_group_75'] = (X['NACCAGE'] >= 75).astype(int)
    X['age_group_85'] = (X['NACCAGE'] >= 85).astype(int)
    
    print("Age features created: 5")
    print("   - age_squared")
    print("   - age_cubed")
    print("   - age_group_65, age_group_75, age_group_85")
else:
    print("⚠️  NACCAGE not found, skipping age features")

Age features created: 5
   - age_squared
   - age_cubed
   - age_group_65, age_group_75, age_group_85


Create Education Features

In [25]:

if 'EDUC' in X.columns:

    X['low_education'] = (X['EDUC'] < 12).astype(int)  # Less than high school
    X['high_education'] = (X['EDUC'] >= 16).astype(int)  # College graduate+
    
    print("Education features created: 2")
    print("   - low_education (<12 years)")
    print("   - high_education (>=16 years)")
    
    # Age-Education interactions
    if 'NACCAGE' in X.columns:
        X['age_edu_interaction'] = X['NACCAGE'] * X['EDUC']
        X['age_edu_ratio'] = X['NACCAGE'] / (X['EDUC'] + 1)
        print("\n Age-Education interactions: 2")
        print("   - age_edu_interaction")
        print("   - age_edu_ratio")
else:
    print("⚠️  EDUC not found, skipping education features")

Education features created: 2
   - low_education (<12 years)
   - high_education (>=16 years)

 Age-Education interactions: 2
   - age_edu_interaction
   - age_edu_ratio


Create Social Isolation Score

In [26]:
social_isolation = 0
factors = 0

# Not married
if 'MARISTAT' in X.columns:
    social_isolation += (X['MARISTAT'] != 1).astype(int)
    factors += 1
    print("    Using MARISTAT (marital status)")

# Lives alone
if 'NACCLIVS' in X.columns:
    social_isolation += (X['NACCLIVS'] == 1).astype(int)
    factors += 1
    print("    Using NACCLIVS (living situation)")

# No live-in companion
if 'INLIVWTH' in X.columns:
    social_isolation += (X['INLIVWTH'] == 0).astype(int)
    factors += 1
    print("    Using INLIVWTH (informant lives with)")

if factors > 0 and isinstance(social_isolation, pd.Series):
    X['social_isolation_score'] = social_isolation
    print(f"\n Social isolation score created (based on {factors} factors)")
    print(f"   Distribution: {X['social_isolation_score'].value_counts().to_dict()}")
else:
    print("  Cannot create social isolation score (no relevant features)")

    Using MARISTAT (marital status)
    Using NACCLIVS (living situation)
    Using INLIVWTH (informant lives with)

 Social isolation score created (based on 3 factors)
   Distribution: {0: 100287, 3: 43724, 1: 32710, 2: 18475}


Create Family Risk Score

In [27]:
family_risk = 0
factors = 0


if 'NACCMOM' in X.columns:
    family_risk += (X['NACCMOM'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCMOM (mother's dementia)")

if 'NACCDAG' in X.columns:
    family_risk += (X['NACCDAG'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCDAG (father's dementia)")
elif 'NACCFAD' in X.columns:
    family_risk += (X['NACCFAD'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCFAD (father's dementia)")


if 'NACCFAM' in X.columns:
    family_risk += (X['NACCFAM'] == 1).astype(int)
    factors += 1
    print("   ✓ Using NACCFAM (family history)")

if factors > 0 and isinstance(family_risk, pd.Series):
    X['family_risk_score'] = family_risk
    X['has_family_history'] = (family_risk > 0).astype(int)
    print(f"\n Family risk features created (based on {factors} factors)")
    print(f"   family_risk_score distribution: {X['family_risk_score'].value_counts().to_dict()}")
    print(f"   has_family_history: {X['has_family_history'].value_counts().to_dict()}")
else:
    print("⚠️  Cannot create family risk score (no relevant features)")

   ✓ Using NACCMOM (mother's dementia)
   ✓ Using NACCFAM (family history)

 Family risk features created (based on 2 factors)
   family_risk_score distribution: {0: 85416, 2: 70779, 1: 39001}
   has_family_history: {1: 109780, 0: 85416}


Create Temporal Features

In [28]:
if 'NACCDAYS' in X.columns:
    X['years_in_study'] = X['NACCDAYS'] / 365.25
    X['months_in_study'] = X['NACCDAYS'] / 30.44
    print(" Temporal features created: 2")
    print("   - years_in_study")
    print("   - months_in_study")
    
    if 'NACCVNUM' in X.columns:
        X['avg_days_between_visits'] = X['NACCDAYS'] / (X['NACCVNUM'] + 1)
        print("\n Visit frequency feature created:")
        print("   - avg_days_between_visits")
else:
    print("  NACCDAYS not found, skipping temporal features")

 Temporal features created: 2
   - years_in_study
   - months_in_study

 Visit frequency feature created:
   - avg_days_between_visits


Feature Engineering Summary

In [29]:
original_features = len(df.columns) - 1  
new_features_count = len(X.columns) - original_features



print(f"Original features: {original_features}")
print(f"New features created: {new_features_count}")
print(f"Total features: {len(X.columns)}")
print(f"Samples: {len(X):,}")


Original features: 43
New features created: 15
Total features: 58
Samples: 195,196


In [30]:
df_engineered = pd.concat([X, y], axis=1)
output_path = 'data/engineered_data.csv'
df_engineered.to_csv(output_path, index=False)

In [32]:
NON_MEDICAL_FEATURES = {
'visit_info': [
        'NACCID',       # Subject ID
        'NACCADC',      # ADC at which subject was seen
        'PACKET',       # Packet code (I=Initial, F=Follow-up, T=Telephone)
        'FORMVER',      # Form version
        'VISITMO',      # Visit month
        'VISITDAY',     # Visit day
        'VISITYR',      # Visit year
        'NACCVNUM',     # Visit number (order)
        'NACCAVST',     # Total number of all UDS visits
        'NACCNVST',     # Total number of in-person visits
        'NACCDAYS',     # Days from initial to most recent visit
        'NACCFDYS',     # Days from initial to each follow-up
    ],
    'subject_demographics': [
        # Birth & Age
        'BIRTHMO',      # Birth month
        'BIRTHYR',      # Birth year
        'NACCAGE',      # Age at visit *
        'NACCAGEB',     # Age at baseline/initial visit
        
        # Sex & Gender
        'SEX',          # Sex (1=Male, 2=Female)
        
        # Race & Ethnicity
        'HISPANIC',     # Hispanic/Latino ethnicity
        'HISPOR',       # Hispanic origin
        'HISPORX',      # Hispanic origin, other (specify)
        'RACE',         # Primary race
        'RACEX',        # Race, other (specify)
        'RACESEC',      # Second race
        'RACESECX',     # Second race, other
        'RACETER',      # Third race
        'RACETERX',     # Third race, other
        'NACCNIHR',     # Derived NIH race category
        
        # Language
        'PRIMLANG',     # Primary language
        'PRIMLANX',     # Primary language, other
         'EDUC', 
         # Marital & Living Situation *
        'MARISTAT',     # Marital status (1=Married, 2=Widowed, etc.)
        'NACCLIVS',     # Living situation (derived)
        'INDEPEND',     # Level of independence
        'RESIDENC',     # Type of residence
        
        # Other
        'HANDED',       # Handedness (1=Left, 2=Right, 3=Ambidextrous)
        'NACCREAS',     # Primary reason for coming to ADC (derived)
        'NACCREFR',     # Principal referral source (derived)
    ],
    'coparticipant_demographics': [
        'INBIRMO',      # Co-participant birth month
        'INBIRYR',      # Co-participant birth year
        'INSEX',        # Co-participant sex
        'NEWINF',       # Is this a new co-participant?
        
        # Co-participant Race/Ethnicity
        'INHISP',       # Hispanic/Latino
        'INHISPOR',     # Hispanic origin
        'INHISPOX',     # Hispanic origin, other
        'INRACE',       # Race
        'INRACEX',      # Race, other
        'INRASEC',      # Second race
        'INRASECX',     # Second race, other
        'INRATER',      # Third race
        'INRATERX',     # Third race, other
        
        # Co-participant Education & Relationship
        'INEDUC',       # Years of education
        'INRELTO',      # Relationship to subject 
        'INRELTOX',     # Relationship, other
        'INKNOWN',      # How long known subject (months)
        'INLIVWTH',     # Lives with subject? 
    ],        # Years of education (0-36)
    'family_history': [
        # General Family History
        'NACCFAM',      # Any first-degree family member with cognitive impairment 
        'NACCMOM',      # Mother with cognitive impairment 
        'NACCDAG',      # Father with cognitive impairment 
        # Note: Some datasets might use 'NACCFAD' instead of 'NACCDAG'
        
        # Genetic Mutations (Alzheimer's)
        'NACCFADM',     # Evidence of dominantly inherited AD mutation
        'NACCAM',       # Evidence for AD mutation (specific genes)
        'NACCAMX',      # AD mutation, other
        'NACCAMS',      # Source of evidence for AD mutation
        'NACCAMSX',     # Source, other
        
        # Genetic Mutations (FTLD)
        'NACCFFTD',     # Evidence for FTLD mutation
        'NACCFM',       # Evidence for FTLD mutation (specific)
        'NACCFMX',      # FTLD mutation, other
        'NACCFMS',      # Source of evidence
        'NACCFMSX',     # Source, other
    ],
    
    'nacc_derived': [
        'NACCBMI',      # Body Mass Index (derived, could argue borderline)
        'NACCUDSD',     # UDS version
    ],
}
# Flatten the dictionary to get all non-medical features
ALL_NON_MEDICAL = []
for category, features in NON_MEDICAL_FEATURES.items():
    ALL_NON_MEDICAL.extend(features)

print(f" Defined {len(ALL_NON_MEDICAL)} potential non-medical features across {len(NON_MEDICAL_FEATURES)} categories")
print("\n Features by category:")
for category, features in NON_MEDICAL_FEATURES.items():
    print(f"  - {category}: {len(features)} features")

 Defined 70 potential non-medical features across 5 categories

 Features by category:
  - visit_info: 12 features
  - subject_demographics: 25 features
  - coparticipant_demographics: 18 features
  - family_history: 13 features
  - nacc_derived: 2 features


In [33]:
available_features = [f for f in ALL_NON_MEDICAL if f in df.columns]
missing_features = [f for f in ALL_NON_MEDICAL if f not in df.columns]

print(f" Available non-medical features: {len(available_features)}/{len(ALL_NON_MEDICAL)}")
print(f"Missing features: {len(missing_features)}")

if missing_features:
    print(f"\n⚠️  Missing features (first 20):")
    for feat in missing_features[:20]:
        print(f"     - {feat}")

print(f"\n Available features:")
for i, feat in enumerate(available_features, 1):
    print(f"{i:3d}. {feat}")

 Available non-medical features: 43/70
Missing features: 27

⚠️  Missing features (first 20):
     - HISPOR
     - HISPORX
     - RACEX
     - RACESEC
     - RACESECX
     - RACETER
     - RACETERX
     - PRIMLANX
     - INHISP
     - INHISPOR
     - INHISPOX
     - INRACE
     - INRACEX
     - INRASEC
     - INRASECX
     - INRATER
     - INRATERX
     - INEDUC
     - INRELTOX
     - INKNOWN

 Available features:
  1. NACCID
  2. NACCADC
  3. PACKET
  4. FORMVER
  5. VISITMO
  6. VISITDAY
  7. VISITYR
  8. NACCVNUM
  9. NACCAVST
 10. NACCNVST
 11. NACCDAYS
 12. NACCFDYS
 13. BIRTHMO
 14. BIRTHYR
 15. NACCAGE
 16. NACCAGEB
 17. SEX
 18. HISPANIC
 19. RACE
 20. NACCNIHR
 21. PRIMLANG
 22. EDUC
 23. MARISTAT
 24. NACCLIVS
 25. INDEPEND
 26. RESIDENC
 27. HANDED
 28. NACCREAS
 29. NACCREFR
 30. INBIRMO
 31. INBIRYR
 32. INSEX
 33. NEWINF
 34. INRELTO
 35. INLIVWTH
 36. NACCFAM
 37. NACCMOM
 38. NACCFADM
 39. NACCAM
 40. NACCFFTD
 41. NACCFM
 42. NACCBMI
 43. NACCUDSD


identify target variable

In [34]:
TARGET_CANDIDATES = [
    'DEMENTED',     # Binary: 0=Normal, 1=Demented (most direct)
    'NORMCOG',      # Binary: 1=Normal cognition
    'NACCALZD',     # Alzheimer's disease diagnosis
    'NACCALZP',     # Probable AD
    'CDRGLOB',      # CDR Global (can convert: 0=normal, >=0.5=impaired)
    'NACCCOGF',     # Cognitive status
]

TARGET = None
print(" Searching for target variable...\n")

for candidate in TARGET_CANDIDATES:
    if candidate in df.columns:
        print(f"✓ Found: {candidate}")
        print(f"  Value counts: {df[candidate].value_counts().to_dict()}")
        print(f"  Missing: {df[candidate].isna().sum()} ({df[candidate].isna().sum()/len(df)*100:.1f}%)")
        print()
        
        if TARGET is None and candidate == 'DEMENTED':
            TARGET = candidate

# If DEMENTED not found, use first available
if TARGET is None:
    for candidate in TARGET_CANDIDATES:
        if candidate in df.columns:
            TARGET = candidate
            break

# If still not found, try creating from CDRGLOB
if TARGET is None and 'CDRGLOB' in df.columns:
    print("⚙️  Creating binary target from CDRGLOB")
    print("   0 = Normal cognition")
    print("   0.5+ = Cognitive impairment/dementia")
    df['DEMENTIA_BINARY'] = (df['CDRGLOB'] >= 0.5).astype(int)
    TARGET = 'DEMENTIA_BINARY'

if TARGET is None:
    print("  Could not find or create target variable!")
    print("   Available columns that might be targets:")
    possible = [c for c in df.columns if any(x in c.upper() for x in ['DEM', 'COG', 'CDR', 'DIAG'])]
    for p in possible[:20]:
        print(f"     - {p}")
else:
    print(f" Selected target variable: {TARGET}")
    print(f"   Distribution: {df[TARGET].value_counts().to_dict()}")

 Searching for target variable...

✓ Found: DEMENTED
  Value counts: {0: 137606, 1: 57590}
  Missing: 0 (0.0%)

 Selected target variable: DEMENTED
   Distribution: {0: 137606, 1: 57590}


creating warking dataset

In [36]:
df_work = df[available_features + [TARGET]].copy()

# Remove rows where target is missing
initial_count = len(df_work)
df_work = df_work[df_work[TARGET].notna()]
removed_count = initial_count - len(df_work)

print(f" Working dataset created!")
print(f"   - Initial rows: {initial_count:,}")
print(f"   - Removed (missing target): {removed_count:,}")
print(f"   - Final rows: {len(df_work):,}")
print(f"   - Features: {len(available_features)}")
print(f"   - Target: {TARGET}")
print(f"\n   Target distribution:")
for value, count in df_work[TARGET].value_counts().items():
    pct = count / len(df_work) * 100
    print(f"     {value}: {count:,} ({pct:.1f}%)")

 Working dataset created!
   - Initial rows: 195,196
   - Removed (missing target): 0
   - Final rows: 195,196
   - Features: 43
   - Target: DEMENTED

   Target distribution:
     0: 137,606 (70.5%)
     1: 57,590 (29.5%)


handle NACC special   codes

In [37]:
MISSING_CODES = [-4, 88, 888, 8888, 99, 999, 9999]

print(" Handling NACC special codes...")
print("   Converting to NaN: {MISSING_CODES}\n")

# Count occurrences before replacement
codes_found = {}
for code in MISSING_CODES:
    count = (df_work == code).sum().sum()
    if count > 0:
        codes_found[code] = count

if codes_found:
    print("   Special codes found:")
    for code, count in sorted(codes_found.items()):
        print(f"     {code}: {count:,} occurrences")
else:
    print("   No special codes found")

# Replace with NaN (except in target)
for col in df_work.columns:
    if col != TARGET:
        df_work[col] = df_work[col].replace(MISSING_CODES, np.nan)

print("\n Special codes converted to NaN")

 Handling NACC special codes...
   Converting to NaN: {MISSING_CODES}

   No special codes found

 Special codes converted to NaN


missing values analysis

In [31]:
missing_stats = pd.DataFrame({
    'Column': df_work.columns,
    'Missing_Count': df_work.isnull().sum(),
    'Missing_Percentage': (df_work.isnull().sum() / len(df_work) * 100).round(2),
    'Dtype': df_work.dtypes
}).sort_values('Missing_Percentage', ascending=False)
total_cells = df_work.shape[0] * df_work.shape[1]
missing_cells = df_work.isnull().sum().sum()
missing_pct = (missing_cells / total_cells * 100)

missing_stats[missing_stats['Missing_Count'] > 0]

Unnamed: 0,Column,Missing_Count,Missing_Percentage,Dtype
NACCFM,NACCFM,61950,31.74,float64
NACCAM,NACCAM,61950,31.74,float64
NEWINF,NEWINF,58060,29.74,float64
NACCBMI,NACCBMI,25911,13.27,float64
INBIRMO,INBIRMO,14061,7.2,float64
INBIRYR,INBIRYR,13835,7.09,float64
INLIVWTH,INLIVWTH,8288,4.25,float64
INSEX,INSEX,8288,4.25,float64
INRELTO,INRELTO,8288,4.25,float64
NACCAGE,NACCAGE,3596,1.84,float64


 Remove High-Missing Features


In [33]:
MISSING_THRESHOLD = 50.0

high_missing = missing_stats[missing_stats['Missing_Percentage'] > MISSING_THRESHOLD]
cols_to_remove = high_missing['Column'].tolist()

# Don't remove target
if TARGET in cols_to_remove:
    cols_to_remove.remove(TARGET)

if cols_to_remove:
    print(f"Removing {len(cols_to_remove)} features with >{MISSING_THRESHOLD}% missing:")
    for col in cols_to_remove:
        pct = high_missing[high_missing['Column'] == col]['Missing_Percentage'].values[0]
        print(f"  - {col}: {pct:.1f}% missing")
    
    df_work = df_work.drop(columns=cols_to_remove)
    print(f"\n Removed {len(cols_to_remove)} high-missing features")
else:
    print(f" No features with >{MISSING_THRESHOLD}% missing")

print(f"\nRemaining features: {df_work.shape[1] - 1}  (+ target)")

 No features with >50.0% missing

Remaining features: 43  (+ target)


Remove Duplicates

In [34]:

initial_rows = len(df_work)
duplicates = df_work.duplicated().sum()

print(f"Duplicate rows found: {duplicates}")

if duplicates > 0:
    print(f"  Removing {duplicates} duplicate rows...")
    df_work = df_work.drop_duplicates()
    print(f"  Removed {duplicates} duplicates")
else:
    print(f"  No duplicates found")

print(f"\nRows after duplicate removal: {len(df_work):,}")

Duplicate rows found: 0
  No duplicates found

Rows after duplicate removal: 195,196


Data Type Optimization

In [35]:
memory_before = df_work.memory_usage(deep=True).sum() / 1024**2
print(f"Memory before optimization: {memory_before:.2f} MB")

# Convert float64 to float32 where possible
float_cols = df_work.select_dtypes(include=['float64']).columns
for col in float_cols:
    if col != TARGET:
        df_work[col] = df_work[col].astype('float32')

# Convert int64 to int32 where possible
int_cols = df_work.select_dtypes(include=['int64']).columns
for col in int_cols:
    if col != TARGET:
        # Check if values fit in int32
        if df_work[col].max() < 2147483647 and df_work[col].min() > -2147483648:
            df_work[col] = df_work[col].astype('int32')

memory_after = df_work.memory_usage(deep=True).sum() / 1024**2
memory_saved = memory_before - memory_after
memory_saved_pct = (memory_saved / memory_before * 100)

Memory before optimization: 85.82 MB


In [36]:
summary = {
    'original_rows': len(df),
    'original_columns': len(df.columns),
    'cleaned_rows': len(df_work),
    'cleaned_columns': len(df_work.columns),
    'rows_removed': len(df) - len(df_work),
    'features_removed': len(cols_to_remove) if cols_to_remove else 0,
    'special_codes_converted': total_special if codes_found else 0,
    'duplicates_removed': duplicates,
    'final_missing_percentage': round((df_work.isnull().sum().sum() / (df_work.shape[0] * df_work.shape[1]) * 100), 2),
    'memory_saved_mb': round(memory_saved, 2),
    'target_variable': TARGET
}

In [37]:
output_path = 'data/cleaned_data.csv'
df_work.to_csv(output_path, index=False)