In [1]:
import pandas as pd
import sys
import os

# Configuration: input path (relative to the notebook's working directory)
# and the missing-value fraction above which a column is discarded (80%).
csv_file = 'data/demographic.csv'
threshold = 0.8

# Fail fast with a real exception rather than sys.exit(): inside a Jupyter
# kernel sys.exit() merely raises SystemExit, which IPython reports as an
# opaque "use %tb" warning instead of a readable error.
if not os.path.exists(csv_file):
    raise FileNotFoundError(
        f"File '{csv_file}' not found (looked relative to '{os.getcwd()}')."
    )

# Read the CSV file
print("Reading the CSV file...")
df = pd.read_csv(csv_file)

# Treat empty strings as missing values. read_csv already parses empty
# fields as NaN by default, so this is only a safeguard for frames whose
# columns still carry literal '' values.
df = df.replace('', pd.NA)

# Share of missing cells per column, computed in one vectorized pass.
# A frame with no rows yields NaN fractions, which never exceed the
# threshold, so nothing is dropped and no division by zero occurs.
missing_fraction = df.isna().mean()
cols_to_drop = missing_fraction[missing_fraction > threshold].index.tolist()

print(f"Columns with more than {threshold * 100}% missing values: {cols_to_drop}")

# Drop the sparse columns (reassign instead of mutating in place)
if cols_to_drop:
    df = df.drop(columns=cols_to_drop)
    print(f"Dropped {len(cols_to_drop)} columns.")
else:
    print("No columns to drop.")

# Write the cleaned DataFrame back to the CSV file.
# NOTE(review): this overwrites the raw input, so the original columns are
# not recoverable from disk and a re-run of this cell sees the already
# cleaned file. Consider writing to a separate output path if the raw data
# must be preserved.
df.to_csv(csv_file, index=False)
print(f"Cleaned CSV saved back to '{csv_file}'.")

Reading the CSV file...
Columns with more than 80.0% missing values: ['RIDAGEMN', 'DMQADFC', 'DMDYRSUS', 'RIDEXPRG']
Dropped 4 columns.
Cleaned CSV saved back to 'data/demographic.csv'.
