In [1]:
####################################################
import pandas as pd

# Load the CSV file into a DataFrame.
input_file = "economists_v2.csv"
df = pd.read_csv(input_file)

# Keep an untouched snapshot for the before/after size report. Copying the
# frame avoids parsing the file from disk a second time and guarantees both
# frames come from the same read.
df_original = df.copy()

# Function to clean columns with similar issues
def clean_column(df, column_name):
    """Normalize a list-like text column of ``df`` in place.

    For each string value the surrounding square brackets are stripped,
    single quotes are removed, commas are replaced with semicolons, and
    leading/trailing whitespace is trimmed. Missing values stay missing.

    The call is a no-op when ``column_name`` is not a column of ``df`` or
    when the column does not hold text.

    Args:
        df: DataFrame to modify in place.
        column_name: Name of the column to clean.
    """
    if column_name not in df.columns:
        return

    try:
        # The .str accessor raises AttributeError for non-text columns (for
        # example a column that is entirely empty in the CSV is loaded as
        # float64). Such a column has nothing to clean, so skip it instead of
        # aborting the whole cleaning run.
        text = df[column_name].str
    except AttributeError:
        return

    df[column_name] = (
        text.strip("[]")  # Remove square brackets
        # regex=False: these are literal characters, not patterns, and the
        # default for ``regex`` differs between pandas versions.
        .str.replace("'", "", regex=False)  # Remove single quotes
        .str.replace(",", ";", regex=False)  # Commas -> semicolons for clarity
        .str.strip()  # Remove extra spaces
    )

# Columns to clean
columns_to_clean = [
    'education',
    'alma_mater',
    'influences',
    'notable_ideas',
    'contributions',
    'doctoral_advisors',
    'doctoral_students',
    'image_url',
]

# Apply the cleaning function to each column
for col in columns_to_clean:
    clean_column(df, col)

# Remove parentheses from the 'died' column
if 'died' in df.columns:
    df['died'] = df['died'].str.replace(r"[()]", "", regex=True)


def _is_blank(series):
    """Return a boolean mask that is True where ``series`` is null or whitespace-only."""
    # astype(str) lets the whitespace test run on non-text columns as well;
    # genuinely missing values are caught by isna().
    return series.isna() | (series.astype(str).str.strip() == "")


# Keep only rows whose 'name' is neither null nor blank
df = df[~_is_blank(df['name'])]

# Remove rows where both 'born' and 'died' are empty. Blank strings count as
# empty too: stripping the parentheses above turns a value such as "()" into
# "", which a null-only check would let through.
if 'born' in df.columns and 'died' in df.columns:
    df = df[~(_is_blank(df['born']) & _is_blank(df['died']))]

# Report how many rows survived the filtering
print(f"Original dataset size: {len(df_original)}")
print(f"Filtered dataset size: {len(df)}")

# Save the cleaned DataFrame to a new CSV file
output_file = "economists_v2_cleaned.csv"
df.to_csv(output_file, index=False, encoding="utf-8")
print(f"Cleaned data saved to {output_file}")


Original dataset size: 1348
Filtered dataset size: 990
Cleaned data saved to economists_v2_cleaned.csv
