In [6]:
import pandas as pd
import numpy as np

print("--- STEP 1: GENERATING RAW MESSY DATA ---")

# We create a messy dataset with typos, duplicates, and missing values
raw_data = {
    'Country': ['zimbabwe', 'Zambia', 'South Africa', 'YEMEN', 'Venezuela', 'zimbabwe', 'France'],
    'Independence_Year': [1980, 1964, 1910, np.nan, 1811, 1980, 843],
    'GDP_String': ['$26 Billion', '$23 Billion', '$351 Billion', '$21 Billion', '$482 Billion', '$26 Billion', '$2.9 Trillion'],
    'Population': [14862927, 18383955, 59308690, 29825968, 28435943, 14862927, 65273511]
}

df = pd.DataFrame(raw_data)
display(df)

print("\n--- STEP 2: CLEANING THE DATA ---")

# 1. Drop Duplicates
df_clean = df.drop_duplicates()
print("✓ Removed duplicates")

# 2. Fix Capitalization (zimbabwe -> Zimbabwe)
df_clean['Country'] = df_clean['Country'].str.title()
print("✓ Fixed country names")

# 3. Fill Missing Values (NaN -> 0)
df_clean['Independence_Year'] = df_clean['Independence_Year'].fillna(0).astype(int)
print("✓ Filled missing dates")

# 4. Clean Currency (Remove symbols to make it math-ready)
df_clean['GDP_Clean'] = df_clean['GDP_String'].str.replace('$', '').str.replace(' Billion', '').str.replace(' Trillion', '')
print("✓ Cleaned currency formatting")

print("\n--- FINAL CLEAN DATASET ---")
display(df_clean)






--- STEP 1: GENERATING RAW MESSY DATA ---


Unnamed: 0,Country,Independence_Year,GDP_String,Population
0,zimbabwe,1980.0,$26 Billion,14862927
1,Zambia,1964.0,$23 Billion,18383955
2,South Africa,1910.0,$351 Billion,59308690
3,YEMEN,,$21 Billion,29825968
4,Venezuela,1811.0,$482 Billion,28435943
5,zimbabwe,1980.0,$26 Billion,14862927
6,France,843.0,$2.9 Trillion,65273511



--- STEP 2: CLEANING THE DATA ---
✓ Removed duplicates
✓ Fixed country names
✓ Filled missing dates
✓ Cleaned currency formatting

--- FINAL CLEAN DATASET ---


Unnamed: 0,Country,Independence_Year,GDP_String,Population,GDP_Clean
0,Zimbabwe,1980,$26 Billion,14862927,26.0
1,Zambia,1964,$23 Billion,18383955,23.0
2,South Africa,1910,$351 Billion,59308690,351.0
3,Yemen,0,$21 Billion,29825968,21.0
4,Venezuela,1811,$482 Billion,28435943,482.0
6,France,843,$2.9 Trillion,65273511,2.9
