In [1]:
import pandas as pd

# Sample dataset with missing values and duplicates
data = {
    'CustomerID': [1, 2, 3, 4, 5, 5],
    'Name': ['Alice', 'Bob', 'Charlie', None, 'Eve', 'Eve'],
    'Age': [25, None, 35, 40, 28, 28],
    'Gender': ['Female', 'Male', None, 'Female', 'Male', 'Male'],
    'AnnualIncome': [50000, 60000, None, 80000, 75000, 75000]
}

df = pd.DataFrame(data)

# 1. Identify and count missing values
print("🔍 Missing Values Count:\n", df.isnull().sum(), "\n")

# 2. Handle missing values

# For numerical columns: fill with mean or median
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['AnnualIncome'].fillna(df['AnnualIncome'].median(), inplace=True)

# For categorical columns: fill with mode
df['Name'].fillna(df['Name'].mode()[0], inplace=True)
df['Gender'].fillna(df['Gender'].mode()[0], inplace=True)

print("✅ Missing values handled.\n")

# 3. Drop duplicate rows
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]

print(f"🧹 Dropped {before - after} duplicate row(s).")

# 4. Final cleaned dataset
print("\n🧼 Cleaned Dataset:\n", df)


🔍 Missing Values Count:
 CustomerID      0
Name            1
Age             1
Gender          1
AnnualIncome    1
dtype: int64 

✅ Missing values handled.

🧹 Dropped 1 duplicate row(s).

🧼 Cleaned Dataset:
    CustomerID     Name   Age  Gender  AnnualIncome
0           1    Alice  25.0  Female       50000.0
1           2      Bob  31.2    Male       60000.0
2           3  Charlie  35.0    Male       75000.0
3           4      Eve  40.0  Female       80000.0
4           5      Eve  28.0    Male       75000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['AnnualIncome'].fillna(df['AnnualIncome'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on whi