In [13]:
import pandas as pd
import numpy as np

# Creating a hypothetical dataset with null and noisy data
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', np.nan, 'Bob'],
    'Age': [20, 22, np.nan, 23, 25, 22,],
    'TestScore': [85, 90, 88, 92, 120, 90]
}

df = pd.DataFrame(data)
print("Original Dataset:")
print(df)


Original Dataset:
      Name   Age  TestScore
0    Alice  20.0         85
1      Bob  22.0         90
2  Charlie   NaN         88
3    David  23.0         92
4      NaN  25.0        120
5      Bob  22.0         90


In [4]:
# Step 1: Find Null Values
null_values = df.isnull()
print("\nNull Values:")
print(null_values)



Null Values:
    Name    Age  TestScore
0  False  False      False
1  False  False      False
2  False   True      False
3  False  False      False
4   True  False      False
5  False  False      False


In [5]:
# Step 2: Remove Rows with Null Values
df_cleaned = df.dropna()
print("\nDataset after removing rows with null values:")
print(df_cleaned)


Dataset after removing rows with null values:
    Name   Age  TestScore
0  Alice  20.0         85
1    Bob  22.0         90
3  David  23.0         92
5    Bob  22.0         90


In [6]:
# Step 3: Identify and Handle Noisy Data (e.g., outliers in 'TestScore')
noisy_data_indices = df_cleaned[df_cleaned['TestScore'] > 100].index
df_cleaned.loc[noisy_data_indices, 'TestScore'] = np.nan
print("\nDataset after handling noisy data:")
print(df_cleaned)


Dataset after handling noisy data:
    Name   Age  TestScore
0  Alice  20.0       85.0
1    Bob  22.0       90.0
3  David  23.0       92.0
5    Bob  22.0       90.0


In [7]:
# Step 4: Impute Null Values (fill with mean, median, etc.)
df_cleaned['Age'].fillna(df_cleaned['Age'].mean(), inplace=True)
print("\nDataset after imputing null values in 'Age' column:")
print(df_cleaned)



Dataset after imputing null values in 'Age' column:
    Name   Age  TestScore
0  Alice  20.0       85.0
1    Bob  22.0       90.0
3  David  23.0       92.0
5    Bob  22.0       90.0


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Age'].fillna(df_cleaned['Age'].mean(), inplace=True)


In [14]:

# Step 4: Impute Null Values using Mean
mean_age = df_cleaned['Age'].mean()
df_cleaned['Age'].fillna(mean_age, inplace=True)

print("\nDataset after imputing null values in 'Age' column using mean:")
print(df_cleaned)







Dataset after imputing null values in 'Age' column using mean:
    Name   Age  TestScore
0  Alice  20.0       85.0
1    Bob  22.0       90.0
3  David  23.0       92.0
5    Bob  22.0       90.0


In [15]:
from sklearn.impute import SimpleImputer

# Assuming df_cleaned is your DataFrame after handling noisy data
# Creating a SimpleImputer instance with strategy='median'
imputer = SimpleImputer(strategy='median')

# Fitting the imputer on the 'Age' column
imputer.fit(df_cleaned[['Age']])

# Transforming and replacing null values with the median
df_cleaned['Age'] = imputer.transform(df_cleaned[['Age']])

print("\nDataset after imputing null values in 'Age' column using SimpleImputer:")
print(df_cleaned)



Dataset after imputing null values in 'Age' column using SimpleImputer:
    Name   Age  TestScore
0  Alice  20.0       85.0
1    Bob  22.0       90.0
3  David  23.0       92.0
5    Bob  22.0       90.0
