In [13]:
import pandas as pd

def replace_outliers_iqr(df, column_name):
    q1 = df[column_name].quantile(0.25)
    q3 = df[column_name].quantile(0.75)
    iqr = q3 - q1

    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr

    df[column_name] = df[column_name].apply(lambda x: x if lower_bound <= x <= upper_bound else float('nan'))

    return df

def replace_outliers_zscore(df, column_name):
    z_scores = (df[column_name] - df[column_name].mean()) / df[column_name].std()

    df[column_name] = df[column_name].apply(lambda x: x if -3 <= x <= 3 else float('nan'))

    return df


In [17]:
import pandas as pd


df = pd.read_csv('boston.csv')

print("Original Dataset:")
print(df.head())

column_name = 'CRIM'  

df_iqr = replace_outliers_iqr(df.copy(), column_name)


df_zscore = replace_outliers_zscore(df.copy(), column_name)


print("\nModified Dataset - IQR:")
print(df_iqr.head())

print("\nModified Dataset - Z-Score:")
print(df_zscore.head())


Original Dataset:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222.0   
4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222.0   

   PTRATIO       B  LSTAT  MEDV  
0     15.3  396.90   4.98  24.0  
1     17.8  396.90   9.14  21.6  
2     17.8  392.83   4.03  34.7  
3     18.7  394.63   2.94  33.4  
4     18.7  396.90   5.33  36.2  

Modified Dataset - IQR:
      CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296.0   
1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242.0   
2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242.0   
3  0.03237   0.0   2.1

In [18]:
print("\nNaN Values in Modified Dataset - IQR:")
print(df_iqr[column_name].isna().sum())

print("\nNaN Values in Modified Dataset - Z-Score:")
print(df_zscore[column_name].isna().sum())



NaN Values in Modified Dataset - IQR:
66

NaN Values in Modified Dataset - Z-Score:
133


In [None]:
#Functions:

#replace_outliers_iqr: Marks extreme values in a column as NaN based on quartiles and interquartile range.
#replace_outliers_zscore: Flags outliers in a column as NaN using Z-Scores.
#Usage:

#Load dataset with pd.read_csv('boston.csv').
#Apply functions to a chosen column, like replace_outliers_iqr(df, 'your_column').
#Scenarios:

#NaNs keep the dataset intact, useful when outlier info is valuable.
#Provides flexibility for downstream processes.