## Data Cleaning Pipeline for Object-Type Columns
This Python code demonstrates a data preprocessing pipeline for object-type columns in a Pandas DataFrame. It covers techniques to handle missing values using mode imputation, detect and manage outliers using z-score, standardize data format by converting strings to uppercase, and drop rows/columns with excessive missingness.

In [1]:
import pandas as pd
import numpy as np

In [15]:
file = pd.read_json("electronics.json")
df = file.apply(lambda x: x.str.strip()).replace('', 0)
print(df.isna().sum())

Customer_ID                      0
Age                              0
Gender                           0
Income_Level                     0
Address                          0
Transaction_ID                   0
Purchase_Date                    0
Product_ID                       0
Product_Category                 0
Brand                            0
Purchase_Amount                  0
Average_Spending_Per_Purchase    0
Purchase_Frequency_Per_Month     0
Brand_Affinity_Score             0
Product_Category_Preferences     0
Month                            0
Year                             0
Season                           0
dtype: int64


In [16]:
import pandas as pd
import numpy as np
from scipy.stats import zscore

# Assuming 'df' is your DataFrame

# Handling Missing Values
for col in df.select_dtypes(include='object').columns:
    if df[col].isnull().sum() > 0:
        # Fill missing values with mean or median
        if df[col].dtype == 'object':
            df[col].fillna(df[col].mode()[0], inplace=True)  # Filling with mode for object-type columns

# Analyzing Outliers
for col in df.select_dtypes(include='object').columns:
    if df[col].dtype != 'object':
        z_scores = zscore(df[col])
        outliers = (np.abs(z_scores) > 3)  # Change threshold as needed
        df.loc[outliers, col] = np.nan  # Replace outliers with NaN

# Handling inconsistencies in data format and encoding - example for uppercase
for col in df.select_dtypes(include='object').columns:
    if df[col].dtype == 'object':
        df[col] = df[col].str.upper()  # Convert to uppercase

# Dropping rows or columns with excessive missingness
threshold = 0.3  # Example threshold for missing values
df.dropna(axis=1, thresh=len(df) * threshold, inplace=True)  # Drop columns with more than 30% missing values

# Dropping rows with missing values
df.dropna(axis=0, inplace=True)  # Drop rows with any missing values

# Displaying the cleaned DataFrame
print(df.head())


                            Customer_ID Age  Gender Income_Level  \
0  B81EE6C9-2AE4-48A7-B283-220EAA244F43  40  FEMALE       MEDIUM   
3  878DCCBA-893A-48F9-8D34-6ED394FA3C9C  38  FEMALE       MEDIUM   
4  0AF0BD81-73CC-494E-AA5E-75C6D0B6D743  68   OTHER       MEDIUM   
5  5A4AC4CE-1E09-4ECB-805C-FB676F101385  26   OTHER         HIGH   
7  80A5D367-B3DC-4C18-8A93-584E7C5E7E29  23  FEMALE         HIGH   

                                             Address  \
0  43548 MURRAY ISLANDS SUITE 974\nAMYBERG, CT 13457   
3  02998 HALL MEADOWS SUITE 809\nNORTH ROBERTVILL...   
4  21411 TIMOTHY FORD APT. 320\nDAVISBOROUGH, AR ...   
5  843 JOHN KNOLL SUITE 876\nRODRIGUEZMOUTH, MO 3...   
7  203 TIFFANY HILL SUITE 971\nPORT JOSETOWN, VI ...   

                         Transaction_ID Purchase_Date  \
0  C6A6C712-E36B-406A-BFDE-F53BDCF4744F    2022-04-26   
3  3CFAFA02-6B34-4D77-9E05-D223DFAB64E8    2022-12-03   
4  0D8DC27A-0C8F-4A82-B57E-8BF54CEE9759    2020-06-08   
5  5B0D927B-A342-4DF1-8846

In [19]:
hidden_rows_mask = df.apply(lambda row: row.astype(str).str.contains('HIDDEN')).any(axis=1)

# Filter out the rows containing 'HIDDEN' and keep the rest
df = df[~hidden_rows_mask]

# Display the DataFrame after dropping rows containing 'HIDDEN'
print(df.head())
df.to_csv("Example_file.csv", sep=',')

                            Customer_ID Age  Gender Income_Level  \
0  B81EE6C9-2AE4-48A7-B283-220EAA244F43  40  FEMALE       MEDIUM   
3  878DCCBA-893A-48F9-8D34-6ED394FA3C9C  38  FEMALE       MEDIUM   
4  0AF0BD81-73CC-494E-AA5E-75C6D0B6D743  68   OTHER       MEDIUM   
5  5A4AC4CE-1E09-4ECB-805C-FB676F101385  26   OTHER         HIGH   
7  80A5D367-B3DC-4C18-8A93-584E7C5E7E29  23  FEMALE         HIGH   

                                             Address  \
0  43548 MURRAY ISLANDS SUITE 974\nAMYBERG, CT 13457   
3  02998 HALL MEADOWS SUITE 809\nNORTH ROBERTVILL...   
4  21411 TIMOTHY FORD APT. 320\nDAVISBOROUGH, AR ...   
5  843 JOHN KNOLL SUITE 876\nRODRIGUEZMOUTH, MO 3...   
7  203 TIFFANY HILL SUITE 971\nPORT JOSETOWN, VI ...   

                         Transaction_ID Purchase_Date  \
0  C6A6C712-E36B-406A-BFDE-F53BDCF4744F    2022-04-26   
3  3CFAFA02-6B34-4D77-9E05-D223DFAB64E8    2022-12-03   
4  0D8DC27A-0C8F-4A82-B57E-8BF54CEE9759    2020-06-08   
5  5B0D927B-A342-4DF1-8846