In [145]:
from pathlib import Path

import pandas as pd

def createDataSet(file_path: str = 'data/electronics.json') -> 'pd.DataFrame | None':
    """Load a JSON file into a pandas DataFrame.

    Parameters
    ----------
    file_path : str, optional
        Location of the JSON file to read. Defaults to
        'data/electronics.json', the path this notebook has always used, so
        calling ``createDataSet()`` with no arguments behaves as before.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when the file is missing or cannot be
        parsed. In the failure case a diagnostic message is printed instead
        of an exception being raised, so callers must check for ``None``.
    """
    try:
        return pd.read_json(file_path)
    except FileNotFoundError:
        print(f"File not found: {file_path}")
        return None
    except Exception as e:
        # Deliberately broad: any parsing problem is reported and turned into
        # a None result rather than propagated to the caller.
        print(f"Error reading JSON file: {e}")
        return None
    
# Load the raw data. createDataSet() reports problems by printing a message
# and returning None, so stop here with a clear error instead of letting a
# later cell fail on a None frame.
df = createDataSet()
if df is None:
    raise RuntimeError("Dataset could not be loaded; see the message printed above.")

In [146]:
# Strings that are treated as missing-value placeholders throughout the notebook
invalid_values = ['', 'Hidden']

# Columns whose placeholders are filled in from the preceding row
columns = ['Year', 'Month', 'Age', 'Gender']

# Turn placeholders into missing values, then forward-fill them. Leading rows
# with no earlier valid value stay missing. `.ffill()` is used instead of
# `fillna(method='ffill')`, which is deprecated in recent pandas releases.
df[columns] = df[columns].replace(invalid_values, pd.NA).ffill()

In [138]:
# Remove rows that have no purchase date. Assigning a dropna()'d Series back
# to the column re-aligns it on the frame's index and restores the NaNs, so
# the rows have to be dropped from the frame itself.
df.dropna(subset=['Purchase_Date'], inplace=True)

In [139]:
# Parse the purchase dates; values that cannot be parsed become NaT
parsed_dates = pd.to_datetime(df['Purchase_Date'], errors='coerce')
df['Purchase_Date'] = parsed_dates

# Discard, in place, every row left without a valid purchase date
df.dropna(subset=['Purchase_Date'], inplace=True)

In [140]:
# Flag rows where any of the product identifier columns holds a placeholder value
mask = df[['Product_ID', 'Product_Category', 'Brand']].isin(invalid_values).any(axis=1)

# Keep only the unflagged rows. The explicit copy makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning or write into a slice of the old frame.
df = df[~mask].copy()

In [141]:
# Columns to convert to numeric values
columns = [
    'Purchase_Amount',
    'Average_Spending_Per_Purchase',
    'Purchase_Frequency_Per_Month',
    'Brand_Affinity_Score',
]

# Blank out the placeholder strings, then coerce each column to a numeric
# dtype; anything that still cannot be parsed becomes NaN.
coerced = (
    df[columns]
    .replace(invalid_values, pd.NA)
    .apply(pd.to_numeric, errors='coerce')
)

# Per-column medians of the coerced values (missing entries are skipped)
medians = coerced.median()

# Write the columns back with every gap filled by its column's median
df[columns] = coerced.fillna(medians)

In [142]:
# Summary statistics of the numeric columns, as a sanity check on the cleaned frame
df.describe()

Unnamed: 0,Age,Purchase_Amount,Average_Spending_Per_Purchase,Purchase_Frequency_Per_Month,Brand_Affinity_Score
count,801.0,801.0,801.0,801.0,801.0
mean,49.594257,248.775281,52.204744,5.461923,5.337079
std,18.281108,137.820715,26.688444,2.771527,2.77556
min,18.0,10.0,5.0,1.0,1.0
25%,34.0,135.0,30.0,3.0,3.0
50%,50.0,239.0,50.0,5.0,5.0
75%,66.0,367.0,74.0,8.0,8.0
max,80.0,500.0,100.0,10.0,10.0


In [143]:
# Median purchase amount in the frame as it stands at this point
df['Purchase_Amount'].median()

239.0

In [147]:
# Write the cleaned frame to disk. to_csv does not create missing parent
# directories, so make sure the output folder exists before writing.
output_path = Path('cleaned/data.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)