In [None]:
#Step-1: Import the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler, StandardScaler

#Step-2: Load the dataset
# Read the raw CSV into the working DataFrame used by every later step.
df = pd.read_csv(r'sample_data.csv')

#Step-3: Identifying the null-values
# Report how many missing entries each column contains.
null_counts = df.isna().sum()
print("Null values in each colum:\n", null_counts)

#Option-1: Replace null values with column mean for numeric values only
# Only numeric columns have a meaningful mean; other columns are left untouched.
numeric_cols = df.select_dtypes(include="number").columns
column_means = df[numeric_cols].mean()
df[numeric_cols] = df[numeric_cols].fillna(value=column_means)

#Step-4: Identify and handle empty values
# Count the cells in each column that hold an empty string.
empty_counts = df.eq("").sum()
print("Empty values in each column:\n", empty_counts)

#Option-2: Replace empty values with column median
# Turn empty strings into NaN so they can be treated like any other missing value.
df.replace(to_replace="", value=np.nan, inplace=True)

# Fill the remaining NaN values of numeric columns with that column's median.
# NOTE(review): numeric columns were already mean-filled in Step-3, so this
# presumably only matters if NaNs are still present in them - confirm intent.
numeric_cols = df.select_dtypes(include="number").columns
column_medians = df[numeric_cols].median()
df[numeric_cols] = df[numeric_cols].fillna(value=column_medians)

#Step-5: Identify the incorrect timestamps and handle them
# NOTE: 'time_stamp' is a placeholder; change it to the actual timestamp column name.
if "time_stamp" in df.columns:
    # Convert to datetime; values that cannot be parsed become NaT (errors="coerce").
    df["time_stamp"] = pd.to_datetime(df["time_stamp"], errors="coerce")
    # Count the timestamps that are missing or could not be parsed.
    incorrect_timestamps = df["time_stamp"].isnull().sum()
    print(f"Number of incorrect timestamps:{incorrect_timestamps}")
    # Drop the rows with incorrect timestamps. The subset must name the same
    # column that was parsed above; a label that is not in the DataFrame
    # makes dropna raise KeyError.
    df.dropna(subset=["time_stamp"], inplace=True)

#Step-6: Remove the duplicates
# Drop fully identical rows in place, keeping the first occurrence of each.
df.drop_duplicates(keep="first", inplace=True)
shape_after_dedup = df.shape
print("Data after removing duplicates:", shape_after_dedup)

#Step-7: Data Normalization(MinMaxScaling)
scaler = MinMaxScaler()
# 'numeric_column' is a placeholder; change it to a relevant numeric column.
# The guard must test the same column name that is scaled below.
if "numeric_column" in df.columns:
    # fit_transform expects 2-D data, so pass the one-column DataFrame
    # (double brackets), not a list containing the column's name.
    df[["numeric_column"]] = scaler.fit_transform(df[["numeric_column"]])

#Step-8: Data Standarization(Standard Scaling)
std_scaler = StandardScaler()
# 'numeric_column' is a placeholder; change it to a relevant numeric column.
if "numeric_column" in df.columns:
    # Fit on the one-column DataFrame and write the scaled values back.
    standardized_values = std_scaler.fit_transform(df[["numeric_column"]])
    df[["numeric_column"]] = standardized_values

#Step-9: Save the cleaned dataset
# Keep the path in one place so the confirmation message always names the
# file that was actually written.
output_path = r'cleaned_data.csv'
# index=False leaves the DataFrame index out of the CSV.
df.to_csv(output_path, index=False)
print(f"Cleaned dataset saved as '{output_path}'.")

#Step-10: Visualize the cleaned dataset
# Example: histogram showing the distribution of one numeric column.
# 'numeric_column' is a placeholder; change it to a relevant column.
if "numeric_column" in df.columns:
    histogram_values = df["numeric_column"]
    plt.hist(histogram_values, bins=20, color="skyblue")
    plt.xlabel("Value")
    plt.ylabel("Frequency")
    plt.title("Distribution of Numeric Column")
    plt.show()

# Example: scatter plot of one numeric column against another.
# 'numeric_column1' and 'numeric_column2' are placeholders; change them to
# the two columns to compare.
if all(name in df.columns for name in ("numeric_column1", "numeric_column2")):
    x_values = df["numeric_column1"]
    y_values = df["numeric_column2"]
    plt.scatter(x_values, y_values, color="blue")
    plt.xlabel("Column 1")
    plt.ylabel("Column 2")
    plt.title("Scatter Plot of Numeric Columns")
    plt.show()

