In [1]:
import pandas as pd

In [2]:
# Load the dataset from a CSV file (path is relative to the notebook's working directory)
df = pd.read_csv("../data/covid_toy.csv")

In [5]:
def get_data_summary(df: pd.DataFrame) -> pd.DataFrame:
    """
    Generate a structured, column-wise overview of a dataset.

    The result has one row per column of ``df`` and contains:
    - Data type of each column
    - Number of records, plus number and percentage of missing values
    - Number and percentage of unique values
    - Most frequent value (first row of ``df.mode()``)
    - Summary statistics min/max/mean/median/std (NaN for non-numeric columns)
    - First three records, transposed (to inspect sample values)

    The statistic columns are always present, even when ``df`` has no
    numeric columns, so callers can rely on a stable set of column names.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset to summarise. It is not modified.

    Returns
    -------
    pd.DataFrame
        Summary table indexed by the column names of ``df``.
    """
    stat_names = ["min", "max", "mean", "median", "std"]
    n_records = df.shape[0]

    # Get column-wise metadata
    missing_vals = df.isna().sum()
    unique_vals = df.nunique()

    # df.mode() has no rows when df is empty or entirely NaN; indexing row 0
    # unconditionally would raise IndexError, so fall back to "no mode".
    mode_table = df.mode()
    if len(mode_table):
        mode_vals = mode_table.iloc[0]
    else:
        mode_vals = pd.Series(index=df.columns, dtype="object")

    summary_df = pd.DataFrame({
        "Data Type": df.dtypes,
        "# Records": n_records,
        "# Missing": missing_vals,
        "% Missing": (missing_vals / n_records) * 100,
        "# Unique": unique_vals,
        "% Unique": (unique_vals / n_records) * 100,
        "Most Frequent Value": mode_vals,
    })

    # Merge numerical statistics only for numeric columns.
    # Aggregating a frame with zero columns raises ValueError, so when there
    # is nothing numeric we add the statistic columns as all-NaN instead.
    numeric_cols = df.select_dtypes(include="number")
    if numeric_cols.shape[1]:
        num_summary = numeric_cols.agg(stat_names).T
        summary_df = summary_df.merge(
            num_summary, left_index=True, right_index=True, how="left"
        )
    else:
        summary_df = summary_df.assign(
            **{name: float("nan") for name in stat_names}
        )

    # Append first few records to inspect data values
    preview = df.head(3).T  # Transpose so each original column becomes a row
    summary_df = pd.concat([summary_df, preview], axis=1)

    return summary_df

# Render the summary table with an orange gradient on the indicator columns
gradient_columns = ["# Unique", "std", "median", "mean", "max", "min", "% Missing"]
styled_summary = get_data_summary(df).style.background_gradient(
    subset=gradient_columns,
    cmap="Oranges",
)
display(styled_summary)


Unnamed: 0,Data Type,# Records,# Missing,% Missing,# Unique,% Unique,Most Frequent Value,min,max,mean,median,std,0,1,2
age,int64,100,0,0.0,55,55.0,19,5.0,84.0,44.22,45.0,24.878931,60,27,42
gender,object,100,0,0.0,2,2.0,Female,,,,,,Male,Male,Male
fever,float64,100,10,10.0,7,7.0,98.000000,98.0,104.0,100.844444,101.0,2.054926,103.000000,100.000000,101.000000
cough,object,100,0,0.0,2,2.0,Mild,,,,,,Mild,Mild,Mild
city,object,100,0,0.0,4,4.0,Kolkata,,,,,,Kolkata,Delhi,Delhi
has_covid,object,100,0,0.0,2,2.0,No,,,,,,No,Yes,No
