In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt



In [60]:
def calculate_statistics(data):
    if data.dtype in ['int32']:
        data_mean = int(data.mean())
        data_median = int(data.median())
        data_std_dev = float(data.std())
        return {
            'mean': data_mean,
            'median': data_median,
            'std_dev': data_std_dev,
        }
    else:
        return {
            'mode': data.mode()[0],
            'unique_count': data.nunique()
        }

In [None]:
# Load the dataset
df = pd.read_csv("./data/housing_modified.csv")

# Total missing values
total_missing_counts = int(df.isnull().sum().sum())
print(f"Total missing values: {total_missing_counts}")

# Get the number of columns with missing values
total_columns_with_missing_values = int(df.isnull().any().sum())
print(f"Total columns with missing values: {total_columns_with_missing_values}")

# Replace missing values with 0
df.fillna(0, inplace=True)


for column in df:
    statistics = calculate_statistics(df[column])
    print(f"Column: {column}")
    if 'mode' in statistics:
        print("Categorical variable")
        print(f"Mode: {statistics['mode']}")
        print(f"Unique count: {statistics['unique_count']}")
    else:
        print("Numerical variable")
        print(f"Mean: {statistics.get('mean', 'N/A')}")
        print(f"Median: {statistics.get('median', 'N/A')}")
        print(f"Standard Deviation: {statistics.get('std_dev', 'N/A')}")
    print("-" * 40)


Total missing values: 20847
Total columns with missing values: 2
Column: longitude
Categorical variable
Mode: -118.31
Unique count: 844
----------------------------------------
Column: latitude
Categorical variable
Mode: 34.06
Unique count: 862
----------------------------------------
Column: housing_median_age
Categorical variable
Mode: 52.0
Unique count: 52
----------------------------------------
Column: total_rooms
Categorical variable
Mode: 1527.0
Unique count: 5926
----------------------------------------
Column: net_gain
Categorical variable
Mode: 0.0
Unique count: 1
----------------------------------------
Column: total_bedrooms
Categorical variable
Mode: 0.0
Unique count: 1924
----------------------------------------
Column: population
Categorical variable
Mode: 891.0
Unique count: 3888
----------------------------------------
Column: households
Categorical variable
Mode: 306.0
Unique count: 1815
----------------------------------------
Column: median_income
Categorical variab