In [None]:
# ======================================================
# 1. IMPORTS
# ======================================================
# These are the required libraries for data processing, visualization,
# and machine learning preprocessing.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer


In [None]:
# ======================================================
# 2. CLEANING FUNCTION
# ======================================================
# This function:
# - Drops unwanted columns
# - Removes columns with more than 60% missing values
# - Fills numeric missing values with median
# - Fills categorical missing values with mode
# - Removes rows that contain numeric outliers (IQR rule)
# The goal is to prepare a clean dataset for analysis.

def clean_df(df, columns_to_drop=None, max_missing_frac=0.60, iqr_factor=1.5):
    """Return a cleaned copy of ``df``; the input frame is never modified.

    Steps, in order:

    1. Drop the columns named in ``columns_to_drop`` (names that are absent
       are ignored).
    2. Drop every column whose share of missing values is strictly greater
       than ``max_missing_frac``.
    3. Fill remaining gaps: the column median for ``float64``/``int64``
       columns, the most frequent value for every other column
       (``"unknown"`` if the column has no mode at all).
    4. For each ``float64``/``int64`` column in turn, keep only the rows whose
       value lies inside ``[Q1 - iqr_factor * IQR, Q3 + iqr_factor * IQR]``.
       The quartiles are recomputed on the rows that survived the previous
       columns, so the result depends on column order.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw input frame.
    columns_to_drop : str or iterable of str, optional
        Columns removed up front. Defaults to
        ``["number_of_reviews_ltm", "reviews_per_month"]``.
    max_missing_frac : float, default 0.60
        Largest tolerated fraction of missing values per column (0..1).
    iqr_factor : float, default 1.5
        Multiplier applied to the inter-quartile range when building the
        outlier fences.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Row labels of surviving rows are preserved (the
        index is not reset).

    Raises
    ------
    ValueError
        If ``max_missing_frac`` is outside ``[0, 1]``.

    Notes
    -----
    When more than half of a column's values are identical the IQR is 0, the
    fences collapse onto that value, and every differing row is removed.
    Inspect the output shape after cleaning.
    """
    if columns_to_drop is None:
        columns_to_drop = ["number_of_reviews_ltm", "reviews_per_month"]
    elif isinstance(columns_to_drop, str):
        # A bare string would otherwise be iterated character by character.
        columns_to_drop = [columns_to_drop]
    if not 0.0 <= max_missing_frac <= 1.0:
        raise ValueError("max_missing_frac must be between 0 and 1")

    df = df.copy()
    df = df.drop(columns=list(columns_to_drop), errors="ignore")

    # Drop overly sparse columns. An empty frame has no defined missing share
    # (0/0), so it keeps all of its columns.
    if len(df) > 0:
        missing_share = df.isna().mean()
        too_sparse = missing_share[missing_share > max_missing_frac].index
        df = df.drop(columns=too_sparse)

    # Fill missing values: median for numeric columns, mode for the rest.
    for col in df.columns:
        if df[col].dtype in ["float64", "int64"]:
            df[col] = df[col].fillna(df[col].median())
        else:
            mode_vals = df[col].mode()
            df[col] = df[col].fillna(mode_vals[0] if len(mode_vals) else "unknown")

    # Remove outlier rows, one numeric column at a time.
    numeric_cols = df.select_dtypes(include=["float64", "int64"]).columns
    for col in numeric_cols:
        q1, q3 = df[col].quantile(0.25), df[col].quantile(0.75)
        spread = q3 - q1
        lower, upper = q1 - iqr_factor * spread, q3 + iqr_factor * spread
        # Series.between is inclusive on both ends, matching >= / <=.
        df = df[df[col].between(lower, upper)]

    return df


In [None]:
# ======================================================
# 3. FEATURE TYPE DETECTION
# ======================================================
# This helper function returns two lists:
# - Numeric columns
# - Categorical columns

def get_feature_types(df):
    """Split the column names of ``df`` by dtype family.

    Returns
    -------
    tuple[list, list]
        ``(numeric, categorical)`` where ``numeric`` holds the names of the
        ``int64``/``float64`` columns and ``categorical`` the names of the
        ``object``/``category`` columns, each in frame order. Columns of any
        other dtype (bool, datetime, ...) appear in neither list.
    """
    def names_of(dtypes):
        # Column labels of the sub-frame restricted to the given dtypes.
        return df.select_dtypes(include=dtypes).columns.tolist()

    return names_of(['int64', 'float64']), names_of(['object', 'category'])


In [None]:
# ======================================================
# 4. PLOTTING FUNCTIONS
# ======================================================
# These helper functions visualize numeric and categorical columns.

def plot_numeric_features(df):
    """Show one figure per numeric column of ``df``.

    Each figure has two side-by-side panels: a histogram with a KDE overlay
    on the left and a boxplot on the right. Numeric columns are those
    reported by ``get_feature_types``. Nothing is returned; figures are
    rendered through ``plt.show()``.
    """
    numeric_cols = get_feature_types(df)[0]

    for col in numeric_cols:
        values = df[col]
        fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 4))

        # Left panel: distribution shape.
        sns.histplot(values, kde=True, ax=hist_ax)
        hist_ax.set_title(f"Histogram: {col}")

        # Right panel: quartiles and outliers.
        sns.boxplot(x=values, ax=box_ax)
        box_ax.set_title(f"Boxplot: {col}")

        fig.tight_layout()
        plt.show()


def plot_categorical_features(df, max_unique=20):
    """Show a bar chart of value counts for each low-cardinality categorical column.

    Categorical columns are those reported by ``get_feature_types``. A column
    is plotted only when it has at most ``max_unique`` distinct non-null
    values; columns with more are skipped silently. Nothing is returned.
    """
    categorical_cols = get_feature_types(df)[1]

    for col in categorical_cols:
        values = df[col]
        if values.nunique() > max_unique:
            # Too many categories for a readable bar chart.
            continue

        plt.figure(figsize=(10, 4))
        counts = values.value_counts()
        counts.plot(kind='bar')
        plt.title(f"Categorical: {col}")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()


In [None]:
# ======================================================
# 5. LOAD & MERGE DATASETS
# ======================================================
# In this step:
# - We load each CSV dataset in full, then keep a 10% random sample of its rows (smaller frames for the merge)
# - Merge them using listing_id / id
# The output is a combined raw dataframe.

print("Sampling & merging datasets...")

# Every table is sampled the same way: 10% of the rows, fixed seed so a
# re-run draws the same rows.
_sample_kwargs = dict(frac=0.1, random_state=42)

df3 = pd.read_csv("calendar.csv.gz").sample(**_sample_kwargs)
df2 = pd.read_csv("reviews.csv.gz").sample(**_sample_kwargs)
df1 = pd.read_csv("listings.csv.gz").sample(**_sample_kwargs)

# Calendar rows are matched to their listing (calendar.listing_id == listings.id);
# the inner join keeps only calendar rows whose listing was also sampled.
df5 = df3.merge(df1, how="inner", left_on="listing_id", right_on="id")

# Reviews are attached on listing_id as well.
# NOTE(review): if listing_id repeats in both frames, this join emits every
# calendar-row x review pairing for a listing - confirm that is intended.
df = df5.merge(df2, how="inner", on="listing_id")

print("Merged dataframe shape:", df.shape)


In [None]:
# ======================================================
# 6. CLEAN MERGED DATAFRAME
# ======================================================
# Apply our cleaning pipeline to the merged dataframe.

# Progress message; the leading "\n" puts a blank line before it in the output.
print("\nCleaning merged dataframe...")
# clean_df works on a copy, so the merged `df` is left untouched and this
# cell can be re-run without reloading the data.
df_clean = clean_df(df)
# Report the (rows, columns) that remain after cleaning.
print("Cleaned dataframe shape:", df_clean.shape)


In [None]:
# ======================================================
# 7. ONE-HOT ENCODING + SCALING (ML READY)
# ======================================================
# This final step transforms:
# - Categorical columns → OneHotEncoder
# - Numeric columns → StandardScaler
# Output is ML‑ready.

print("\nApplying OneHotEncoder + StandardScaler (ML preprocessing)...")

numeric_cols, categorical_cols = get_feature_types(df_clean)

# scikit-learn 1.2 renamed OneHotEncoder's `sparse` keyword to `sparse_output`
# and later releases removed the old spelling, where it raises TypeError.
# Try the current name first and fall back for older installations.
try:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

# Output column order follows the transformer order: one-hot columns first,
# then the scaled numeric columns. Columns in neither list are dropped.
# NOTE(review): every object/category column is one-hot encoded into a dense
# array; high-cardinality text columns can make this very large - confirm
# the column list before running on the full data.
preprocessor = ColumnTransformer(
    transformers=[
        ("onehot", onehot_encoder, categorical_cols),
        ("scale", StandardScaler(), numeric_cols),
    ],
    remainder="drop",
)

encoded_values = preprocessor.fit_transform(df_clean)

# Build the feature name list in the same order as the transformer output.
# With no categorical columns there is nothing to ask the encoder for, so
# the lookup is skipped (the encoder may not have been fitted in that case).
if categorical_cols:
    onehot_feature_names = list(
        preprocessor.named_transformers_["onehot"]
        .get_feature_names_out(categorical_cols)
    )
else:
    onehot_feature_names = []
feature_names = onehot_feature_names + numeric_cols

# Convert to DataFrame (row labels of df_clean are not carried over).
df_processed = pd.DataFrame(encoded_values, columns=feature_names)

df_processed.to_csv("merged_encoded_scaled_ml_ready.csv", index=False)

print("\n ML-ready encoded + scaled dataset saved as: merged_encoded_scaled_ml_ready.csv")
print("Pipeline completed successfully!")
