In [0]:
import pyspark.sql.functions as f

# Read the source listings table from the metastore.
df_airbnb = spark.read.table("airbnb.raw.listings")

# Columns carried forward into the analysis. The order of this list is the
# column order of df_subset.
columns_to_keep = [
    # host_* fields
    "host_id",
    "host_since",
    "host_is_superhost",
    "host_response_time",
    "host_response_rate",
    "host_acceptance_rate",
    "host_neighbourhood",
    "host_listings_count",
    "host_identity_verified",
    # coordinates and listing description
    "latitude",
    "longitude",
    "property_type",
    "room_type",
    "accommodates",
    "bathrooms",
    "bathrooms_text",
    "bedrooms",
    "beds",
    "amenities",
    "price",
    "minimum_nights",
    "maximum_nights",
    # review fields
    "number_of_reviews",
    "first_review",
    "last_review",
    "review_scores_rating",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
    # remaining fields
    "license",
    "instant_bookable",
    "reviews_per_month",
]

df_subset = df_airbnb.select(*columns_to_keep)

display(df_subset)

In [0]:
import pyspark.sql.functions as f

def collect_missing_value_stats(df):
    """Summarise how many values are missing in each column of a Spark DataFrame.

    A value counts as missing when it is NULL. For columns whose dtype is
    ``double`` or ``float``, NaN is counted as missing as well.

    Args:
        df: The Spark DataFrame to profile.

    Returns:
        A Spark DataFrame with one row per column of ``df`` and the columns
        ``column_name`` (STRING), ``missing_count`` (LONG) and
        ``missing_percent`` (DOUBLE), ordered by ``missing_percent``
        descending. When ``df`` has no rows, every column is reported with a
        count of 0 and a percentage of 0.0.
    """
    col_types = dict(df.dtypes)

    # The row total goes first, followed by one missing-value counter per
    # column. Everything is computed in a single aggregation and read back by
    # position, so the helper alias cannot clash with a real column name.
    agg_expressions = [f.count(f.lit(1)).alias("__total_rows__")]
    for c in df.columns:
        is_missing = f.col(c).isNull()
        if col_types[c] in ("double", "float"):
            # isnan() is only applied to floating-point columns.
            is_missing = is_missing | f.isnan(f.col(c))
        # when() without otherwise() yields NULL for non-matching rows and
        # count() skips NULLs; unlike sum(), count() returns 0 (not NULL)
        # when the DataFrame has no rows.
        agg_expressions.append(f.count(f.when(is_missing, 1)).alias(c))

    counts_row = df.agg(*agg_expressions).first()
    total_count = counts_row[0]

    missing_data = []
    for position, col_name in enumerate(df.columns, start=1):
        null_count = counts_row[position]
        # Guard the division: an empty DataFrame has nothing missing.
        percent_missing = (null_count / total_count) * 100 if total_count else 0.0
        missing_data.append((col_name, null_count, percent_missing))

    stats_schema = "column_name STRING, missing_count LONG, missing_percent DOUBLE"
    stats_df = spark.createDataFrame(missing_data, schema=stats_schema)
    return stats_df.orderBy(f.col("missing_percent").desc())

# Compute the per-column missing-value summary for the selected columns and
# render it as a table.
missing_stats_df = collect_missing_value_stats(df=df_subset)
display(missing_stats_df)

In [0]:
# Keep only the columns that are partially missing: strictly more than 0 %
# and strictly less than 100 % of their values.
df_for_plotting = (
    missing_stats_df
    .where(f.col("missing_percent") > 0)
    .where(f.col("missing_percent") < 100)
)

display(df_for_plotting)

Databricks visualization. Run in Databricks to view.

In [0]:
import pyspark.sql.functions as f

# Label each row by whether host_neighbourhood is populated, then count the
# rows per label.
df_pie_data = (
    df_subset
    .select(
        f.when(f.col("host_neighbourhood").isNotNull(), "Not Missing")
         .otherwise("Missing")
         .alias("status")
    )
    .groupBy("status")
    .count()
)

display(df_pie_data)

Databricks visualization. Run in Databricks to view.