In [21]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, array, concat, lit, when, count, stddev
from pyspark.sql.types import DoubleType, IntegerType, BooleanType

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [22]:
# Obtain (or reuse) the Spark session used throughout this analysis
spark = (
    SparkSession.builder
    .appName("BasalamAnalysis")
    .master("local[*]")
    .getOrCreate()
)

# Read the products CSV: the first line is treated as the header and
# column types are inferred from the data
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('s3://bigdata-3400/BaSalam.products.csv')
)

# Print the inferred schema of the raw dataset
df.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- _id: integer (nullable = true)
 |-- _score: double (nullable = true)
 |-- sales_count_week: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- price: integer (nullable = true)
 |-- status_id: integer (nullable = true)
 |-- status_title: string (nullable = true)
 |-- stock: integer (nullable = true)
 |-- photo_MEDIUM: string (nullable = true)
 |-- photo_SMALL: string (nullable = true)
 |-- rating_average: double (nullable = true)
 |-- rating_count: integer (nullable = true)
 |-- rating_signals: integer (nullable = true)
 |-- primaryPrice: integer (nullable = true)
 |-- preparationDays: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- categoryId: integer (nullable = true)
 |-- has_delivery: boolean (nullable = true)
 |-- has_variation: boolean (nullable = true)
 |-- new_categoryId: integer (nullable = true)
 |-- navigation_id: integer (nullable = true)
 |-- vendor_name: string (nullable = true)
 |-- vendor_identifier: string (nullable = true

In [31]:
# Null audit and cleaning.
#
# Steps:
#   1. Count the rows and the nulls of every column of `df`.
#   2. Keep only the columns whose null fraction is <= `nan_threshold`.
#   3. Drop every remaining row that still contains a missing value.
#   4. Report the overall null fraction before and after the row drop.
#
# Each DataFrame is profiled with a single aggregation, so the input is
# scanned once per DataFrame rather than once per column per report.


def _null_profile(frame):
    """Return ``(row_count, {column: null_count})`` for ``frame``.

    The row counter and all per-column null counters are evaluated in one
    aggregation, i.e. one Spark job regardless of the number of columns.
    """
    columns = frame.columns
    aggregates = [count(lit(1)).alias("row_count")] + [
        count(when(col(c).isNull(), 1)).alias(c) for c in columns
    ]
    result = frame.agg(*aggregates).collect()[0]
    # Read positionally: index 0 is the row count, the rest follow `columns`.
    return result[0], dict(zip(columns, result[1:]))


def _fraction(part, whole):
    """Return ``part / whole``, or 0.0 when ``whole`` is zero.

    Guards the reports against ZeroDivisionError when a frame has no rows
    (empty input, or every row removed by the null drop).
    """
    return part / whole if whole else 0.0


total_rows, null_counts = _null_profile(df)
print(f"Total rows before cleaning: {total_rows}")

# Print null percentage for each column before cleaning
print("\nNull percentages before cleaning:")
for c in df.columns:
    print(f"Column '{c}': {_fraction(null_counts[c], total_rows):.2%} null")

# Keep the columns whose null fraction does not exceed the threshold
nan_threshold = 0.2
final_cols = [
    c for c in df.columns
    if _fraction(null_counts[c], total_rows) <= nan_threshold
]

# NOTE: `df` is intentionally rebound to the column-filtered frame, as in the
# original cell, so that any later cell reading `df` keeps seeing the same data.
df = df.select(final_cols)
print(f"\nColumns remaining after threshold filtering: {len(df.columns)}")

# Print null percentage for each retained column. Selecting columns does not
# change their contents, so the counts gathered above are reused.
print("\nNull percentages after column filtering:")
for c in df.columns:
    print(f"Column '{c}': {_fraction(null_counts[c], total_rows):.2%} null")

# Drop rows with any missing value
df_clean = df.dropna()

# Profile the cleaned frame (row count and residual nulls) in one job
clean_count, clean_null_counts = _null_profile(df_clean)
print(f"\nRows remaining after removing nulls: {clean_count} ({_fraction(clean_count, total_rows):.2%} of original data)")

# Overall null fraction = null cells / total cells, before and after the row drop
before_nulls = [null_counts[c] for c in final_cols]
before_total_nulls = sum(before_nulls)
before_total_cells = total_rows * len(final_cols)
before_null_percent = _fraction(before_total_nulls, before_total_cells)

after_nulls = [clean_null_counts[c] for c in df_clean.columns]
after_total_nulls = sum(after_nulls)
after_total_cells = clean_count * len(df_clean.columns)
after_null_percent = _fraction(after_total_nulls, after_total_cells)

print(f"\nOverall null percentage before cleaning: {before_null_percent:.2%}")
print(f"Overall null percentage after cleaning: {after_null_percent:.2%}")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Total rows before cleaning: 2411358

Null percentages before cleaning:
Column '_id': 0.00% null
Column '_score': 0.00% null
Column 'sales_count_week': 0.00% null
Column 'name': 0.00% null
Column 'price': 0.00% null
Column 'status_id': 0.00% null
Column 'status_title': 0.11% null
Column 'stock': 0.00% null
Column 'photo_MEDIUM': 0.02% null
Column 'photo_SMALL': 0.02% null
Column 'rating_average': 0.00% null
Column 'rating_count': 0.00% null
Column 'rating_signals': 0.00% null
Column 'primaryPrice': 0.00% null
Column 'preparationDays': 0.00% null
Column 'weight': 0.00% null
Column 'categoryId': 6.06% null
Column 'has_delivery': 6.06% null
Column 'has_variation': 0.00% null
Column 'new_categoryId': 0.00% null
Column 'vendor_name': 0.00% null
Column 'vendor_identifier': 0.00% null
Column 'vendor_statusId': 0.00% null
Column 'vendor_cityId': 0.00% null
Column 'vendor_provinceId': 0.00% null
Column 'vendor_has_delivery': 0.43% null
Column 'vendor_id': 0.00% null
Column 'vendor_status_id': 0.