In [3]:
# prompt: df.show() , instead write to a new csv file, make sure no data is lost

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, log, exp, when
from pyspark.sql.functions import skewness

# Initialize SparkSession
spark = SparkSession.builder.appName("SkewnessHandling").getOrCreate()

# Relative error handed to approxQuantile. It bounds how far (in rank terms)
# the returned value may be from the requested quantile, so it has to be
# well below the 0.01 tail we are asking for; otherwise the "1st/99th
# percentile" bounds can land deep inside the bulk of the distribution.
QUANTILE_RELATIVE_ERROR = 0.001

try:
    # Load the CSV file into a Spark DataFrame
    df = spark.read.csv(
        "/content/part-00000-f6621328-5892-4709-89ba-3ff15e6b235f-c000.csv",
        header=True,
        inferSchema=True,
    )

    # Numeric columns that need skewness and outlier handling
    numeric_inputs = ['odometer', 'car_name_sum']

    # Map each column name to its [1st percentile, 99th percentile] bounds
    d = {
        col_name: df.approxQuantile(col_name, [0.01, 0.99], QUANTILE_RELATIVE_ERROR)
        for col_name in numeric_inputs
    }

    # Handle skewness and outliers.
    # NOTE: a column is only clipped when it is also transformed; columns whose
    # skewness lies within [-1, 1] are written out unchanged.
    for col_name in numeric_inputs:
        # Skewness of the column as it currently stands in df
        skew = df.agg(skewness(col(col_name))).collect()[0][0]
        bounds = d[col_name]

        # Without a skewness value or both clip bounds there is nothing sound
        # to compute, so leave the column as-is instead of crashing.
        if skew is None or len(bounds) < 2:
            print(f"{col_name} skipped: skewness or quantile bounds unavailable.")
            continue

        lower, upper = bounds

        # Clip values below the lower bound / above the upper bound
        clipped_col = (
            when(df[col_name] < lower, lower)
            .when(df[col_name] > upper, upper)
            .otherwise(df[col_name])
        )

        if skew > 1:
            # Right skew: log(x + 1); the +1 keeps zero values defined.
            # NOTE(review): values <= -1 would still become null here --
            # confirm the columns are non-negative.
            df = df.withColumn(col_name, log(clipped_col + 1))
            print(f"{col_name} has been treated for positive (right) skewness. (skew = {skew})")
        elif skew < -1:
            # Left skew: exponential transformation
            df = df.withColumn(col_name, exp(clipped_col))
            print(f"{col_name} has been treated for negative (left) skewness. (skew = {skew})")

    df.show()
    # Write the transformed DataFrame to a new CSV file
    # (Spark writes a directory of part files at this path)
    df.write.csv("/content/transformed_data.csv", header=True, mode="overwrite")

    print("Transformed data saved to transformed_data.csv")
finally:
    # Stop the SparkSession even if loading, transforming or writing failed
    spark.stop()

odometer has been treated for positive (right) skewness. (skew = 45.85996984126468)
car_name_sum has been treated for positive (right) skewness. (skew = 3.8825305255308415)
+-------+----+------------+------------------+------------+-----------+-----------+-----+-------------------+--------+--------------------+-------------------+-------------+--------------------+------------------+
|  price|year|manufacturer|          odometer|transmission|       type|paint_color|state|       posting_date|zip_code|transmission_indexed|paint_color_indexed|state_indexed|manufacturer_indexed|      car_name_sum|
+-------+----+------------+------------------+------------+-----------+-----------+-----+-------------------+--------+--------------------+-------------------+-------------+--------------------+------------------+
| 7990.0|2012|      nissan|11.695721908987394|   automatic|        SUV|      white|   fl|2021-04-23 18:45:23|   33604|                 0.0|                0.0|          1.0|            