In [1]:
from pyspark.sql import SparkSession

In [2]:
import os
import sys

# Point both the Spark driver and the workers at the interpreter that is
# running this kernel. PySpark requires the driver and worker Python versions
# to match, and using sys.executable avoids a hard-coded, machine-specific
# path (the previous raw-string literals also contained doubled backslashes).
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [3]:
# Configure a builder for an application named "session", then obtain the
# session from it via getOrCreate().
session_builder = SparkSession.builder.appName("session")
spark = session_builder.getOrCreate()

In [4]:
# Number of pre-split CSV partition files to load (partition_01 .. partition_05)
num_partitions = 5

# Local directory that holds the partition files (not DBFS).
# A raw string cannot end with a backslash, so the trailing separator is
# appended separately; this keeps single separators throughout instead of the
# doubled backslashes produced by escaping inside a raw string.
base_path = r"C:\Users\Manideep S\OneDrive - COGNINE\ML\Assessments\PySpark\DataSkew-Salting" + "\\"

# List to hold DataFrames for each partition
dfs = []


In [5]:
# Read each partition file into its own DataFrame.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same partitions a second time.
dfs = []
for i in range(1, num_partitions + 1):
    # File names are zero-padded to two digits: partition_01.csv, partition_02.csv, ...
    partition_path = f'{base_path}partition_{i:02}.csv'
    partition_df = spark.read.csv(partition_path, header=True, inferSchema=True)
    dfs.append(partition_df)

In [6]:
# Start from the first partition and union every remaining one onto it
combined_df = dfs[0]
for idx in range(1, len(dfs)):
    combined_df = combined_df.union(dfs[idx])

# Preview the first rows of the merged DataFrame
combined_df.show()

+--------------------+--------------------+------+--------------------+---------+-----------------+
|       director_name|            ceremony|  year|            category|  outcome|original_language|
+--------------------+--------------------+------+--------------------+---------+-----------------+
|         Endre Hules|Montréal World Fi...|2011.0|       Golden Zenith|Nominated|               en|
|    Michael Lantieri|Academy of Scienc...|2002.0|        Saturn Award|      Won|               en|
|Alessandro Benvenuti|  Golden Ciak Awards|1994.0|         Golden Ciak|      Won|               it|
|         David Feiss|        Annie Awards|2017.0|               Annie|Nominated|               en|
|        Timo Novotny|  Diagonale, Austria|1998.0|Diagonale Youth J...|      Won|               en|
|      Kristian Petri|Göteborg Film Fes...|1996.0|        Dragon Award|      Won|               en|
|   Mario Van Peebles|Locarno Internati...|1995.0|      Silver Leopard|      Won|               en|


In [17]:
# describe() returns a new DataFrame of summary columns; as the last expression
# of the cell, only its repr (column names and types) is displayed.
summary_df = combined_df.describe()
summary_df

DataFrame[summary: string, director_name: string, ceremony: string, year: string, category: string, outcome: string, original_language: string]

In [7]:
import pyspark.sql.functions as F

# Partitioning already-read data: repartitioning the data, then repartitioning based on a salt

### Number of partitions in the manually partitioned data, which exhibits skew

In [8]:
# Row count per Spark partition of the combined (manually partitioned) data
(
    combined_df
    .groupBy(F.spark_partition_id())
    .count()
    .show()
)

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|53085|
|                   1|53112|
|                   2| 6640|
|                   3|53159|
|                   4| 3259|
|                   5|28209|
|                   6|14104|
|                   7| 7052|
+--------------------+-----+



### Data read from CSV without repartitioning

In [9]:
# Load the full, unsplit CSV in a single read: first row is the header and
# column types are inferred from the data.
original_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(r"C:\Users\Manideep S\OneDrive - COGNINE\ML\Assessments\PySpark\DataSkew-Salting\220k_awards_by_directors.csv")
)

In [10]:
# Row count per Spark partition for the single-file read
(
    original_df
    .groupBy(F.spark_partition_id())
    .count()
    .show()
)

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|54137|
|                   1|53598|
|                   2|52625|
|                   3|51965|
|                   4|13350|
+--------------------+-----+



### Repartitioning data without salt, evenly distributing the data

In [11]:
# Repartition into 10 partitions, keyed on every column of the dataset
repartition_keys = [
    'director_name',
    'ceremony',
    'year',
    'category',
    'outcome',
    'original_language',
]
repartition_combined_df_rp = combined_df.repartition(10, *repartition_keys)

In [12]:
# Row count per Spark partition after repartitioning on all columns
(
    repartition_combined_df_rp
    .groupBy(F.spark_partition_id())
    .count()
    .show()
)

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|21731|
|                   1|21947|
|                   2|21775|
|                   3|21662|
|                   4|21855|
|                   5|22095|
|                   6|21764|
|                   7|21927|
|                   8|21856|
|                   9|22008|
+--------------------+-----+



### Adding a salt column and repartitioning based on the salt shows the data more evenly distributed across all partitions

In [13]:
# Attach a pseudo-random salt value to every row, then repartition into 10
# partitions keyed on that salt so rows spread out independently of their
# actual column values.
# A fixed seed is passed so the salt (and therefore the per-partition counts)
# does not change on every run; NOTE(review): results can still vary if the
# upstream partition layout of combined_df changes.
combined_df_slt = combined_df.withColumn('salt', F.rand(seed=42))
repartition_combined_df_slt = combined_df_slt.repartition(10, 'salt')

In [14]:
# Row count per Spark partition after repartitioning on the salt column
(
    repartition_combined_df_slt
    .groupBy(F.spark_partition_id())
    .count()
    .show()
)

+--------------------+-----+
|SPARK_PARTITION_ID()|count|
+--------------------+-----+
|                   0|21833|
|                   1|21777|
|                   2|21893|
|                   3|22047|
|                   4|21943|
|                   5|22000|
|                   6|21843|
|                   7|21912|
|                   8|21971|
|                   9|21401|
+--------------------+-----+



# Partitioning using salting while reading the data

In [15]:
from pyspark.sql.functions import col, concat, lit, expr, rand