# Understanding Bucketing

In [10]:
# Bootstrap step: findspark.init() is expected to make the local Spark
# installation's pyspark package importable (per findspark's documentation —
# confirm for your environment). Keep this cell ahead of any pyspark import.
import findspark
findspark.init()

In [11]:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# Configuration for the session used throughout this notebook:
# the master is "local[5]" and the driver is given 4g of memory and 5 cores.
conf = (
    SparkConf()
    .setAppName("Dynamic Partitions")
    .set("spark.driver.memory", "4g")
    .set("spark.driver.cores", "5")
    .set("spark.master", "local[5]")
)

# getOrCreate() hands back an existing session when one is already active,
# otherwise it builds a new one from `conf`.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

## Create Dataframes

In [12]:
# Load the two CSV inputs; the "header" option is switched on for both reads.
ratingDF = spark.read.csv("spark-data/ratings.csv", header="true")
tagsDF = spark.read.csv("spark-data/tags.csv", header="true")

## Joining Datasets without Bucketing

In [None]:
# Join the raw (non-bucketed) DataFrames on movieId and print the plan,
# as a baseline to compare against the bucketed join further down.
unbucketed_join = ratingDF.join(tagsDF, on="movieId")
unbucketed_join.explain(mode="Formatted")

## Bucket Dataset on movieId

In [None]:
# Persist the ratings as table "ratings_bucketed", bucketed into 10 buckets
# on movieId; any existing table of that name is overwritten.
ratings_writer = ratingDF.write.mode("overwrite")
ratings_writer.bucketBy(10, "movieId").saveAsTable("ratings_bucketed")

In [None]:
# Persist the tags as table "tags_bucketed" with the same bucket spec as the
# ratings table (10 buckets on movieId), overwriting any previous copy.
(
    tagsDF.write
    .mode("overwrite")
    .bucketBy(10, "movieId")
    .saveAsTable("tags_bucketed")
)

In [None]:
# printSchema is a method: it has to be called to print the column tree.
# Referencing it without parentheses only yields the bound-method object.
tagsDF.printSchema()

## Run Join Query on Bucketed Data

In [None]:
# Same join as before, but expressed in SQL against the two bucketed tables;
# print its plan for comparison with the non-bucketed join.
bucketed_join_sql = "select * from ratings_bucketed r join tags_bucketed t on r.movieId=t.movieId"
spark.sql(bucketed_join_sql).explain(mode="Formatted")

In [None]:
# Persist the ratings as table "ratings_partitioned", partitioned by the
# 'rating' column; an existing table with that name is overwritten.
(
    ratingDF.write
    .mode("overwrite")
    .partitionBy("rating")
    .saveAsTable("ratings_partitioned")
)
