# RDD-vs-DataFrame

In [1]:
# Link pyspark to the right Spark folder with findspark.
# findspark.init() has to run BEFORE anything is imported from pyspark:
# it is the call that points Python at the Spark installation under /opt/spark.
import findspark

findspark.init('/opt/spark')

import time
from functools import wraps

from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

In [2]:
# Application configuration for this notebook.
conf = SparkConf().setAppName("rdd-vs-dataframe")

# getOrCreate() reuses the SparkContext that is already running when this cell
# is executed again; constructing a second SparkContext in the same kernel
# raises an error instead.
sc = SparkContext.getOrCreate(conf=conf)

In [3]:
! hadoop fs -put ../datasets/f1

In [4]:
# HDFS location of the race results CSV.
F1_RESULTS_PATH = (
    "hdfs://node-master:9000"      # filesystem URI
    "/user/root/f1/results.csv"    # file inside the uploaded f1 directory
)

In [5]:
# HDFS location of the drivers CSV.
F1_DRIVERS_PATH = (
    "hdfs://node-master:9000"      # filesystem URI
    "/user/root/f1/drivers.csv"    # file inside the uploaded f1 directory
)

## RDD

In [6]:
# RDD over results.csv; each element is one raw text line of the file
# (the header line is still included at this point).
res_lines = sc.textFile(name=F1_RESULTS_PATH)

In [7]:
# Column names of results.csv, taken from the first line of the file.
# first() returns the first element directly and raises an explicit
# "RDD is empty" error for an empty file, instead of an IndexError on [0].
res_header = res_lines.first().split(",")

In [8]:
# Rebuild the raw header line once, up front, instead of re-joining the
# column names inside the filter for every single record.
res_header_line = ",".join(res_header)

# Number of result rows per driver, as a {driverId: count} mapping.
# The driverId is read by position (third comma-separated field), so the
# values are not zipped with the header names first.
drivers_num_races = res_lines.filter(
    lambda line: line != res_header_line          # skipping header line
).map(
    lambda line: (int(line.split(",")[2]), 1)     # transforming each line into (driverId, 1)
).countByKey()                                    # counting how often each driver shows up

In [9]:
# The ten (driverId, races) pairs with the highest race counts, largest first.
# A stable descending sort keeps tied drivers in their original relative order.
top_10_drivers_most_races = sorted(
    drivers_num_races.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

In [10]:
# RDD over drivers.csv; each element is one raw text line of the file
# (the header line is still included at this point).
drivers_lines = sc.textFile(name=F1_DRIVERS_PATH)

In [11]:
# Column names of drivers.csv, taken from the first line of the file.
# first() returns the first element directly and raises an explicit
# "RDD is empty" error for an empty file, instead of an IndexError on [0].
drivers_header = drivers_lines.first().split(",")

In [12]:
# Rebuild the raw header line once, up front, instead of re-joining the
# column names inside the filter for every single record.
drivers_header_line = ",".join(drivers_header)

# List of (driverId, driverRef) pairs collected to the driver program.
# Both values are read by position (first and second comma-separated field),
# so the values are not zipped with the header names first.
translate_driver_id = drivers_lines.filter(
    lambda line: line != drivers_header_line       # skipping header line
).map(
    lambda line: line.split(",")                   # splitting the line into its fields
).map(
    lambda fields: (int(fields[0]), fields[1])     # transforming fields into (driverId, driverRef)
).collect()

In [13]:
# Turn the collected (driverId, driverRef) pairs into a lookup dict.
# The same name is rebound, so from here on translate_driver_id maps
# driverId -> driverRef.
translate_driver_id = dict(translate_driver_id)

In [14]:
# Print one "driverRef : races" line per top-10 driver, resolving each
# driverId through the lookup dict.
for driver_id, num_races in top_10_drivers_most_races:
    print(f"{translate_driver_id[driver_id]} : {num_races}")

barrichello : 326
button : 309
michael_schumacher : 308
alonso : 293
raikkonen : 273
massa : 271
patrese : 257
trulli : 256
coulthard : 247
fisichella : 231


## DataFrame

In [15]:
# Obtain the session through the builder, the documented entry point.
# getOrCreate() attaches to the SparkContext that is already running (sc) and
# returns the existing session when this cell is executed again.
spark = SparkSession.builder.getOrCreate()

In [16]:
# Results as a DataFrame; the first CSV row supplies the column names.
dfr = spark.read.load(F1_RESULTS_PATH, format="csv", header="true")

In [17]:
# Drivers as a DataFrame; the first CSV row supplies the column names.
dfd = spark.read.load(F1_DRIVERS_PATH, format="csv", header="true")

In [18]:
# DataFrame version of the top-10 ranking: join results with drivers on
# driverId, count result rows per driver, and show the ten largest counts.
(
    dfr.join(dfd, on=dfr["driverId"] == dfd["driverId"], how="inner")
    .groupBy(dfr["driverId"], dfd["driverRef"])
    .agg(count(dfr["raceId"]).alias("races"))
    .orderBy("races", ascending=False)
    .limit(10)
    .show()
)

+--------+------------------+-----+
|driverId|         driverRef|races|
+--------+------------------+-----+
|      22|       barrichello|  326|
|      18|            button|  309|
|      30|michael_schumacher|  308|
|       4|            alonso|  293|
|       8|         raikkonen|  273|
|      13|             massa|  271|
|     119|           patrese|  257|
|      15|            trulli|  256|
|      14|         coulthard|  247|
|      21|        fisichella|  231|
+--------+------------------+-----+

