# PySpark RDD-Based

In [9]:
# Link pyspark to the right Spark folder with findspark.
# findspark.init() has to run *before* pyspark is imported: it is the call that
# points Python at the Spark installation, so invoking it after the import (as
# this cell used to) has no effect on the pyspark module that gets loaded.
import time
from functools import wraps

import findspark

findspark.init('/opt/spark')

from pyspark import SparkContext, SparkConf, SQLContext  # noqa: E402

In [2]:
def timeit(func):
    """Decorator that reports how long each call to ``func`` takes.

    The wrapped function is called with the arguments it was given, and its
    return value is passed back unchanged. After the call finishes, one line
    is printed with the function name, the positional and keyword arguments,
    and the elapsed time in seconds (measured with ``time.perf_counter``).
    Nothing is printed when ``func`` raises.
    """
    @wraps(func)
    def wrapper(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        total_time = time.perf_counter() - started
        print(f'Function {func.__name__}{args} {kwargs} Took {total_time:.4f} seconds')
        return result

    return wrapper

In [3]:
# Application-level Spark configuration for this notebook.
conf = SparkConf().setAppName("pyspark-rdd-based")
# getOrCreate() reuses the context that is already running when this cell is
# executed again; calling the SparkContext constructor a second time in the
# same kernel raises an error instead.
sc = SparkContext.getOrCreate(conf=conf)

In [4]:
# HDFS location of the race results CSV (namenode address + file path).
F1_RESULTS_PATH = (
    "hdfs://node-master:9000"
    "/user/root/f1/results.csv"
)

In [5]:
# HDFS location of the drivers CSV (namenode address + file path).
F1_DRIVERS_PATH = (
    "hdfs://node-master:9000"
    "/user/root/f1/drivers.csv"
)

In [5]:
%%time
# RDD of raw text lines from the results CSV.
# NOTE(review): Spark RDDs are evaluated lazily, so the timing reported for this
# cell presumably covers only setting up the RDD, not reading the file — confirm
# by timing an action (e.g. count()) if the load time is what matters.
res_lines = sc.textFile(F1_RESULTS_PATH)

CPU times: user 3.83 ms, sys: 1.09 ms, total: 4.92 ms
Wall time: 530 ms


In [5]:
# Same measurement as the %%time cell, done by hand.
# perf_counter() is the clock meant for measuring short intervals: unlike
# time.time() it is not affected by system clock adjustments, and it is the
# clock the timeit decorator in this notebook uses as well.
start_time = time.perf_counter()
res_lines_2 = sc.textFile(F1_RESULTS_PATH)
end_time = time.perf_counter()
print(f"time spent: {end_time - start_time}")

time spent: 0.5345594882965088


In [6]:
@timeit
def load_csv(path):
    """Create an RDD of text lines for the file at ``path``.

    Args:
        path: Location of the file to read, passed straight to
            ``sc.textFile``.

    Returns:
        The RDD produced by ``sc.textFile(path)``.

    The call is timed and reported by the ``timeit`` decorator.
    """
    # Read the requested file; the previous version ignored ``path`` and
    # always opened F1_RESULTS_PATH.
    return sc.textFile(path)

In [10]:
# Third timing variant: go through the @timeit-decorated helper.
load_csv(F1_RESULTS_PATH)

Function load_csv('hdfs://node-master:9000/user/root/f1/results.csv',) {} Took 0.5374 seconds


hdfs://node-master:9000/user/root/f1/results.csv MapPartitionsRDD[1] at textFile at NativeMethodAccessorImpl.java:0

In [6]:
# Column names of the results file, taken from its first line.
# first() returns that line directly and raises a descriptive "RDD is empty"
# error on an empty file, where take(1)[0] would fail with a bare IndexError.
res_header = res_lines.first().split(",")

In [7]:
# Show the column names parsed from the results file.
res_header

['resultId',
 'raceId',
 'driverId',
 'constructorId',
 'number',
 'grid',
 'position',
 'positionText',
 'positionOrder',
 'points',
 'laps',
 'time',
 'milliseconds',
 'fastestLap',
 'rank',
 'fastestLapTime',
 'fastestLapSpeed',
 'statusId']

In [8]:
# Number of result rows per driver, as a mapping driverId -> count.
#
# Both helper values are computed once on the driver instead of inside the
# lambdas, so they are not rebuilt for every record of the RDD.
_res_header_line = ",".join(res_header)          # raw text of the header row
_res_driver_idx = res_header.index("driverId")   # column position looked up by name

drivers_num_races = (
    res_lines
    .filter(lambda line: line != _res_header_line)                    # skipping header line
    .map(lambda line: (int(line.split(",")[_res_driver_idx]), 1))     # (driverId, 1) per result row
    .countByKey()                                                     # counting how often each driver shows up
)

In [9]:
# The ten (driverId, count) pairs with the highest counts, largest first.
top_10_drivers_most_races = sorted(
    drivers_num_races.items(),
    key=lambda pair: pair[1],
    reverse=True,
)[:10]

In [10]:
# Show the ten (driverId, count) pairs selected above.
top_10_drivers_most_races

[(22, 326),
 (18, 309),
 (30, 308),
 (4, 293),
 (8, 273),
 (13, 271),
 (119, 257),
 (15, 256),
 (14, 247),
 (21, 231)]

In [11]:
# RDD of raw text lines from the drivers CSV.
drivers_lines = sc.textFile(F1_DRIVERS_PATH)

In [12]:
# Column names of the drivers file, taken from its first line.
# first() returns that line directly and raises a descriptive "RDD is empty"
# error on an empty file, where take(1)[0] would fail with a bare IndexError.
drivers_header = drivers_lines.first().split(",")

In [13]:
# Show the column names parsed from the drivers file.
drivers_header

['driverId',
 'driverRef',
 'number',
 'code',
 'forename',
 'surname',
 'dob',
 'nationality',
 'url']

In [14]:
# List of (driverId, driverRef) pairs, one per row of the drivers file.
#
# The helper values are computed once on the driver instead of inside the
# lambdas, so they are not rebuilt for every record of the RDD.
_drivers_header_line = ",".join(drivers_header)       # raw text of the header row
_driver_id_idx = drivers_header.index("driverId")     # column positions looked up by name
_driver_ref_idx = drivers_header.index("driverRef")


def _driver_id_and_ref(line):
    """Split one CSV row and return it as ``(int driverId, driverRef)``."""
    fields = line.split(",")
    return int(fields[_driver_id_idx]), fields[_driver_ref_idx]


translate_driver_id = (
    drivers_lines
    .filter(lambda line: line != _drivers_header_line)   # skipping header line
    .map(_driver_id_and_ref)                             # (driverId, driverRef) per driver row
    .collect()
)

In [15]:
# Turn the collected (driverId, driverRef) pairs into a dict keyed by driverId.
translate_driver_id = dict(translate_driver_id)

In [16]:
# Print one "<driverRef> : <count>" line per top-10 entry, in ranking order.
for driver_id, race_count in top_10_drivers_most_races:
    driver_ref = translate_driver_id[driver_id]
    print(f"{driver_ref} : {race_count}")

barrichello : 326
button : 309
michael_schumacher : 308
alonso : 293
raikkonen : 273
massa : 271
patrese : 257
trulli : 256
coulthard : 247
fisichella : 231
