# CSV-vs-Parquet

In [1]:
# Making sure to link pyspark to the right Spark folder with findspark
import findspark
import time
import pandas as pd
from pathlib import Path
from functools import wraps
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
findspark.init('/opt/spark')

In [2]:
conf = SparkConf().setAppName("csv-vs-parquet")
sc = SparkContext(conf=conf)

In [3]:
! hadoop fs -put ../datasets/f1

In [4]:
F1_LAP_TIMES_PATH = "hdfs://node-master:9000/user/root/f1/lapTimes.csv"

In [5]:
F1_LAP_TIMES_DEST_PATH = "hdfs://node-master:9000/user/root/f1/lapTimes.parquet"

In [6]:
F1_LAP_TIMES_DEST_PATH_2 = "hdfs://node-master:9000/user/root/f1/lapTimes2.parquet"

In [7]:
! hadoop fs -ls /user/root/f1/

Found 13 items
-rw-r--r--   2 root supergroup       8667 2023-09-13 21:14 /user/root/f1/circuits.csv
-rw-r--r--   2 root supergroup     224140 2023-09-13 21:14 /user/root/f1/constructorResults.csv
-rw-r--r--   2 root supergroup     267256 2023-09-13 21:14 /user/root/f1/constructorStandings.csv
-rw-r--r--   2 root supergroup      15617 2023-09-13 21:14 /user/root/f1/constructors.csv
-rw-r--r--   2 root supergroup     768136 2023-09-13 21:14 /user/root/f1/driverStandings.csv
-rw-r--r--   2 root supergroup      79533 2023-09-13 21:14 /user/root/f1/drivers.csv
-rw-r--r--   2 root supergroup   12118621 2023-09-13 21:14 /user/root/f1/lapTimes.csv
-rw-r--r--   2 root supergroup     220898 2023-09-13 21:14 /user/root/f1/pitStops.csv
-rw-r--r--   2 root supergroup     315477 2023-09-13 21:14 /user/root/f1/qualifying.csv
-rw-r--r--   2 root supergroup     104843 2023-09-13 21:14 /user/root/f1/races.csv
-rw-r--r--   2 root supergroup    1176858 2023-09-13 21:14 /user/root/f1/results.cs

In [8]:
! hadoop fs -du -h /user/root/f1/lapTimes.csv

11.6 M  /user/root/f1/lapTimes.csv


## Pandas

In [9]:
! pip install pyarrow

Collecting pyarrow
  Downloading pyarrow-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (25.6 MB)
     |################################| 25.6 MB 181 kB/s            
Installing collected packages: pyarrow
Successfully installed pyarrow-6.0.1


In [10]:
filepath = Path("../datasets/f1/lapTimes.csv")

In [11]:
dest_folder = Path("../datasets/f1")

In [12]:
filepath.exists()

True

In [13]:
fields = [
    "raceId",
    "driverId",
    "lap",
    "position",
    "time",
    "milliseconds",
]

In [14]:
! pip freeze | grep pandas

pandas==1.1.5


In [15]:
df = pd.read_csv(filepath, dtype={field: str for field in fields})

In [23]:
df.dtypes

raceId           int64
driverId        object
lap             object
position        object
time            object
milliseconds    object
dtype: object

In [24]:
df.iloc[0]

raceId               841
driverId              20
lap                    1
position               1
time            1:38.109
milliseconds       98109
Name: 0, dtype: object

In [25]:
df["raceId"] = df["raceId"].str.replace(r'\D+', '', regex=True).astype('int')

AttributeError: Can only use .str accessor with string values!

In [26]:
df["driverId"] = df["driverId"].str.replace(r'\D+', '', regex=True).astype('int')
df["lap"] = df["lap"].str.replace(r'\D+', '', regex=True).astype('int')
df["position"] = df["position"].str.replace(r'\D+', '', regex=True).astype('int')
df["milliseconds"] = df["milliseconds"].str.replace(r'\D+', '', regex=True).astype('int')

In [27]:
df.dtypes

raceId           int64
driverId         int64
lap              int64
position         int64
time            object
milliseconds     int64
dtype: object

In [28]:
df.to_parquet(dest_folder / "lapTimes.parquet")

## DataFrame

In [6]:
spark = SparkSession(sc)

In [48]:
dflt = spark.read.format("csv").option("header", "true").load(F1_LAP_TIMES_PATH)

In [53]:
type(dflt.head(1)[0]["raceId"])

str

In [56]:
schema = StructType([
    StructField("raceId", IntegerType(), True),
    StructField("driverId", IntegerType(), True),
    StructField("lap", IntegerType(), True),
    StructField("position", IntegerType(), True),
    StructField("time", StringType(), True),
    StructField("milliseconds", IntegerType(), True),
])

In [57]:
dflt2 = spark.read.format("csv").option("header", "true").schema(schema).load(F1_LAP_TIMES_PATH)

In [58]:
dflt2.head(1)

[Row(raceId=841, driverId=20, lap=1, position=1, time='1:38.109', milliseconds=98109)]

In [65]:
dflt2.write.parquet(F1_LAP_TIMES_DEST_PATH)

In [61]:
! hadoop fs -du -h /user/root/f1/lapTimes.parquet

23/08/03 00:38:19 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
0      /user/root/f1/lapTimes.parquet/_SUCCESS
1.8 M  /user/root/f1/lapTimes.parquet/part-00000-6f9a116b-98ea-45a2-a81b-092a5b6cc4e7-c000.snappy.parquet
1.1 M  /user/root/f1/lapTimes.parquet/part-00001-6f9a116b-98ea-45a2-a81b-092a5b6cc4e7-c000.snappy.parquet


In [None]:
dflt2.coalesce(1).write.parquet(F1_LAP_TIMES_DEST_PATH_2)

In [66]:
! hadoop fs -du -h /user/root/f1/lapTimes2.parquet

23/08/03 00:41:13 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
0      /user/root/f1/lapTimes2.parquet/_SUCCESS
2.5 M  /user/root/f1/lapTimes2.parquet/part-00000-7bdf9fdc-635c-4ee2-943f-f54e15faf910-c000.snappy.parquet


In [11]:
dfr.join(
    dfd, dfr.driverId == dfd.driverId, "inner"
).groupBy(
    dfr.driverId, dfd.driverRef
).agg(
    count(dfr.raceId).alias("races")
).orderBy(
    col("races").desc()
).limit(10).show()

+--------+------------------+-----+
|driverId|         driverRef|races|
+--------+------------------+-----+
|      22|       barrichello|  326|
|      18|            button|  309|
|      30|michael_schumacher|  308|
|       4|            alonso|  293|
|       8|         raikkonen|  273|
|      13|             massa|  271|
|     119|           patrese|  257|
|      15|            trulli|  256|
|      14|         coulthard|  247|
|      21|        fisichella|  231|
+--------+------------------+-----+



In [12]:
dfr.registerTempTable("races")

In [13]:
dfd.registerTempTable("drivers")

In [14]:
spark.sql("""
select r.driverId, d.driverRef, count(0) races
from races r
  inner join drivers d on r.driverId = d.driverId
group by r.driverId, d.driverRef
order by races desc
limit 10
""").show()

+--------+------------------+-----+
|driverId|         driverRef|races|
+--------+------------------+-----+
|      22|       barrichello|  326|
|      18|            button|  309|
|      30|michael_schumacher|  308|
|       4|            alonso|  293|
|       8|         raikkonen|  273|
|      13|             massa|  271|
|     119|           patrese|  257|
|      15|            trulli|  256|
|      14|         coulthard|  247|
|      21|        fisichella|  231|
+--------+------------------+-----+

