In [1]:
from glob import glob

In [2]:
import findspark
# Per findspark's documented behaviour, init() locates the local Spark
# installation and makes pyspark importable, so it has to run before the
# pyspark imports in the next cell.
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql import functions as F

In [4]:
# Local Spark master with 4 worker threads, application name "transport".
# getOrCreate() returns the context that is already running (if any) instead of
# constructing a second one, so re-executing this cell in a live kernel no
# longer fails with "Cannot run multiple SparkContexts at once".
conf = SparkConf().setMaster("local[4]").setAppName("transport")
sc = SparkContext.getOrCreate(conf=conf)

In [5]:
# SQL entry point bound to the SparkContext created above; every DataFrame
# read further down goes through this object.
sqlContext = SQLContext(sc)

In [6]:
# Directory layout: <path_datasets><snapshot dir>/<root_path><file>.CSV
path_datasets = "datasets/"
root_path = "telefonica/shared/"

# Gather the CSV files of every snapshot directory whose name starts with a digit.
all_files = glob("".join([path_datasets, "[0-9]*/", root_path, "*.CSV"]))
all_files

['datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190305_20190307.CSV',
 'datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190303_20190305.CSV',
 'datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190302_20190304.CSV',
 'datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190306_20190308.CSV',
 'datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190304_20190306.CSV',
 'datasets/20190311/telefonica/shared/filtered_Lst_Usos_20190307_20190309.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190326_20190328.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190325_20190327.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190324_20190326.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190321_20190323.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190323_20190325.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190322_20190324.CSV']

In [13]:
# Load every file listed in all_files into one DataFrame. The files are read as
# ';'-separated, cp1252-encoded CSV with a header row; the unnamed column "_c6"
# is dropped right away (presumably an artefact of a trailing separator —
# TODO confirm against the raw files).
df = (
    sqlContext.read
    .csv(path=all_files, header=True, sep=";", encoding="cp1252")
    .drop("_c6")
)

In [14]:
# Show the column names and types of the DataFrame read from CSV.
df.printSchema()

root
 |-- FECHAHORATRX: string (nullable = true)
 |-- CODIGOENTIDAD: string (nullable = true)
 |-- NOMBREENTIDAD: string (nullable = true)
 |-- CODIGOSITIO: string (nullable = true)
 |-- NOMBRESITIO: string (nullable = true)
 |-- NROTARJETA: string (nullable = true)



## Missing values

In [33]:
# Print the physical plan Spark builds for a plain scan of the DataFrame.
df.explain()

== Physical Plan ==
*(1) FileScan csv [FECHAHORATRX#57,CODIGOENTIDAD#58,NOMBREENTIDAD#59,CODIGOSITIO#60,NOMBRESITIO#61,NROTARJETA#62] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/nicolas/github/improve_transport/datasets/20190311/telefonica/shared..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<FECHAHORATRX:string,CODIGOENTIDAD:string,NOMBREENTIDAD:string,CODIGOSITIO:string,NOMBRESIT...


In [36]:
# Physical plan for selecting the rows whose FECHAHORATRX is null, to compare
# with the plain scan above.
df.filter(df.FECHAHORATRX.isNull()).explain()

== Physical Plan ==
*(1) Filter isnull(FECHAHORATRX#57)
+- *(1) FileScan csv [FECHAHORATRX#57,CODIGOENTIDAD#58,NOMBREENTIDAD#59,CODIGOSITIO#60,NOMBRESITIO#61,NROTARJETA#62] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/nicolas/github/improve_transport/datasets/20190311/telefonica/shared..., PartitionFilters: [], PushedFilters: [IsNull(FECHAHORATRX)], ReadSchema: struct<FECHAHORATRX:string,CODIGOENTIDAD:string,NOMBREENTIDAD:string,CODIGOSITIO:string,NOMBRESIT...


In [22]:
# Count the nulls of every column in ONE pass over the data instead of running
# a separate full scan (one Spark job) per column.
# when() without otherwise() yields null for rows that do not match, and
# count() ignores nulls, so each aggregate is the number of null cells in that
# column (0 for an empty DataFrame).
null_counts = df.select(
    [F.count(F.when(df[column].isNull(), 1)).alias(column) for column in df.columns]
).first()

# Same shape as before: a list of (column name, null count) tuples in column order.
missing_values = [(column, null_counts[column]) for column in df.columns]

In [23]:
# Display the (column, number of null values) pairs computed in the previous cell.
missing_values

[('FECHAHORATRX', 0),
 ('CODIGOENTIDAD', 0),
 ('NOMBREENTIDAD', 0),
 ('CODIGOSITIO', 0),
 ('NOMBRESITIO', 0),
 ('NROTARJETA', 0)]

## Convert to parquet format

In [7]:
# Restrict the file list to the 20190330 snapshot only.
# Note: this rebinds all_files, which previously held the files of every snapshot.
all_files = glob("{}20190330/{}*.CSV".format(path_datasets, root_path))
all_files

['datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190326_20190328.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190325_20190327.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190324_20190326.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190321_20190323.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190323_20190325.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190322_20190324.CSV']

In [39]:
# Re-read the CSV data, now limited to the files currently held in all_files.
# Same options as the earlier read: header row, ';' separator, cp1252 encoding,
# and the unnamed "_c6" column removed.
df = (
    sqlContext.read
    .csv(path=all_files, header=True, sep=";", encoding="cp1252")
    .drop("_c6")
)

In [40]:
%%time
df.write.parquet("datasets/20190330.parquet")

CPU times: user 6.44 ms, sys: 1.69 ms, total: 8.14 ms
Wall time: 1min 5s


In [45]:
# Summarise the on-disk size of each entry under datasets/ (raw snapshots,
# archives and the parquet output) in human-readable units.
!du -hs datasets/*

2,9G	datasets/20190311
1,2G	datasets/20190311.zip
3,1G	datasets/20190330
595M	datasets/20190330.7z
1,9G	datasets/20190330.parquet
1,3G	datasets/20190330.zip


## Parquet vs CSV performance

In [42]:
ls -l datasets/

total 2758504
drwxrwxr-x 3 nicolas nicolas       4096 mar 30 20:47 20190311/
-rw-rw-r-- 1 nicolas nicolas 1206821387 mar 11 18:54 20190311.zip
drwxrwxr-x 3 nicolas nicolas       4096 mar 30 20:50 20190330/
-rw-rw-r-- 1 nicolas nicolas  316154198 abr  1 02:22 20190330.7z
drwxrwxr-x 2 nicolas nicolas       4096 abr  1 02:17 20190330.parquet/
-rw-rw-r-- 1 nicolas nicolas 1301698260 mar 30 16:46 20190330.zip


### CSV missing values

In [13]:
%%time
df = sqlContext.read.csv(path=all_files, header=True, sep=";", encoding="cp1252").drop("_c6")
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('FECHAHORATRX', 0), ('CODIGOENTIDAD', 0), ('NOMBREENTIDAD', 0), ('CODIGOSITIO', 0), ('NOMBRESITIO', 0), ('NROTARJETA', 0)]
CPU times: user 26.1 ms, sys: 8.81 ms, total: 34.9 ms
Wall time: 4min 22s


### Parquet missing values

In [8]:
%%time
path_parquet = "datasets/20190330.parquet/"
df = sqlContext.read.load(path_parquet)
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('FECHAHORATRX', 0), ('CODIGOENTIDAD', 0), ('NOMBREENTIDAD', 0), ('CODIGOSITIO', 0), ('NOMBRESITIO', 0), ('NROTARJETA', 0)]
CPU times: user 4.57 ms, sys: 1.85 ms, total: 6.42 ms
Wall time: 5.36 s


In [9]:
# Schema of the DataFrame loaded from parquet, for comparison with the schema
# printed after the CSV read.
df.printSchema()

root
 |-- FECHAHORATRX: string (nullable = true)
 |-- CODIGOENTIDAD: string (nullable = true)
 |-- NOMBREENTIDAD: string (nullable = true)
 |-- CODIGOSITIO: string (nullable = true)
 |-- NOMBRESITIO: string (nullable = true)
 |-- NROTARJETA: string (nullable = true)



4 * 60 + 22

### Conclusion

In [15]:
# Wall times reported by the two %%time benchmark cells above.
csv_seconds = 4 * 60 + 22  # 4min 22s for the CSV run
parquet_seconds = 5.36     # 5.36 s for the parquet run
speedup = csv_seconds / parquet_seconds
# %d truncates the ratio to a whole number.
print("Using parquet format is %dx faster!!" % speedup)

Using parquet format is 48x faster!!
