In [1]:
from glob import glob

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark.sql import SparkSession

In [4]:
spark = SparkSession.builder.appName("transport").getOrCreate()

In [5]:
type(spark)

pyspark.sql.session.SparkSession

In [6]:
path_datasets = "datasets/"
root_path = "telefonica/shared/"

all_files = glob(path_datasets + "20190330/" + root_path + "*.CSV")
all_files

['datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190326_20190328.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190325_20190327.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190324_20190326.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190321_20190323.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190323_20190325.CSV',
 'datasets/20190330/telefonica/shared/filtered_Lst_Usos_20190322_20190324.CSV']

In [7]:
df = spark.read.csv(path=all_files, header=True, sep=";", encoding="cp1252").drop("_c6")

In [9]:
df.explain()

== Physical Plan ==
*(1) FileScan csv [FECHAHORATRX#7,CODIGOENTIDAD#8,NOMBREENTIDAD#9,CODIGOSITIO#10,NOMBRESITIO#11,NROTARJETA#12] Batched: false, Format: CSV, Location: InMemoryFileIndex[file:/home/nicolas/github/improve_transport/datasets/20190330/telefonica/shared..., PartitionFilters: [], PushedFilters: [], ReadSchema: struct<FECHAHORATRX:string,CODIGOENTIDAD:string,NOMBREENTIDAD:string,CODIGOSITIO:string,NOMBRESIT...


In [10]:
df.printSchema()

root
 |-- FECHAHORATRX: string (nullable = true)
 |-- CODIGOENTIDAD: string (nullable = true)
 |-- NOMBREENTIDAD: string (nullable = true)
 |-- CODIGOSITIO: string (nullable = true)
 |-- NOMBRESITIO: string (nullable = true)
 |-- NROTARJETA: string (nullable = true)



In [12]:
%%time
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('FECHAHORATRX', 0), ('CODIGOENTIDAD', 0), ('NOMBREENTIDAD', 0), ('CODIGOSITIO', 0), ('NOMBRESITIO', 0), ('NROTARJETA', 0)]
CPU times: user 21.6 ms, sys: 116 µs, total: 21.7 ms
Wall time: 2min 3s


In [13]:
df.show()

+-------------------+-------------+------------------+-----------+-----------+--------------------+
|       FECHAHORATRX|CODIGOENTIDAD|     NOMBREENTIDAD|CODIGOSITIO|NOMBRESITIO|          NROTARJETA|
+-------------------+-------------+------------------+-----------+-----------+--------------------+
|12/03/2019 08:46:42|            5|U5 - Metropolitana|      15600|    RM-1008|b277725cfc0602849...|
|12/03/2019 08:47:02|            5|U5 - Metropolitana|      15600|    RM-1008|3bdf83a509b372b5e...|
|12/03/2019 08:47:49|            5|U5 - Metropolitana|      15600|    RM-1008|ba30b9314cea1d3dd...|
|12/03/2019 08:48:57|            5|U5 - Metropolitana|      15600|    RM-1008|984a562e74804df47...|
|12/03/2019 08:49:06|            5|U5 - Metropolitana|      15600|    RM-1008|bdf711db62f64e8ac...|
|12/03/2019 08:49:34|            5|U5 - Metropolitana|      15600|    RM-1008|3ca91a7e8c0f941d0...|
|12/03/2019 08:50:33|            5|U5 - Metropolitana|      15600|    RM-1008|5a271041e416388df...|


In [17]:
%%time
df.groupBy("NOMBREENTIDAD").count().collect()

CPU times: user 3.83 ms, sys: 5.81 ms, total: 9.64 ms
Wall time: 21.5 s


[Row(NOMBREENTIDAD='U2 - Su Bus', count=2197145),
 Row(NOMBREENTIDAD='U7 - STP', count=1364513),
 Row(NOMBREENTIDAD='METRO - OT', count=13169085),
 Row(NOMBREENTIDAD='Tren Central', count=381641),
 Row(NOMBREENTIDAD='U6 - Redbus', count=1700924),
 Row(NOMBREENTIDAD='U3 - Vule', count=3106718),
 Row(NOMBREENTIDAD='U4 - Express', count=3027622),
 Row(NOMBREENTIDAD='U5 - Metropolitana', count=3359492)]

## Using parquet (in snappy)

In [18]:
path_parquet = "datasets/20190330.parquet/"
df = spark.read.load(path_parquet)

In [19]:
%%time
df.groupBy("NOMBREENTIDAD").count().collect()

CPU times: user 4.13 ms, sys: 451 µs, total: 4.58 ms
Wall time: 2.37 s


[Row(NOMBREENTIDAD='U2 - Su Bus', count=2197145),
 Row(NOMBREENTIDAD='U7 - STP', count=1364513),
 Row(NOMBREENTIDAD='METRO - OT', count=13169085),
 Row(NOMBREENTIDAD='Tren Central', count=381641),
 Row(NOMBREENTIDAD='U6 - Redbus', count=1700924),
 Row(NOMBREENTIDAD='U3 - Vule', count=3106718),
 Row(NOMBREENTIDAD='U4 - Express', count=3027622),
 Row(NOMBREENTIDAD='U5 - Metropolitana', count=3359492)]

In [8]:
%%time
df.write.parquet("datasets/20190330-2.parquet", compression="gzip")

CPU times: user 13.8 ms, sys: 0 ns, total: 13.8 ms
Wall time: 1min 48s


In [10]:
!du -hs datasets/*

2,9G	datasets/20190311
1,2G	datasets/20190311.zip
3,1G	datasets/20190330
1,1G	datasets/20190330-2.parquet
1,1G	datasets/20190330.7z
1,9G	datasets/20190330.parquet
1,3G	datasets/20190330.zip


## Using parquet (in gzip)

In [6]:
path_parquet = "datasets/20190330-2.parquet/"
df = spark.read.load(path_parquet)

In [8]:
%%time
df.groupBy("NOMBREENTIDAD").count().collect()

CPU times: user 4.3 ms, sys: 6.49 ms, total: 10.8 ms
Wall time: 3.07 s


[Row(NOMBREENTIDAD='U2 - Su Bus', count=2197145),
 Row(NOMBREENTIDAD='U7 - STP', count=1364513),
 Row(NOMBREENTIDAD='METRO - OT', count=13169085),
 Row(NOMBREENTIDAD='Tren Central', count=381641),
 Row(NOMBREENTIDAD='U6 - Redbus', count=1700924),
 Row(NOMBREENTIDAD='U3 - Vule', count=3106718),
 Row(NOMBREENTIDAD='U4 - Express', count=3027622),
 Row(NOMBREENTIDAD='U5 - Metropolitana', count=3359492)]

In [7]:
%%time
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('FECHAHORATRX', 0), ('CODIGOENTIDAD', 0), ('NOMBREENTIDAD', 0), ('CODIGOSITIO', 0), ('NOMBRESITIO', 0), ('NROTARJETA', 0)]
CPU times: user 6.19 ms, sys: 218 µs, total: 6.41 ms
Wall time: 2.35 s


In [9]:
df.schema

StructType(List(StructField(FECHAHORATRX,StringType,true),StructField(CODIGOENTIDAD,StringType,true),StructField(NOMBREENTIDAD,StringType,true),StructField(CODIGOSITIO,StringType,true),StructField(NOMBRESITIO,StringType,true),StructField(NROTARJETA,StringType,true)))