In [1]:
# Initialise findspark first; the pyspark imports in the next cell come after
# this call (presumably so that pyspark is importable -- confirm for your setup).
import findspark
findspark.init()

In [2]:
import os
from io import BytesIO
from zipfile import ZipFile
from datetime import datetime
import re
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [3]:
# Run Spark locally ("local[4]") under the application name "transport".
conf = SparkConf().setMaster("local[4]").setAppName("transport")
# getOrCreate keeps this cell re-runnable: constructing SparkContext(conf=conf)
# a second time in the same kernel fails because a context is already active.
sc = SparkContext.getOrCreate(conf=conf)

sqlContext = SQLContext(sc)

In [4]:
def build_filename(raw_name):
    """
    Build a partition-style relative path from the numbers embedded in a name.

    The last three runs of digits found in ``raw_name`` are used, in order, as
    the ``day``, ``from`` and ``to`` values. Taking the *last* three keeps the
    function working when earlier parts of the name (for example directory
    names in a full path) also contain digits.

    :param raw_name: text containing at least three runs of digits.
    :return: the string ``"day=<day>/from=<from>/to=<to>"``.
    :raises ValueError: if fewer than three runs of digits are present.
    """
    numbers = re.findall(r'\d+', raw_name)
    if len(numbers) < 3:
        raise ValueError(
            "expected at least 3 numeric groups in %r, found %d"
            % (raw_name, len(numbers)))
    day, from_day, to_day = numbers[-3:]
    return "day=%s/from=%s/to=%s" % (day, from_day, to_day)

In [5]:
def extract_files(compressed_name, stream):
    """
    Read every file stored in an in-memory .zip archive.

    :param compressed_name: name of the archive; it is prepended to each member
        name before the result is passed to ``build_filename``.
    :param stream: raw bytes of the .zip archive.
    :return: a list of ``(build_filename(...), member_bytes)`` tuples, one per
        file in the archive. Directory entries are skipped.
    """
    # A freshly created BytesIO is already positioned at offset 0, so no seek
    # is needed before handing it to ZipFile.
    with BytesIO(stream) as buffer, ZipFile(buffer, mode='r') as zipf:
        return [
            (build_filename(compressed_name + file_name), zipf.read(file_name))
            for file_name in zipf.namelist()
            # Directory entries (names ending in '/') have no content to read.
            if not file_name.endswith('/')
        ]

In [6]:
# NOTE: row[0] (the timestamp) is kept as a string; to parse it use datetime.strptime(row[0], "%d/%m/%Y %H:%M:%S")
def prepare_csv(file_name, table):
    """
    Turn parsed CSV rows into flat records tagged with their source file.

    Each record is ``[file_name, row[0], int(row[1]), row[2], int(row[3]),
    row[4], row[5]]``: columns 1 and 3 are cast to ``int`` and the remaining
    columns are passed through unchanged.

    :param file_name: value placed in the first position of every record.
    :param table: iterable of rows, each indexable with at least six items.
    :return: list of seven-element lists, one per input row.
    """
    records = []
    for row in table:
        records.append([
            file_name,
            row[0],
            int(row[1]),
            row[2],
            int(row[3]),
            row[4],
            row[5],
        ])
    return records

In [7]:
# Glob pattern selecting the zip archives to ingest.
path = 'datasets/test-folder/*.zip'
# Each element is a two-item pair handed to extract_files as
# (compressed_name, stream); flatMap expands every archive into its members.
rdd = sc.binaryFiles(path).flatMap(lambda archive: extract_files(*archive))
# Print the RDD lineage for inspection.
print(rdd.toDebugString().decode())

(3) PythonRDD[1] at RDD at PythonRDD.scala:53 []
 |  datasets/test-folder/*.zip BinaryFileRDD[0] at binaryFiles at NativeMethodAccessorImpl.java:0 []


In [8]:
# Show how many partitions the RDD has.
rdd.getNumPartitions()

3

In [9]:
# Decode each member's bytes as cp1252 text and break it into a list of lines.
# The bytes are decoded directly; wrapping them in BytesIO and reading them
# back only produced an extra copy of the same data.
# split('\n') is kept on purpose: a file ending in a newline yields a trailing
# empty string, which is removed further down when the header row is dropped.
rdd = rdd.mapValues(lambda raw: raw.decode('cp1252').split('\n'))

In [10]:
# Split every line on ';' and discard the piece after the final separator.
rdd = rdd.mapValues(lambda lines: [line.split(";")[:-1] for line in lines])

In [11]:
# Drop the header row, then drop every empty row.
# Empty rows are filtered by content rather than by slicing off the last
# element: a file without a trailing newline has no empty final row, and an
# unconditional table[1:-1] would silently discard its last real record.
rdd = rdd.mapValues(lambda table: [row for row in table[1:] if row])

In [12]:
# Flatten the per-file tables into one record per row, each tagged with its file key.
rdd = rdd.flatMap(lambda keyed_table: prepare_csv(*keyed_table))

In [15]:
# Column names for the DataFrame: the upper-case field names, lower-cased.
header = [
    column.lower()
    for column in (
        'FILE_NAME',
        'FECHAHORATRX',
        'CODIGOENTIDAD',
        'NOMBREENTIDAD',
        'CODIGOSITIO',
        'NOMBRESITIO',
        'NROTARJETA',
    )
]

In [16]:
%%time
df = rdd.toDF(header)

CPU times: user 245 ms, sys: 26.6 ms, total: 272 ms
Wall time: 48.9 s


In [17]:
# Inspect the column names and types of the DataFrame.
df.printSchema()

root
 |-- file_name: string (nullable = true)
 |-- fechahoratrx: string (nullable = true)
 |-- codigoentidad: long (nullable = true)
 |-- nombreentidad: string (nullable = true)
 |-- codigositio: long (nullable = true)
 |-- nombresitio: string (nullable = true)
 |-- nrotarjeta: string (nullable = true)



In [18]:
%%time
# Preview the first rows of the DataFrame.
df.show()

+--------------------+-------------------+-------------+-------------+-----------+-----------+--------------------+
|           file_name|       fechahoratrx|codigoentidad|nombreentidad|codigositio|nombresitio|          nrotarjeta|
+--------------------+-------------------+-------------+-------------+-----------+-----------+--------------------+
|day=20190330/from...|04/03/2019 08:03:18|            4| U4 - Express|       5481|    ZN-6170|b53396bd54f1a9074...|
|day=20190330/from...|04/03/2019 08:04:36|            4| U4 - Express|       5481|    ZN-6170|e36d3cc7c504835ee...|
|day=20190330/from...|04/03/2019 08:11:24|            4| U4 - Express|       5481|    ZN-6170|798decfe9fba0f696...|
|day=20190330/from...|04/03/2019 08:16:33|            4| U4 - Express|       5481|    ZN-6170|041eccfdbe14a4864...|
|day=20190330/from...|04/03/2019 08:28:24|            4| U4 - Express|       5481|    ZN-6170|617dea27d7ec8be7f...|
|day=20190330/from...|04/03/2019 08:34:14|            4| U4 - Express|  

In [19]:
%%time
days = [file.file_name for file in df.select('file_name').distinct().collect()]
days

CPU times: user 29.2 ms, sys: 68.8 ms, total: 97.9 ms
Wall time: 3min 41s


In [20]:
%%time
for directory in days:
    if not os.path.exists(directory):
        os.makedirs(directory)
    df_day = df.select(df.columns[1:]).where(df.file_name == directory)
    df_day.write.parquet(directory + "/data.parquet", compression="gzip")

CPU times: user 150 ms, sys: 1.11 s, total: 1.26 s
Wall time: 44min 47s
