In [8]:
import re
from io import BytesIO, StringIO
from boto3 import resource
from zipfile import ZipFile
from datetime import datetime
import pandas as pd

In [27]:
import findspark
findspark.init()

In [28]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, DateType, IntegerType

In [39]:
conf = SparkConf().setMaster("local[4]").setAppName("transport")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [2]:
bucket_name = "dtpm-transactions"
s3 = resource("s3")
bucket = s3.Bucket(bucket_name)

In [2]:
files = []
path = "datasets/20190330.zip"    
# Read the file as a zipfile and process the members
with ZipFile(path, mode='r') as zipf:
    # for subfile in zipf.namelist():
    #     print(subfile)
    files = [(name, zipf.read(name)) for name in zipf.namelist()]

In [4]:
%%time
files = list(map(lambda file: (file[0], to_parquet(file[1])), files))

CPU times: user 4min 45s, sys: 4.75 s, total: 4min 50s
Wall time: 4min 50s


In [5]:
len(files)

6

In [15]:
files[5][0]

'home/telefonica/shared/filtered_Lst_Usos_20190326_20190328.CSV'

In [6]:
%%time
files = []
path = "datasets/20190330.zip"    
# Read the file as a zipfile and process the members
with ZipFile(path, mode='r') as zipf:
    # for subfile in zipf.namelist():
    #     print(subfile)
    files = [(name, to_parquet(zipf.read(name))) for name in zipf.namelist()]

CPU times: user 3min 5s, sys: 5.1 s, total: 3min 10s
Wall time: 3min 10s


In [None]:
def transform_dataset(zip_dataset):
    """
    Convert zip dataset into multiple parquet files.
    """
    files = []
    
    # Decompress .zip
    with BytesIO(zip_dataset) as tf:
        tf.seek(0)

        # Read the file as a zipfile and process the members
        with ZipFile(tf, mode='r') as zipf:
            # for subfile in zipf.namelist():
            #     print(subfile)
            files = [(name, zipf.read(name)) for name in zipf.namelist()]

In [5]:
def load_trx(csv):
    """
    Load .csv transactions dataset in pandas DataFrame.
    """
    dtype = {
        'CODIGOENTIDAD': 'int',
        'NOMBREENTIDAD': 'object',
        'CODIGOSITIO': 'int',
        'NOMBRESITIO': 'object',
        'NROTARJETA': 'object'
    }
    df = pd.read_csv(BytesIO(csv), sep=";", encoding="cp1252", usecols=[i for i in range(6)], dtype=dtype)
    df["FECHAHORATRX"] = pd.to_datetime(df["FECHAHORATRX"], format="%d/%m/%Y %H:%M:%S", errors='coerce')
    df.columns = [x.lower() for x in df.columns]
    return df

In [4]:
files[0][0]

'home/telefonica/shared/filtered_Lst_Usos_20190321_20190323.CSV'

In [52]:
# path = "datasets/day=20190330/from=20190321/to=20190323/data.parquet"
path = "datasets/20190330-pandas-2.parquet"

In [53]:
files[0][1].to_parquet(path, compression="gzip")

In [54]:
%%time
df = sqlContext.read.load(path)

CPU times: user 2.15 ms, sys: 47 µs, total: 2.2 ms
Wall time: 106 ms


In [55]:
df.printSchema()

root
 |-- fechahoratrx: timestamp (nullable = true)
 |-- codigoentidad: long (nullable = true)
 |-- nombreentidad: string (nullable = true)
 |-- codigositio: long (nullable = true)
 |-- nombresitio: string (nullable = true)
 |-- nrotarjeta: string (nullable = true)



In [56]:
df.show()

+-------------------+-------------+-------------+-----------+-----------+--------------------+
|       fechahoratrx|codigoentidad|nombreentidad|codigositio|nombresitio|          nrotarjeta|
+-------------------+-------------+-------------+-----------+-----------+--------------------+
|2019-03-04 05:03:18|            4| U4 - Express|       5481|    ZN-6170|b53396bd54f1a9074...|
|2019-03-04 05:04:36|            4| U4 - Express|       5481|    ZN-6170|e36d3cc7c504835ee...|
|2019-03-04 05:11:24|            4| U4 - Express|       5481|    ZN-6170|798decfe9fba0f696...|
|2019-03-04 05:16:33|            4| U4 - Express|       5481|    ZN-6170|041eccfdbe14a4864...|
|2019-03-04 05:28:24|            4| U4 - Express|       5481|    ZN-6170|617dea27d7ec8be7f...|
|2019-03-04 05:34:14|            4| U4 - Express|       5481|    ZN-6170|8c8b6eb2c34befa2f...|
|2019-03-04 05:39:49|            4| U4 - Express|       5481|    ZN-6170|3c5211b4589104bda...|
|2019-03-04 05:57:07|            4| U4 - Express| 

In [57]:
%%time
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('fechahoratrx', 0), ('codigoentidad', 0), ('nombreentidad', 0), ('codigositio', 0), ('nombresitio', 0), ('nrotarjeta', 0)]
CPU times: user 5.31 ms, sys: 102 µs, total: 5.41 ms
Wall time: 416 ms


### Without Arrow

In [40]:
%%time
df = sqlContext.read.load(path)

CPU times: user 2.33 ms, sys: 0 ns, total: 2.33 ms
Wall time: 142 ms


In [58]:
df.printSchema()

root
 |-- fechahoratrx: timestamp (nullable = true)
 |-- codigoentidad: long (nullable = true)
 |-- nombreentidad: string (nullable = true)
 |-- codigositio: long (nullable = true)
 |-- nombresitio: string (nullable = true)
 |-- nrotarjeta: string (nullable = true)



In [42]:
%%time
missing_values = [(column, df.where(df[column].isNull()).count()) for column in df.columns]
print(missing_values)

[('fechahoratrx', 0), ('codigoentidad', 0), ('nombreentidad', 0), ('codigositio', 0), ('nombresitio', 0), ('nrotarjeta', 0)]
CPU times: user 1.9 ms, sys: 7.96 ms, total: 9.86 ms
Wall time: 559 ms


### Save in S3

In [7]:
bucket_name = "transantiago/"
s3 = resource("s3")
bucket = s3.Bucket(bucket_name)
path = bucket_name + "test/"
filename = "20190330.parquet"

In [1]:
1 + 1

2

In [2]:
%%time
csv_buffer = StringIO()
files[0][1].to_parquet(csv_buffer, compression='gzip')

NameError: name 'StringIO' is not defined

In [None]:
%%time
s3.Object(path, filename).put(Body=csv_buffer.getvalue())

In [None]:
def to_s3(dataset, bucket, file_name):
    csv_buffer = StringIO()
    files[0][1].to_parquet(csv_buffer, compression='gzip')
    s3.Object(path, filename).put(Body=csv_buffer.getvalue())