In [1]:
import re
from io import BytesIO
from boto3 import resource
from zipfile import ZipFile
from datetime import datetime

In [2]:
import findspark
findspark.init()

In [3]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import Row, StructField, StructType, StringType, DateType, IntegerType

In [4]:
conf = SparkConf().setMaster("local[4]").setAppName("transport")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

In [5]:
bucket_name = "dtpm-transactions"
s3 = resource("s3")
bucket = s3.Bucket(bucket_name)

In [6]:
for obj in bucket.objects.all():
    # Do not convert the respaldo or another folder.
    if bool(re.match(r"^[0-9]*.zip", obj.key)):
        print(obj.key)
        response = obj.get()
        dataset = response['Body'].read()
        # Uncompress .zip
        # Validate zip structure
        # Validate do not exist files with the same name

        # Create RDD or DataFrame

        # Build eschema

        # Save as parquet
        break

20180818.zip


In [7]:
with BytesIO(dataset) as tf:
    tf.seek(0)

    # Read the file as a zipfile and process the members
    with ZipFile(tf, mode='r') as zipf:
        # for subfile in zipf.namelist():
        #     print(subfile)
        data = {name: zipf.read(name) for name in zipf.namelist()}

In [8]:
file = 'home/telefonica/shared/filtered_Lst_Usos_20180812_20180814.CSV'

csv = BytesIO(data[file]).read().decode('cp1252').split('\n')

In [9]:
%%time
# rdd = sc.parallelize(csv).map(lambda a: a.split(";")[:-1])
rdd = sc.parallelize(BytesIO(data[file]).read().decode('cp1252').split('\n')).map(lambda a: a.split(";")[:-1])

CPU times: user 1.68 s, sys: 455 ms, total: 2.14 s
Wall time: 2.61 s


In [12]:
type(sc._conf)

pyspark.conf.SparkConf

In [13]:
sc._conf.getAll()

[('spark.master', 'local[4]'),
 ('spark.rdd.compress', 'True'),
 ('spark.app.name', 'transport'),
 ('spark.serializer.objectStreamReset', '100'),
 ('spark.executor.id', 'driver'),
 ('spark.submit.deployMode', 'client'),
 ('spark.app.id', 'local-1554245490907'),
 ('spark.driver.memory', '16g'),
 ('spark.ui.showConsoleProgress', 'true'),
 ('spark.driver.port', '33319'),
 ('spark.driver.host', '192.168.0.18')]

In [14]:
header = rdd.first()
header

['FECHAHORATRX',
 'CODIGOENTIDAD',
 'NOMBREENTIDAD',
 'CODIGOSITIO',
 'NOMBRESITIO',
 'NROTARJETA']

In [15]:
rdd.count()

2668726

In [16]:
rdd = rdd.filter(lambda row: row != header and len(row) == 6)

In [17]:
rdd.count()

2668724

In [18]:
schema = StructType([StructField("fechahoratrx", DateType(), False),
                     StructField("codigoentidad", IntegerType(), False),
                     StructField("nombreentidad", StringType(), False),
                     StructField("codigositio", IntegerType(), False),
                     StructField("nombresitio", StringType(), False),
                     StructField("nrotarjeta", StringType(), False)])

In [19]:
rdd.take(5)

[['05/08/2018 11:09:04',
  '5',
  'U5 - Metropolitana',
  '5896',
  'FLXR-60',
  '7c842ff9dfcdd0af7ab54c7b61b9b20c0d96efc1b4d7c860005b5906b7b10386'],
 ['05/08/2018 11:22:25',
  '5',
  'U5 - Metropolitana',
  '5896',
  'FLXR-60',
  '5b92190b6a286b211e09c56f58265ed33afb83323be553bc60f26c44ccd77b2e'],
 ['05/08/2018 11:38:20',
  '5',
  'U5 - Metropolitana',
  '5896',
  'FLXR-60',
  '011f68818a273394cbdd1ebd4d9031bdd73aab93bc5645b3ef157ea337d36b4c'],
 ['05/08/2018 11:38:23',
  '5',
  'U5 - Metropolitana',
  '5896',
  'FLXR-60',
  '1d220bc9fa949b093da03b4a3824235de9a3cfb6200aa9406107a962fa744741'],
 ['05/08/2018 12:38:59',
  '5',
  'U5 - Metropolitana',
  '5896',
  'FLXR-60',
  '655b6f560616f41a622e50f7bb8856bca2a32a2afc8eebacd61c7a6cec4de71c']]

In [20]:
rdd = rdd.map(lambda x: [datetime.strptime(x[0], "%d/%m/%Y %H:%M:%S"), int(x[1]), x[2], int(x[3]), x[4], x[5]])

In [21]:
rdd.take(5)

[[datetime.datetime(2018, 8, 5, 11, 9, 4),
  5,
  'U5 - Metropolitana',
  5896,
  'FLXR-60',
  '7c842ff9dfcdd0af7ab54c7b61b9b20c0d96efc1b4d7c860005b5906b7b10386'],
 [datetime.datetime(2018, 8, 5, 11, 22, 25),
  5,
  'U5 - Metropolitana',
  5896,
  'FLXR-60',
  '5b92190b6a286b211e09c56f58265ed33afb83323be553bc60f26c44ccd77b2e'],
 [datetime.datetime(2018, 8, 5, 11, 38, 20),
  5,
  'U5 - Metropolitana',
  5896,
  'FLXR-60',
  '011f68818a273394cbdd1ebd4d9031bdd73aab93bc5645b3ef157ea337d36b4c'],
 [datetime.datetime(2018, 8, 5, 11, 38, 23),
  5,
  'U5 - Metropolitana',
  5896,
  'FLXR-60',
  '1d220bc9fa949b093da03b4a3824235de9a3cfb6200aa9406107a962fa744741'],
 [datetime.datetime(2018, 8, 5, 12, 38, 59),
  5,
  'U5 - Metropolitana',
  5896,
  'FLXR-60',
  '655b6f560616f41a622e50f7bb8856bca2a32a2afc8eebacd61c7a6cec4de71c']]

In [22]:
df = sqlContext.createDataFrame(rdd, schema)
df.show()

+------------+-------------+------------------+-----------+-----------+--------------------+
|fechahoratrx|codigoentidad|     nombreentidad|codigositio|nombresitio|          nrotarjeta|
+------------+-------------+------------------+-----------+-----------+--------------------+
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|7c842ff9dfcdd0af7...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|5b92190b6a286b211...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|011f68818a273394c...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|1d220bc9fa949b093...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|655b6f560616f41a6...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|af7438f07ec20a016...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60|4c0f58654923c35e6...|
|  2018-08-05|            5|U5 - Metropolitana|       5896|    FLXR-60

In [24]:
df.printSchema()

root
 |-- fechahoratrx: date (nullable = false)
 |-- codigoentidad: integer (nullable = false)
 |-- nombreentidad: string (nullable = false)
 |-- codigositio: integer (nullable = false)
 |-- nombresitio: string (nullable = false)
 |-- nrotarjeta: string (nullable = false)



In [25]:
df = rdd.toDF(list(map(lambda a: a.lower(), header)))
df.show()

+-------------------+-------------+------------------+-----------+-----------+--------------------+
|       fechahoratrx|codigoentidad|     nombreentidad|codigositio|nombresitio|          nrotarjeta|
+-------------------+-------------+------------------+-----------+-----------+--------------------+
|2018-08-05 11:09:04|            5|U5 - Metropolitana|       5896|    FLXR-60|7c842ff9dfcdd0af7...|
|2018-08-05 11:22:25|            5|U5 - Metropolitana|       5896|    FLXR-60|5b92190b6a286b211...|
|2018-08-05 11:38:20|            5|U5 - Metropolitana|       5896|    FLXR-60|011f68818a273394c...|
|2018-08-05 11:38:23|            5|U5 - Metropolitana|       5896|    FLXR-60|1d220bc9fa949b093...|
|2018-08-05 12:38:59|            5|U5 - Metropolitana|       5896|    FLXR-60|655b6f560616f41a6...|
|2018-08-05 12:47:27|            5|U5 - Metropolitana|       5896|    FLXR-60|af7438f07ec20a016...|
|2018-08-05 12:47:37|            5|U5 - Metropolitana|       5896|    FLXR-60|4c0f58654923c35e6...|


In [26]:
df.printSchema()

root
 |-- fechahoratrx: timestamp (nullable = true)
 |-- codigoentidad: long (nullable = true)
 |-- nombreentidad: string (nullable = true)
 |-- codigositio: long (nullable = true)
 |-- nombresitio: string (nullable = true)
 |-- nrotarjeta: string (nullable = true)



In [27]:
import pandas as pd

In [28]:
df_p = pd.read_csv(BytesIO(data[file]), sep=";", encoding="cp1252", usecols=[i for i in range(6)])

In [29]:
df_p.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668724 entries, 0 to 2668723
Data columns (total 6 columns):
FECHAHORATRX     object
CODIGOENTIDAD    int64
NOMBREENTIDAD    object
CODIGOSITIO      int64
NOMBRESITIO      object
NROTARJETA       object
dtypes: int64(2), object(4)
memory usage: 122.2+ MB


In [31]:
df_toPandas = df.toPandas()

In [32]:
df_toPandas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2668724 entries, 0 to 2668723
Data columns (total 6 columns):
fechahoratrx     datetime64[ns]
codigoentidad    int64
nombreentidad    object
codigositio      int64
nombresitio      object
nrotarjeta       object
dtypes: datetime64[ns](1), int64(2), object(3)
memory usage: 122.2+ MB


In [None]:
df.toPandas() == df_p

In [19]:
sc.stop()