In [1]:
# Bootstrap step: findspark.init() is called before any `pyspark` import in the
# following cells so that the PySpark package can be found by the interpreter.
import findspark
findspark.init()

In [26]:
from zipfile import ZipFile
from datetime import datetime
from io import BytesIO
import re
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

In [3]:
# Local Spark master with 4 worker threads, application name "transport".
conf = SparkConf().setMaster("local[4]").setAppName("transport")
# getOrCreate() reuses an already-running context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-executed in
# a live kernel. Note that `conf` is only applied when a new context is created.
sc = SparkContext.getOrCreate(conf=conf)

# SQL entry point bound to the same SparkContext.
sqlContext = SQLContext(sc)

In [31]:
def build_file_name(raw_name):
    """
    Build a partition-style relative path from a "<zip path><member name>" string.

    The last three runs of digits found in ``raw_name`` are read, in order, as
    the day, the start of the range and the end of the range. Only the last
    three runs are used so that digits appearing earlier in the path (for
    example in a user or directory name) do not break the parsing.

    :param raw_name: string such as
        ``".../20190311.ziphome/.../filtered_Lst_Usos_20190307_20190309.CSV"``.
    :return: ``"day=<day>/from=<from>/to=<to>"``.
    :raises ValueError: if ``raw_name`` contains fewer than three runs of digits.
    """
    numbers = re.findall(r'\d+', raw_name)
    if len(numbers) < 3:
        raise ValueError(
            "expected at least 3 numeric groups in %r, found %d"
            % (raw_name, len(numbers))
        )
    day, from_day, to_day = numbers[-3:]
    return "day=%s/from=%s/to=%s" % (day, from_day, to_day)

In [23]:
def extract_files(compressed_name, stream):
    """
    Read every file stored in an in-memory .zip archive.

    :param compressed_name: name/path of the archive; it is prepended as-is
        (no separator is inserted) to each member name.
    :param stream: raw bytes of the .zip archive.
    :return: list of ``(compressed_name + member_name, member_bytes)`` tuples,
        in archive order. Directory entries are skipped because they carry no
        data to parse.
    """
    with BytesIO(stream) as buffer:
        # Read the buffer as a zipfile and process the members
        with ZipFile(buffer, mode='r') as zipf:
            return [
                (compressed_name + member.filename, zipf.read(member))
                for member in zipf.infolist()
                if not member.is_dir()
            ]

In [24]:
# Glob matching every .zip archive in the test folder.
path = 'datasets/test-folder/*.zip'
# Each binaryFiles record is unpacked straight into extract_files'
# two positional arguments.
rdd = sc.binaryFiles(path).map(lambda pair: extract_files(*pair))
# Show the RDD lineage.
print(rdd.toDebugString().decode())

(1) PythonRDD[15] at RDD at PythonRDD.scala:53 []
 |  datasets/test-folder/*.zip BinaryFileRDD[14] at binaryFiles at NativeMethodAccessorImpl.java:0 []


In [27]:
# Scratch check: run the same r'\d+' pattern that build_file_name uses against a
# sample "<zip path><member name>" string to see which digit runs it yields.
re.findall(r'\d+', 'file:/home/nicolas/github/improve_transport/datasets/test-folder/20190311.ziphome/telefonica/shared/filtered_Lst_Usos_20190307_20190309.CSV')

['20190311', '20190307', '20190309']

In [25]:
# Bring every record of the RDD back to the driver for inspection.
# NOTE(review): this returns the full file contents; fine for the test folder,
# likely too heavy for the real dataset.
rdd.collect()

[[('file:/home/nicolas/github/improve_transport/datasets/test-folder/20190311.ziphome/telefonica/shared/filtered_Lst_Usos_20190307_20190309.CSV',
   b'FECHAHORATRX;CODIGOENTIDAD;NOMBREENTIDAD;CODIGOSITIO;NOMBRESITIO;NROTARJETA;\n01/03/2019 01:20:44;16;U3 - Vule;2308;BJFZ-75;37d3a2cd6ba280f3a3f4652d7cc6690bd638dda0611c98c4040dd6dc7c1b895e;\n01/03/2019 01:20:47;16;U3 - Vule;2308;BJFZ-75;7464f0fb6fe00e3472ae7c74b33053ac8c1e8fa50d59c0c25463d4e2198c8fb6;\n01/03/2019 01:20:50;16;U3 - Vule;2308;BJFZ-75;3ce1dbdd2d904a9531634caeebfb4c70301aabdd87c7a2392800b508e684c092;\n01/03/2019 01:20:53;16;U3 - Vule;2308;BJFZ-75;dd9730ec384f81487a60171a2244a445ed13fa92f9c71243ef00b5f01367633f;\n01/03/2019 01:21:00;16;U3 - Vule;2308;BJFZ-75;7f4ded632ab214988891f77786cea55bf6fbe52c62476956133244e6500cbeba;\n01/03/2019 01:21:06;16;U3 - Vule;2308;BJFZ-75;0a9c6837b38eddffbddd0e27f05faa28615d4eb84cc4c10ae9763bfb5d4872ab;\n01/03/2019 01:21:10;16;U3 - Vule;2308;BJFZ-75;8ff9b3bc9e295708522b091ff36854599d62192a27d038f

In [14]:
# Check how many partitions the RDD is split into.
rdd.getNumPartitions()

1

In [9]:
def csv_to_df(dict_files, sc):
    """
    Walk over in-memory CSV payloads and build a field-split RDD for each one.

    Work in progress: the RDD is built but not stored, so ``dict_files`` is
    returned unchanged (the DataFrame conversion is still disabled below).
    Because it calls ``sc.parallelize``, this function has to run on the
    driver; a SparkContext cannot be referenced from inside an RDD
    transformation (see SPARK-5063).

    :param dict_files: mapping of file name -> raw CSV bytes.
    :param sc: SparkContext used to parallelize the decoded lines.
    :return: the same ``dict_files`` object that was passed in.
    """
    for file_name, payload in dict_files.items():
        print(file_name)
        text = BytesIO(payload).read().decode('cp1252')
        lines = text.split('\n')
        # Split on ';' and drop the last piece of every line.
        rdd = sc.parallelize(lines).map(lambda line: line.split(";")[:-1])
        """header = rdd.first()
        rdd = rdd.filter(lambda row: row != header and len(row) == 6)    
        rdd = rdd.map(lambda x: [datetime.strptime(x[0], "%d/%m/%Y %H:%M:%S"), int(x[1]), x[2], int(x[3]), x[4], x[5]])
        df = rdd.toDF(list(map(lambda a: a.lower(), header)))
        dict_files[file_name] = df"""
    return dict_files

In [21]:
# Keep only the element at index 1 of every RDD record and collect the result
# on the driver.
rdd.map(lambda s: s[1]).collect() 

[[('home/telefonica/shared/filtered_Lst_Usos_20190307_20190309.CSV',
   b'FECHAHORATRX;CODIGOENTIDAD;NOMBREENTIDAD;CODIGOSITIO;NOMBRESITIO;NROTARJETA;\n01/03/2019 01:20:44;16;U3 - Vule;2308;BJFZ-75;37d3a2cd6ba280f3a3f4652d7cc6690bd638dda0611c98c4040dd6dc7c1b895e;\n01/03/2019 01:20:47;16;U3 - Vule;2308;BJFZ-75;7464f0fb6fe00e3472ae7c74b33053ac8c1e8fa50d59c0c25463d4e2198c8fb6;\n01/03/2019 01:20:50;16;U3 - Vule;2308;BJFZ-75;3ce1dbdd2d904a9531634caeebfb4c70301aabdd87c7a2392800b508e684c092;\n01/03/2019 01:20:53;16;U3 - Vule;2308;BJFZ-75;dd9730ec384f81487a60171a2244a445ed13fa92f9c71243ef00b5f01367633f;\n01/03/2019 01:21:00;16;U3 - Vule;2308;BJFZ-75;7f4ded632ab214988891f77786cea55bf6fbe52c62476956133244e6500cbeba;\n01/03/2019 01:21:06;16;U3 - Vule;2308;BJFZ-75;0a9c6837b38eddffbddd0e27f05faa28615d4eb84cc4c10ae9763bfb5d4872ab;\n01/03/2019 01:21:10;16;U3 - Vule;2308;BJFZ-75;8ff9b3bc9e295708522b091ff36854599d62192a27d038f5d791d6f630edffe2;\n01/03/2019 01:21:17;16;U3 - Vule;2308;BJFZ-75;7b65ee8a740

In [34]:
# NOTE(review): the recorded run of this cell failed with a PicklingError saying
# SparkContext was referenced from a transformation (SPARK-5063). Presumably
# `rdd` had been rebuilt with a function that uses `sc` on the workers (e.g.
# csv_to_df) — confirm and keep SparkContext usage on the driver only.
rdd.collect()

Traceback (most recent call last):
  File "/opt/spark/python/pyspark/serializers.py", line 587, in dumps
    return cloudpickle.dumps(obj, 2)
  File "/opt/spark/python/pyspark/cloudpickle.py", line 863, in dumps
    cp.dump(obj)
  File "/opt/spark/python/pyspark/cloudpickle.py", line 260, in dump
    return Pickler.dump(self, obj)
  File "/usr/lib/python3.6/pickle.py", line 409, in dump
    self.save(obj)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/usr/lib/python3.6/pickle.py", line 751, in save_tuple
    save(element)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self, obj) # Call unbound method with explicit self
  File "/opt/spark/python/pyspark/cloudpickle.py", line 406, in save_function
    self.save_function_tuple(obj)
  File "/opt/spark/python/pyspark/cloudpickle.py", line 549, in save_function_tuple
    save(state)
  File "/usr/lib/python3.6/pickle.py", line 476, in save
    f(self,

PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.