# Exercises

## Question 0

In [8]:
import os
from tempfile import NamedTemporaryFile, TemporaryDirectory
import json
import tarfile
from shutil import copyfile

In [9]:
def download_blob(blob: str) -> TemporaryDirectory:
    """Extract a gzip-compressed tar archive into a new temporary directory.

    Args:
        blob: Path of the ``.tar.gz`` archive to extract.

    Returns:
        The ``TemporaryDirectory`` object that holds the extracted files.
        Its ``name`` attribute is the directory path. The caller owns the
        object and should call ``cleanup()`` on it (or use it as a context
        manager) once the extracted files are no longer needed.

    Raises:
        tarfile.TarError: If ``blob`` cannot be read as a gzip tar archive.
        OSError: If ``blob`` cannot be opened or the files cannot be written.
    """
    temp_dir = TemporaryDirectory()
    try:
        with tarfile.open(blob, "r:gz") as tf:
            # NOTE(review): extractall() trusts the member paths stored in the
            # archive; if blobs can come from an untrusted origin, pass an
            # extraction filter (Python 3.12+) or validate members first.
            tf.extractall(path=temp_dir.name)
    except BaseException:
        # Do not leave a half-populated directory behind when extraction fails.
        temp_dir.cleanup()
        raise
    return temp_dir

def make_tarfile(output_filename, source_dir):
    """Pack a directory into a gzip-compressed tar archive.

    The directory is stored under its own base name, so the archive contains
    a single top-level folder named after ``source_dir``.

    Args:
        output_filename: Path of the ``.tar.gz`` file to create (overwritten
            if it already exists).
        source_dir: Directory (or file) to add to the archive.
    """
    # normpath() drops a trailing separator; without it basename("dir/") is ""
    # and the contents would be stored without their top-level folder.
    top_level_name = os.path.basename(os.path.normpath(source_dir))
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=top_level_name)

In [10]:
path = 'blobs/'
blobs_files = os.listdir(path)
# Create the output folder if it does not exist already.
target_directory = 'clean_blobs_1/'
os.makedirs(target_directory, exist_ok=True)

# Iterate through all compressed blobs: a blob whose metadata 'check_name'
# does not match the prefix of its file name is repacked with the corrected
# name; every other blob is copied to the target directory unchanged.
for blob_name in blobs_files:
    blob_path = os.path.join(path, blob_name)
    target_file = os.path.join(target_directory, blob_name)

    # The expected check name is the part of the file name before the first '-'.
    name = blob_name.split('-')[0]

    temp_dir = download_blob(blob_path)
    try:
        meta_path = os.path.join(temp_dir.name, "metadata.json")
        result_path = os.path.join(temp_dir.name, "result.json")

        with open(meta_path, "r") as f:
            meta = json.load(f)

        with open(result_path, "r") as f:
            result = json.load(f)

        if meta['check_name'] != name:
            meta['check_name'] = name

            # Rewrite both JSON files inside the extraction directory rather
            # than in the current working directory, so no stray
            # metadata.json / result.json files are left behind.
            with open(meta_path, 'w') as outfile:
                json.dump(meta, outfile)

            with open(result_path, 'w') as outfile:
                json.dump(result, outfile)

            # arcname keeps both members at the root of the new archive.
            with tarfile.open(target_file, "w:gz") as tar:
                tar.add(meta_path, arcname='metadata.json')
                tar.add(result_path, arcname='result.json')
        else:
            copyfile(blob_path, target_file)
    finally:
        # Remove the extracted files as soon as this blob has been handled.
        temp_dir.cleanup()

    

## Question 2

A typical Beam driver program works as follows:

1. Create a Pipeline object and set the pipeline execution options, including the Pipeline Runner.
2. Create an initial PCollection for pipeline data, either using the IOs to read data from an external storage system, or using a Create transform to build a PCollection from in-memory data.
3. Apply PTransforms to each PCollection. Transforms can change, filter, group, analyze, or otherwise process the elements in a PCollection. A transform creates a new output PCollection without modifying the input collection. A typical pipeline applies subsequent transforms to each new output PCollection in turn until processing is complete. However, note that a pipeline does not have to be a single straight line of transforms applied one after another: think of PCollections as variables and PTransforms as functions applied to these variables: the shape of the pipeline can be an arbitrarily complex processing graph.
4. Use IOs to write the final, transformed PCollection(s) to an external storage system (sink).
5. Run the pipeline using the designated Pipeline Runner.

In [5]:
import apache_beam as beam
from apache_beam.options.pipeline_options import PipelineOptions


In [None]:
# Create a PipelineOptions instance without explicit arguments and build the
# Beam pipeline object from it.
options = PipelineOptions()
pipeline = beam.Pipeline(options=options)

class MyOptions(PipelineOptions):
    """Pipeline options that add ``--input`` and ``--output`` arguments."""

    @classmethod
    def _add_argparse_args(cls, parser):
        """Register the custom command-line arguments on ``parser``.

        Args:
            parser: Argument parser on which ``add_argument`` is called for
                each custom option.
        """
        parser.add_argument('--input',
                            help='Input for the pipeline',
                            default='./blobs/')
        parser.add_argument('--output',
                            help='Output for the pipeline',
                            default='./clean_blobs_2/')