In [1]:
import os
import json
import re
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession, SQLContext
import pandas as pd

In [2]:
# Pipeline stage directories, relative to the notebook's working directory.
# Each value keeps its trailing slash because the code below builds paths by
# plain string concatenation (e.g. ``LANDED + x``).
LANDED = "./landed/"    # input files, one sub-directory per source
RAW = "./raw/"          # merged per-model JSON written by analyse_json
MODELED = "./modeled/"  # not referenced in the cells shown here

In [3]:
def search_json(x):
    """List the JSON files found directly inside ``LANDED + x``.

    Parameters
    ----------
    x : str
        Name of an entry under ``LANDED`` (as returned by ``os.listdir``).

    Returns
    -------
    list of str
        ``directory + '/' + name`` for every regular file whose path contains
        ``'.json'``. An empty list is returned when ``LANDED + x`` is not a
        directory, so the function can be passed to ``flatMap`` safely.
    """
    directory = LANDED + x
    if not os.path.isdir(directory):
        # Returning None here would make flatMap fail on a non-iterable.
        return []

    files = []
    for y in os.listdir(directory):
        filename = directory + '/' + y
        # Use logical `and`: the bitwise `&` binds tighter than `>`, so
        # `isfile(...) & find(...) > -1` was evaluated as
        # `(isfile(...) & find(...)) > -1`, which is always True.
        if os.path.isfile(filename) and '.json' in filename:
            files.append(filename)
    return files

In [4]:
# Configure a local Spark session for the parser.
# ``config``/``master`` return the builder, so the calls can be chained.
builder = (
    SparkSession.builder
    .appName("Parser")
    .master("local[*]")
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
    .config("spark.speculation", "false")
    .config("spark.sql.parquet.compression.codec", "gzip")
    .config("spark.debug.maxToStringFields", "100")
    .config("spark.driver.memory", "2g")
    .config("spark.driver.cores", "1")
    # The property name is dotted: "spark.executor-memory" (with a hyphen) is
    # the spelling of the spark-submit flag, not a configuration key, so the
    # value was never picked up.
    .config("spark.executor.memory", "6g")
    .config("spark.executor.cores", "5")
)

spark = builder.getOrCreate()

In [5]:
# Expand every entry under LANDED into the JSON file paths it contains.
list_of_paths = (
    spark.sparkContext
    .parallelize(os.listdir(LANDED))
    .flatMap(search_json)
    .collect()
)
list_of_paths

['./landed/192.168.99.100/aluno-0.json.gz',
 './landed/192.168.99.100/aluno-1.json.gz',
 './landed/192.168.99.100/aluno-2.json.gz',
 './landed/192.168.99.100/curso-0.json.gz',
 './landed/192.168.99.100/logizz-0.json.gz',
 './landed/192.168.99.100/logizz-1.json.gz',
 './landed/192.168.99.100/logizz-10.json.gz',
 './landed/192.168.99.100/logizz-11.json.gz',
 './landed/192.168.99.100/logizz-12.json.gz',
 './landed/192.168.99.100/logizz-13.json.gz',
 './landed/192.168.99.100/logizz-14.json.gz',
 './landed/192.168.99.100/logizz-15.json.gz',
 './landed/192.168.99.100/logizz-16.json.gz',
 './landed/192.168.99.100/logizz-17.json.gz',
 './landed/192.168.99.100/logizz-18.json.gz',
 './landed/192.168.99.100/logizz-19.json.gz',
 './landed/192.168.99.100/logizz-2.json.gz',
 './landed/192.168.99.100/logizz-20.json.gz',
 './landed/192.168.99.100/logizz-21.json.gz',
 './landed/192.168.99.100/logizz-22.json.gz',
 './landed/192.168.99.100/logizz-23.json.gz',
 './landed/192.168.99.100/logizz-24.json.gz',

In [6]:
def analyse_json(paths):
    """Merge the landed JSON files in ``paths`` into one file under ``RAW``.

    Each non-empty file is loaded with ``pandas.read_json`` and tagged with a
    ``domain`` column holding the first path component after ``LANDED``. The
    frames are concatenated and written as a JSON array of records to
    ``RAW + model + '.json'``, where ``model`` is the file-name prefix before
    the first ``'-'`` of the last path processed.

    Parameters
    ----------
    paths : iterable of str
        Paths of the landed files to merge; they are expected to belong to
        the same model, since a single output file is written.

    Returns
    -------
    None
        The result is written to disk. Nothing is written when ``paths`` is
        empty, because there is no model name to derive the file name from.
    """
    frames = []
    model = None
    for path in paths:
        relative = re.sub(LANDED, '', path)
        domain = re.sub('/.*', '', relative)
        model = re.sub('-.*', '', re.sub('.*/', '', relative))
        # Zero-byte files cannot be parsed; skip them.
        if os.stat(path).st_size > 0:
            full_report = pd.read_json(path)
            full_report['domain'] = domain
            frames.append(full_report)

    if model is None:
        # Empty input: previously this fell through to an unbound `model`.
        return

    # Concatenate once instead of growing a DataFrame inside the loop.
    merged = pd.concat(frames, sort=False) if frames else pd.DataFrame([])
    with open(RAW + model + '.json', 'w') as f:
        f.write(merged.to_json(orient='records'))

In [7]:
def is_json(myjson):
    """Tell whether the file at path ``myjson`` holds a valid JSON document.

    Parameters
    ----------
    myjson : str
        Path of the file to check; it is opened in text mode.

    Returns
    -------
    bool
        ``True`` when the whole file parses as JSON, ``False`` when reading
        or parsing raises ``ValueError`` (which includes
        ``json.JSONDecodeError`` and ``UnicodeDecodeError``).

    Raises
    ------
    OSError
        If the file cannot be opened; this is not swallowed.
    """
    try:
        # The context manager closes the handle even when parsing fails.
        with open(myjson) as handle:
            json.load(handle)
    except ValueError:
        return False
    return True

In [8]:
def extract_models(path):
    """Return the model name encoded in a landed file path.

    The ``LANDED`` pattern is removed first, then everything up to the last
    ``'/'``, then everything from the first ``'-'`` onwards; for example
    ``'./landed/192.168.99.100/aluno-0.json.gz'`` gives ``'aluno'``.
    """
    without_root = re.sub(LANDED, '', path)
    file_name = re.sub('.*/', '', without_root)
    return re.sub('-.*', '', file_name)

In [9]:
# Distinct model names present among the landed files.
list_of_models = (
    spark.sparkContext
    .parallelize(list_of_paths)
    .map(extract_models)
    .distinct()
    .collect()
)

In [10]:
# Group the landed file paths by model name.
# Compare against extract_models(item), the same function that produced
# list_of_models. A substring test on the file name would also pick up files
# of any other model whose name merely contains this model's name.
dictionary = {
    model: [item for item in list_of_paths if extract_models(item) == model]
    for model in list_of_models
}

In [11]:
# Merge each model's files; parallelizing the dict distributes its keys.
# analyse_json returns None, so the collected result is a list of None.
(
    spark.sparkContext
    .parallelize(dictionary)
    .map(lambda model_name: analyse_json(dictionary[model_name]))
    .collect()
)

[None, None, None, None, None, None, None, None]