In [19]:
import os
import requests
import json
import re
import glob
import numpy as np
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *
import pandas as pd

In [2]:
# Directory constants for this notebook, relative to the working directory.
# MODELED is not referenced in the cells shown here — presumably the output
# location for processed data; confirm against the later cells.
MODELED = './modeled/'
# RAW is the folder that is globbed for the input *.json extracts.
RAW = './raw/'

In [3]:
# Build the local Spark session used by the rest of the notebook.
#
# NOTE: getOrCreate() returns an already-running session if one exists in this
# kernel, in which case JVM-level settings such as the memory sizes below may
# not be applied. Restart the kernel after changing them.
builder = (
    SparkSession.builder
    .appName("Data scrub")
    # Run everything in-process, using all available local cores.
    .master("local[*]")
    .config("spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version", "2")
    # Do not launch speculative duplicate task attempts.
    .config("spark.speculation", "false")
    .config("spark.sql.parquet.compression.codec", "gzip")
    # Allow wider schemas to be printed in full in plans and debug strings.
    .config("spark.debug.maxToStringFields", "100")
    .config("spark.driver.memory", "9g")
    .config("spark.driver.cores", "4")
    # Fixed: the property name is "spark.executor.memory". The previous key,
    # "spark.executor-memory", mirrors the spark-submit flag --executor-memory
    # and is not a property Spark reads, so the 20g setting was silently ignored.
    .config("spark.executor.memory", "20g")
    .config("spark.executor.cores", "4")
)

spark = builder.getOrCreate()

In [4]:
# Echo the session object as the cell output to confirm it was created.
spark

In [7]:
# Collect every raw JSON extract under RAW.
# glob.glob() returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make positional access such as files[0] reproducible.
# os.path.join avoids the doubled separator that RAW + '/*.json' produced
# (RAW already ends with '/').
files = sorted(glob.glob(os.path.join(RAW, '*.json')))

In [9]:
# Show the list of raw JSON paths that were matched.
files

['./raw/aluno.json',
 './raw/curso.json',
 './raw/logIzz.json',
 './raw/log_aula.json',
 './raw/matricula.json',
 './raw/pedido.json',
 './raw/transacao_pagseguro.json',
 './raw/user.json']

In [53]:
# Explicit read schemas, keyed by table name. Only 'aluno' is defined so far.
# Every column is nullable. The date/time columns (data_nascimento,
# data_registro) are read as plain strings here, and cep is kept as a string
# as well (presumably to preserve its original formatting — confirm).
schemas = {
    'aluno': StructType([
        StructField(column, dtype(), True)
        for column, dtype in (
            ("cep", StringType),
            ("cidade", StringType),
            ("data_nascimento", StringType),
            ("data_registro", StringType),
            ("domain", StringType),
            ("estado", StringType),
            ("id", IntegerType),
            ("logradouro", StringType),
            ("user_id", IntegerType),
        )
    ]),
}

In [30]:
# Echo the schema registry.
# NOTE(review): the saved output for this cell lists cep as IntegerType,
# data_nascimento as DateType and logradouro as TimestampType, which does not
# match the schema definition cell (all three are StringType there). The output
# looks stale from an earlier revision — re-run after a kernel restart.
schemas

{'aluno': StructType(List(StructField(cep,IntegerType,true),StructField(cidade,StringType,true),StructField(data_nascimento,DateType,true),StructField(data_registro,TimestampType,true),StructField(domain,StringType,true),StructField(estado,StringType,true),StructField(id,IntegerType,true),StructField(logradouro,TimestampType,true),StructField(user_id,IntegerType,true)))}

In [54]:
# Load the 'aluno' extract with its explicit schema instead of schema inference.
# The path is built by name from RAW rather than taken as files[0]:
# glob.glob() gives no ordering guarantee, so the first match is not
# necessarily aluno.json.
aluno_path = os.path.join(RAW, 'aluno.json')

# NOTE(review): the saved df.show() output further down is a single row with
# every column null. In the reader's default PERMISSIVE mode, records that
# cannot be parsed against the schema, and fields that are absent from the
# record, come back as nulls instead of raising. Inspect the file layout
# (JSON Lines vs. a wrapped or multi-line document) and the field types —
# TODO confirm the root cause.
df = spark.read.schema(schemas['aluno']).json(aluno_path)
df

DataFrame[cep: int, cidade: string, data_nascimento: string, data_registro: string, domain: string, estado: string, id: int, logradouro: string, user_id: int]

In [42]:
# Print the DataFrame's column names, types and nullability as a tree.
# NOTE(review): the saved output shows cep as integer, data_nascimento as date,
# and data_registro / logradouro as timestamp, while the schema definition cell
# declares all of them as strings. The output appears to come from an earlier
# run — re-execute to refresh it.
df.printSchema()

root
 |-- cep: integer (nullable = true)
 |-- cidade: string (nullable = true)
 |-- data_nascimento: date (nullable = true)
 |-- data_registro: timestamp (nullable = true)
 |-- domain: string (nullable = true)
 |-- estado: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- logradouro: timestamp (nullable = true)
 |-- user_id: integer (nullable = true)



In [55]:
# Preview up to 10 rows of the loaded data.
# NOTE(review): the saved output is one row in which every column is null, so
# the read is probably not matching the file's actual structure — TODO
# investigate before building on this DataFrame.
df.show(n=10)

+----+------+---------------+-------------+------+------+----+----------+-------+
| cep|cidade|data_nascimento|data_registro|domain|estado|  id|logradouro|user_id|
+----+------+---------------+-------------+------+------+----+----------+-------+
|null|  null|           null|         null|  null|  null|null|      null|   null|
+----+------+---------------+-------------+------+------+----+----------+-------+

