# Generate Raw Data

## Import Libraries

In [2]:
import os
import sys
import json
import random
from datetime import datetime, timedelta
from pyspark.sql import SparkSession, DataFrame, functions as F

## Create a Spark Session

In [6]:
# Get the Spark session named "Delta-App", creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("Delta-App")
    .getOrCreate()
)
# Working directory under the current user's home directory.
# expanduser("~") is used instead of reading HOME directly so that an unset
# HOME cannot silently produce the relative path "None/work".
WORK_DIR = os.path.join(os.path.expanduser("~"), "work")

## Create the Functions

In [8]:
def row_data(
    delta_create: int, delta_update: int, qtt: int, list_options: list
) -> dict:
    """Build one fake record with two dates, a random id and a random value.

    Args:
        delta_create: Days added to the current date for the "created" field.
        delta_update: Days added to the current date for the "updated" field.
        qtt: Exclusive upper bound of the random "id" (drawn from range(qtt)).
        list_options: Candidates from which "value" is picked at random.

    Returns:
        A dict with the keys "created", "updated", "id" and "value", where
        both dates are formatted as "%Y/%m/%d" strings.
    """
    reference = datetime.now()

    def shifted(days: int) -> str:
        # Both dates are offsets from the same reference instant.
        return (reference + timedelta(days=days)).strftime("%Y/%m/%d")

    record = {
        "created": shifted(delta_create),
        "updated": shifted(delta_update),
    }
    # Draw the id before the value so random numbers are consumed in a fixed order.
    record["id"] = random.randrange(qtt)
    record["value"] = random.choice(list_options)
    return record


def generate_data(
    delta_create: int, delta_update: int, qtt: int, list_options: list
) -> DataFrame:
    """Create a Spark DataFrame of fake rows with at most one row per id.

    ``qtt`` rows are produced with ``row_data``. Ids are drawn at random, so
    repeated ids are dropped afterwards and the result may hold fewer than
    ``qtt`` rows.

    Args:
        delta_create: Day offset forwarded to ``row_data`` for "created".
        delta_update: Day offset forwarded to ``row_data`` for "updated".
        qtt: Number of rows to generate; also forwarded to ``row_data`` as
            the exclusive upper bound of the random id.
        list_options: Candidate values forwarded to ``row_data``.

    Returns:
        A DataFrame read from the generated rows, de-duplicated on "id".
    """
    # Serialize every row explicitly: the JSON reader works on JSON text, and
    # handing it raw dicts would make parsing depend on how Python happens to
    # render a dict as a string (single quotes, None, True/False).
    json_rows = [
        json.dumps(
            row_data(delta_create, delta_update, qtt, list_options),
            ensure_ascii=False,
        )
        for _ in range(qtt)
    ]
    # Distribute the rows over 10 slices before reading them back as JSON.
    rdd = spark.sparkContext.parallelize(json_rows, 10)
    return spark.read.json(rdd).drop_duplicates(["id"])


def create_cols_partition_YMD(df: DataFrame, col_name: str) -> DataFrame:
    """Add "year", "month" and "day" columns derived from a "/"-separated column.

    The column ``col_name`` is split on "/" and the pieces at positions 0, 1
    and 2 become the "year", "month" and "day" columns respectively.

    Args:
        df: Input DataFrame containing ``col_name``.
        col_name: Name of the column holding the "/"-separated value.

    Returns:
        The DataFrame with the three extra columns.
    """
    result = df
    for position, part_name in enumerate(("year", "month", "day")):
        # Each column is added on top of the frame produced by the previous step.
        result = result.withColumn(
            part_name, F.split(F.col(col_name), "/").getItem(position)
        )
    return result

def gen_raw_zone(
    delta_create: int,
    delta_update: int,
    qtt: int,
    range_generate: int,
    list_options: list,
    local_output: str,
    col_partition: str = "updated",
) -> None:
    """Generate several batches of fake data and append them as partitioned parquet.

    One batch is produced per value in ``range(range_generate)``; the update
    offset grows by one day with every batch while the creation offset stays
    fixed.

    Args:
        delta_create: Day offset for the "created" date of every batch.
        delta_update: Day offset for the "updated" date of the first batch.
        qtt: Number of rows requested per batch.
        range_generate: Number of batches to generate.
        list_options: Candidate values for the "value" field.
        local_output: Base path; data is written under "<local_output>/data".
        col_partition: Column from which year/month/day partitions are derived.
    """
    target_path = f"{local_output}/data"
    for day_offset in range(range_generate):
        batch = generate_data(
            delta_create, delta_update + day_offset, qtt, list_options
        )
        partitioned = create_cols_partition_YMD(batch, col_partition)
        # Append so that every batch adds to the same dataset.
        writer = partitioned.write.partitionBy("year", "month", "day").mode("append")
        writer.parquet(target_path)

## Create Fake Raw Data

In [9]:
# Values each fake row can carry (fruit names in Portuguese).
list_fruits = [
    "banana",
    "maça",
    "morango",
    "abacaxi",
    "limão",
]
# Root folder of the raw zone inside the working directory.
output = f"{WORK_DIR}/datalake/raw"
# Number of rows requested per generated batch.
size = 5000

In [10]:
# Write 10 batches of `size` rows: "created" stays at today, while "updated"
# starts one day ahead and moves one more day forward with every batch.
gen_raw_zone(
    delta_create=0,
    delta_update=1,
    qtt=size,
    range_generate=10,
    list_options=list_fruits,
    local_output=output,
)