In [1]:
from pyspark.sql import SparkSession
# import pandas as pd # Remove a importação de pandas se não for mais necessário

# Create (or reuse) a Spark session. appName is the name of this Spark application.
spark = (
    SparkSession.builder
    .appName("ReadParquet")
    .getOrCreate()
)

In [2]:
# Relative path to the consolidated Parquet dataset.
# Forward slashes are used on purpose: they are accepted on Windows as well as
# on Linux/macOS, whereas backslashes are only path separators on Windows.
file_path = '../data/processed/base_consolidada.parquet'

# Load the Parquet dataset into a Spark DataFrame.
spark_df = spark.read.parquet(file_path)

In [3]:
# Preview the first 5 rows of the loaded DataFrame.
spark_df.show(n=5)

+-------------------+-----------+--------------+-------+-------------------+-------------------+--------------+----------------+--------------+--------+------------------+------------------+------------------+------------------+------------------+-------------------+-----------------+--------------------+-----------------+-----+-------------------+--------------------+--------------------+
|                pdv|    premise| categoria_pdv|zipcode|  internal_store_id|internal_product_id|distributor_id|transaction_date|reference_date|quantity|       gross_value|         net_value|      gross_profit|          discount|             taxes|            produto|        categoria|           descricao|            tipos|label|       subcategoria|               marca|          fabricante|
+-------------------+-----------+--------------+-------+-------------------+-------------------+--------------+----------------+--------------+--------+------------------+------------------+------------------+-----

In [4]:
from datetime import datetime, timedelta

def calculate_week_of_month(transaction_date, reference_date):
    """
    Return the 1-based, 7-day "week" bucket of a transaction within a month.

    The month is taken from ``reference_date``. Weeks are fixed 7-day blocks
    counted from the first day of that month (days 1-7 -> 1, days 8-14 -> 2,
    ...); they are not aligned to calendar weekdays. If ``transaction_date``
    falls before the first day of the reference month the result is <= 0.

    Args:
        transaction_date (date | datetime | str | None): Date of the
            transaction. Strings must be formatted as YYYY-MM-DD.
        reference_date (date | datetime | str | None): Any date inside the
            month to count from. Strings must be formatted as YYYY-MM-DD.

    Returns:
        int | None: The week number within the reference month, or ``None``
        when either argument is ``None``.

    Raises:
        ValueError: If a string argument is not in YYYY-MM-DD format.
    """
    # Missing values propagate as a missing result instead of raising a
    # TypeError on the subtraction below.
    if transaction_date is None or reference_date is None:
        return None

    # Normalise both inputs to plain ``date`` objects. ``datetime`` values are
    # reduced to their date part because ``datetime - date`` is not supported.
    if isinstance(transaction_date, str):
        transaction_date = datetime.strptime(transaction_date, '%Y-%m-%d').date()
    elif isinstance(transaction_date, datetime):
        transaction_date = transaction_date.date()

    if isinstance(reference_date, str):
        reference_date = datetime.strptime(reference_date, '%Y-%m-%d').date()
    elif isinstance(reference_date, datetime):
        reference_date = reference_date.date()

    # First day of the month that contains reference_date
    first_day_of_month = reference_date.replace(day=1)

    # Whole days elapsed since the first day of that month
    day_difference = (transaction_date - first_day_of_month).days

    # Floor-divide into 7-day blocks and shift so the first block is week 1
    return (day_difference // 7) + 1

In [5]:
from pyspark.sql.functions import col, datediff, floor, trunc, udf
from pyspark.sql.types import IntegerType

# Python UDF wrapper around calculate_week_of_month. It is no longer used to
# build `week_of_month` below, but the name stays defined in case other cells
# still reference it.
calculate_week_of_month_udf = udf(calculate_week_of_month, IntegerType(), useArrow=True)

# Add `week_of_month` using built-in Spark column functions instead of the
# Python UDF: the expression is evaluated natively (no per-row Python call)
# and yields null when either date is null.
#   trunc(reference_date, 'month') -> first day of the reference month
#   datediff(transaction_date, ..) -> whole days since that first day
#   floor(days / 7) + 1            -> 1-based 7-day bucket (same as `days // 7 + 1`)
joined_df = spark_df.withColumn(
    'week_of_month',
    (
        floor(
            datediff(col('transaction_date'), trunc(col('reference_date'), 'month')) / 7
        )
        + 1
    ).cast(IntegerType()),
)

In [8]:
# Preview the first 5 rows, including the new column, without truncating cell values.
joined_df.show(n=5, truncate=False)

+-------------------+-----------+--------------+-------+-------------------+-------------------+--------------+----------------+--------------+--------+------------------+------------------+------------------+------------------+------------------+-------------------+-----------------+----------------------------------------+-----------------+-----+-------------------+--------------------------------------+---------------------+-------------+
|pdv                |premise    |categoria_pdv |zipcode|internal_store_id  |internal_product_id|distributor_id|transaction_date|reference_date|quantity|gross_value       |net_value         |gross_profit      |discount          |taxes             |produto            |categoria        |descricao                               |tipos            |label|subcategoria       |marca                                 |fabricante           |week_of_month|
+-------------------+-----------+--------------+-------+-------------------+-------------------+------------

pyspark.sql.dataframe.DataFrame