In [261]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext

import re

import warnings
# Silence FutureWarning messages.
warnings.simplefilter("ignore", category=FutureWarning)
# Get (or create) a Spark session whose master is "local[*]".
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext
sqlContext = SQLContext(sc)

In [262]:
def parse_to_int(some_value):
    """Convert ``some_value`` to ``int``, or return ``None`` when that is not possible.

    Args:
        some_value: Any value; ``None`` is passed through as ``None``.

    Returns:
        ``int(some_value)`` on success, otherwise ``None``.
    """
    if some_value is None:
        return None
    try:
        return int(some_value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or float NaN.
        # TypeError: a type int() does not accept (e.g. a list).
        # OverflowError: float +/- infinity.
        return None

# Consultas propias

### 1. Monto total recaudado por ventas de los 5 productos con más reseñas positivas.

#### Hipótesis

- Se considera que una reseña es positiva cuando el rating de la misma es mayor que 3.
- Si una reseña tiene un valor nulo en el *rating*, no se considera.
- Si un *order_item* tiene valor nulo en *line_total*, el valor puede ser inferido a través del precio unitario y la cantidad comprada.

#### Limpieza

In [263]:
def retain_reviews_columns(row: Row):
    """Project a reviews row onto the tuple ``(product_id, rating)``.

    ``product_id`` is passed through ``parse_to_int``; a ``None`` rating is
    replaced by 0.
    """
    product_id = parse_to_int(row.product_id)
    if row.rating is None:
        return (product_id, 0)
    return (product_id, row.rating)
    
# Field name -> position in the tuples returned by retain_reviews_columns.
reviewsIdx = {
    field: position
    for position, field in enumerate(("product_id", "rating"))
}

In [264]:
# Read the reviews CSV (header row present, column types inferred by Spark).
reviews = sqlContext.read.csv('data/reviews.csv', header=True, inferSchema=True)
# Keep only the projected columns and cache the resulting RDD.
reviewsRDD = (
    reviews.rdd
    .map(retain_reviews_columns)
    .cache()
)

In [290]:
def retain_products_columns(row: Row):
    """Project a products row onto ``(product_id, product_name, brand, category_id)``.

    Missing names and brands become ``"UNDEFINED"``; the brand is additionally
    stripped and upper-cased. Both ids are passed through ``parse_to_int``.
    """
    raw_brand = row.brand if row.brand is not None else "UNDEFINED"
    if row.product_name is None:
        name = "UNDEFINED"
    else:
        name = row.product_name
    return (
        parse_to_int(row.product_id),
        name,
        raw_brand.strip().upper(),
        parse_to_int(row.category_id),
    )
    
# Field name -> position in the tuples returned by retain_products_columns.
productsIdx = {
    field: position
    for position, field in enumerate(("id", "name", "brand", "category_id"))
}

In [291]:
# Read the products CSV (header row present, column types inferred by Spark).
products = sqlContext.read.csv('data/products.csv', header=True, inferSchema=True)
# Keep only the projected columns and cache the resulting RDD.
productsRDD = (
    products.rdd
    .map(retain_products_columns)
    .cache()
)

In [267]:
def retain_items_columns(row: Row):
    """Project an order-item row onto ``(product_id, quantity, line_total)``.

    An unparsable or missing quantity becomes 0. A ``None`` line total is
    replaced by the value computed by ``infer_line_total``.
    """
    parsed_qty = parse_to_int(row.quantity)
    if row.line_total is None:
        total = infer_line_total(row)
    else:
        total = row.line_total
    return (
        parse_to_int(row.product_id),
        parsed_qty if parsed_qty is not None else 0,
        total,
    )
    
def infer_line_total(row: Row):
    """Estimate a missing line total as ``unit_price * quantity``.

    The unit price is parsed as a float so fractional prices are not truncated
    to whole units. The quantity is parsed with ``parse_to_int``.

    Returns:
        The product as a float, or ``0.0`` when either value is missing or
        cannot be parsed.
    """
    qty = parse_to_int(row.quantity)
    try:
        u_price = None if row.unit_price is None else float(row.unit_price)
    except (TypeError, ValueError):
        # Unparsable price: treat it as missing.
        u_price = None
    if u_price is None or qty is None:
        return 0.0
    return u_price * qty
    
# Field name -> position in the tuples returned by retain_items_columns.
itemsIdx = {
    field: position
    for position, field in enumerate(("product_id", "quantity", "line_total"))
}

In [268]:
# Read the order items CSV (header row present, column types inferred by Spark).
items = sqlContext.read.csv('data/order_items.csv', header=True, inferSchema=True)
# Keep only the projected columns and cache the resulting RDD.
itemsRDD = (
    items.rdd
    .map(retain_items_columns)
    .cache()
)

#### Resolución

In [269]:
# Author's note: reviews.rating contains no missing values (not verified here).
# Count the reviews rated above 3 per product and keep the 5 largest counts.
top_5_products = (
    reviewsRDD
    .filter(lambda review: review[reviewsIdx["rating"]] > 3)
    .map(lambda review: (review[reviewsIdx["product_id"]], 1))
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(5, key=lambda pair: -pair[1])
)

25/10/05 00:55:06 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, customer_id, product_id, rating, title, comment, is_verified_purchase, helpful_votes, created_at
 Schema: _c0, review_id, customer_id, product_id, rating, title, comment, is_verified_purchase, helpful_votes, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/reviews.csv
                                                                                

In [270]:
# Ids of the selected products (first element of each (product_id, count) pair).
top_5_products_ids = [product_id for product_id, _ in top_5_products]
# Sum line_total per product, restricted to the selected ids.
top_5_products_sells = (
    itemsRDD
    .filter(lambda item: item[itemsIdx["product_id"]] in top_5_products_ids)
    .map(lambda item: (item[itemsIdx["product_id"]], item[itemsIdx["line_total"]]))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

25/10/05 00:55:08 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_item_id, order_id, product_id, quantity, unit_price, line_total, discount_amount
 Schema: _c0, order_item_id, order_id, product_id, quantity, unit_price, line_total, discount_amount
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/order_items.csv
                                                                                

In [271]:
# Build a {product_id: product_name} dict for the selected products.
top_5_products_names = (
    productsRDD
    .filter(lambda product: product[productsIdx["id"]] in top_5_products_ids)
    .map(lambda product: (product[productsIdx["id"]], product[productsIdx["name"]]))
    .collectAsMap()
)

25/10/05 00:55:09 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
 Schema: _c0, product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/products.csv
                                                                                

In [None]:
# Report each selected product's name next to its summed line totals.
print("Top 5 productos con más reseñas positivas y monto de sus ventas totales:")
for product_id, total in top_5_products_sells:
    print(f"Producto {top_5_products_names[product_id]}: ${total:.2f}")

Top 5 productos con más reseñas positivas y monto de sus ventas totales:
Producto Fully-configurable high-level circuit: $8320.50
Producto Persevering logistical help-desk: $12220.00
Producto Innovative solution-oriented installation: $291.20
Producto Seamless radical architecture: $13547.28
Producto Robust cohesive utilization: $271.51


### 2. Durante 2024 ¿Qué porcentaje de las órdenes `REFUNDED` fueron órdenes con descuento? ¿La mayoría eran de usuarios activos? ¿Qué segmento de usuario realizó la mayor cantidad de reembolsos? 

#### Hipótesis

- Si el valor del campo *discount_amount* en *orders* es nulo, se asume que la orden no tuvo descuento.
- Si el usuario de una orden no está en la tabla de *customers*, se asume que no es usuario activo.

#### Limpieza

In [274]:
import pandas as pd
def retain_orders_columns(row: Row):
    """Project an orders row onto ``(customer_id, discount_amount, status, year)``.

    - ``year`` is the year of the parsed order date, or ``None`` when the date
      is missing or could not be parsed.
    - ``status`` is stripped and upper-cased; a missing status becomes
      ``"UNDEFINED"``.
    - A missing ``discount_amount`` becomes ``0.0``.
    """
    order_dt = get_orders_datetime(row)
    # pd.isna is true for both None and NaT; NaT is what errors="coerce"
    # produces for unparsable dates, and NaT.year would be NaN rather than None.
    year = None if pd.isna(order_dt) else order_dt.year
    status = "UNDEFINED" if row.status is None else row.status.strip().upper()
    discount = 0.0 if row.discount_amount is None else row.discount_amount
    return (
        row.customer_id,
        discount,
        status,
        year,
    )
    
def get_orders_datetime(row: Row):
    """Parse ``row.order_date`` with pandas using the ``%Y-%m-%dT%H:%M:%S.%f`` format.

    ``errors="coerce"`` is passed, so parsing failures do not raise.
    """
    parsed = pd.to_datetime(
        row.order_date,
        format="%Y-%m-%dT%H:%M:%S.%f",
        errors="coerce",
    )
    return parsed
    
# Field name -> position in the tuples returned by retain_orders_columns.
ordersIdx = {
    field: position
    for position, field in enumerate(
        ("customer_id", "discount_amount", "status", "year")
    )
}

In [275]:
# Read the orders CSV (header row present, column types inferred by Spark).
orders = sqlContext.read.csv('data/orders.csv', header=True, inferSchema=True)
# Keep only the projected columns (this RDD is not cached).
ordersRDD = (
    orders.rdd
    .map(retain_orders_columns)
)

                                                                                

In [276]:
def retain_customers_columns(row: Row):
    """Project a customers row onto ``(customer_id, segment, is_active)``.

    A missing segment becomes ``"UNDEFINED"``, otherwise it is stripped and
    upper-cased. A missing ``is_active`` becomes ``False``.
    """
    raw_segment = row.customer_segment
    if raw_segment is None:
        segment = "UNDEFINED"
    else:
        segment = raw_segment.strip().upper()
    active = row.is_active if row.is_active is not None else False
    return (row.customer_id, segment, active)
    
# Field name -> position in the tuples returned by retain_customers_columns.
customersIdx = {
    field: position
    for position, field in enumerate(("id", "segment", "is_active"))
}

In [277]:
# Read the customers CSV (header row present, column types inferred by Spark).
customers = sqlContext.read.csv('data/customers.csv', header=True, inferSchema=True)
# Keep only the projected columns and cache the resulting RDD.
customersRDD = (
    customers.rdd
    .map(retain_customers_columns)
    .cache()
)

#### Resolución

In [278]:
# For every REFUNDED order of 2024, attach the customer's segment and active flag.
# Left outer join: an order whose customer has no match in customersRDD gets
# segment "UNDEFINED" and is_active False.
# Output tuple: (discount_amount, status, year, segment, is_active).
orders_user_active_and_segment = (
    ordersRDD
    .filter(lambda order: order[ordersIdx["status"]] == "REFUNDED" and order[ordersIdx["year"]] == 2024)
    .map(lambda order: (order[ordersIdx["customer_id"]], order))
    .leftOuterJoin(customersRDD.map(lambda customer: (customer[customersIdx["id"]], customer)))
    .map(lambda joined: (
        joined[1][0][ordersIdx["discount_amount"]],
        joined[1][0][ordersIdx["status"]],
        joined[1][0][ordersIdx["year"]],
        "UNDEFINED" if joined[1][1] is None else joined[1][1][customersIdx["segment"]],
        False if joined[1][1] is None else joined[1][1][customersIdx["is_active"]],
    ))
    .cache()
)


In [279]:
# Aggregate three counters over the selected orders:
#   [0] orders with a discount, [1] orders from active customers, [2] all orders.
# fold() with a (0, 0, 0) zero value is used instead of reduce() so an empty
# RDD yields (0, 0, 0) rather than raising.
discount_total_and_active_users = (
    orders_user_active_and_segment
    .map(
        lambda row: (
            1 if row[0] > 0 else 0,  # position 0 is discount_amount
            1 if row[4] else 0,  # position 4 is is_active
            1,  # total orders
        )
    )
    .fold((0, 0, 0), lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))
)

# Guard the division: with no matching orders both percentages are reported as 0.
if discount_total_and_active_users[2] > 0:
    discount_refunded_orders_percentaje = (discount_total_and_active_users[0] / discount_total_and_active_users[2]) * 100
    active_user_percentaje = (discount_total_and_active_users[1] / discount_total_and_active_users[2]) * 100
else:
    discount_refunded_orders_percentaje = 0.0
    active_user_percentaje = 0.0
    

25/10/05 00:55:19 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_id, customer_id, order_date, status, payment_method, shipping_address, billing_address, discount_amount, tax_amount, shipping_cost, total_amount, currency, created_at, updated_at, subtotal
 Schema: _c0, order_id, customer_id, order_date, status, payment_method, shipping_address, billing_address, discount_amount, tax_amount, shipping_cost, total_amount, currency, created_at, updated_at, subtotal
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/orders.csv
25/10/05 00:56:56 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , customer_id, email, first_name, last_name, phone, date_of_birth, gender, country, city, postal_code, address, registration_date, last_login, is_active, customer_segment, marketing_consent
 Schema: _c0, customer_id, email, first_name, last_name, phone, date_of_birth, gender, country, city, postal_code, address

In [280]:
# Count the selected orders per customer segment (position 3 of each tuple)
# and keep the (segment, count) pair with the largest count.
# fold() with a (None, 0) zero value is used instead of reduce() so an empty
# RDD yields (None, 0) rather than raising; real counts are always >= 1, so
# the zero value never wins against an actual segment.
most_refunded_segment = (
    orders_user_active_and_segment
    .map(lambda row: (row[3], 1))
    .reduceByKey(lambda a, b: a + b)
    .fold((None, 0), lambda a, b: a if a[1] > b[1] else b)
)

In [281]:
# Display the percentage of the selected (REFUNDED, 2024) orders placed by active customers.
active_user_percentaje

90.21286097052695

In [282]:
# Report the three answers for the REFUNDED orders of 2024.
print(f"El {discount_refunded_orders_percentaje:.2f}% de las órdenes REFUNDED durante 2024 fueron órdenes con descuento.")
# "Majority" means strictly more than 50% of the orders.
if active_user_percentaje > 50:
    print("La mayoría eran de usuarios activos.")
else:
    print("La mayoría no eran de usuarios activos.")
print(f"El segmento que más órdenes REFUNDED tuvo fue {most_refunded_segment[0]} con {most_refunded_segment[1]} órdenes.")

El 21.29% de las órdenes REFUNDED durante 2024 fueron órdenes con descuento.
La mayoría eran de usuarios activos.
El segmento que más órdenes REFUNDED tuvo fue REGULAR con 7284 órdenes.


### 3. ¿Cuáles son las 3 marcas que vendieron menos unidades de productos durante 2025? Mostrar los nombres de los productos que más ingresos generaron de esas marcas.

#### Hipótesis

#### Limpieza

#### Resolución

In [292]:
# Key items and products by product id and inner-join them; an inner join is
# used because only items with brand information are of interest.
# Result elements: (product_id, ((quantity, line_total), brand)).
# NOTE(review): no year filter is applied here although the section heading
# asks about 2025 - confirm whether a join with orders is still pending.
items_products_joined = (
    itemsRDD
    .map(lambda item: (
        item[itemsIdx["product_id"]],
        (item[itemsIdx["quantity"]], item[itemsIdx["line_total"]]),
    ))
    .join(
        productsRDD.map(
            lambda product: (product[productsIdx["id"]], product[productsIdx["brand"]])
        )
    )
    .cache()
)

In [293]:
# Preview the first 5 joined elements: (product_id, ((quantity, line_total), brand)).
items_products_joined.take(5)

25/10/05 01:00:24 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
 Schema: _c0, product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/products.csv
                                                                                

[(945804, ((10, 9550.9), 'IKEA')),
 (945804, ((8, 7640.72), 'IKEA')),
 (945804, ((8, 7640.72), 'IKEA')),
 (945804, ((2, 1910.18), 'IKEA')),
 (965772, ((7, 834.54), 'UNDER ARMOUR'))]

In [None]:
# Sum the sold quantity per brand and keep the 3 brands with the smallest totals.
less_sells_brands = (
    items_products_joined
    .map(lambda joined: (joined[1][1], joined[1][0][0]))  # (brand, quantity)
    .reduceByKey(lambda left, right: left + right)
    .takeOrdered(3, key=lambda pair: pair[1])
)
# Brand names only (first element of each (brand, total_quantity) pair).
less_sells_brands_names = [brand_name for brand_name, _ in less_sells_brands]

                                                                                

In [295]:
# Display the 3 (brand, total_quantity) pairs with the smallest totals.
less_sells_brands

[('APPLE', 4942), ('ASHLEY FURNITURE', 5132), ('CASTROL', 5135)]

In [None]:
# Revenue per (brand, product_id) for the selected brands.
# The already keyed and cached items_products_joined is reused: its elements
# are (product_id, ((quantity, line_total), brand)). Joining itemsRDD directly
# would treat its 3-tuples as (key, value) pairs and drop line_total.
(
    items_products_joined
    .filter(lambda row: row[1][1] in less_sells_brands_names)
    .map(lambda row: ((row[1][1], row[0]), row[1][0][1]))  # ((brand, product_id), line_total)
    .reduceByKey(lambda a, b: a + b)
    .collect()
)

### 4. Rating promedio de los productos pertenecientes a la categoría más vendida durante 2024.

#### Hipótesis

#### Limpieza

#### Resolución

### 5. Obtener los 3 productos de tecnología con más movimientos por daños, y el promedio de cambios en la cantidad para los movimientos dañados.

#### Hipótesis

#### Limpieza

#### Resolución