In [1]:
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext

import re

import warnings
# suprimir future warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
spark = SparkSession.builder\
    .master("local[*]") \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/10/06 12:54:47 WARN Utils: Your hostname, LENOVOID-ubuntu, resolves to a loopback address: 127.0.1.1; using 192.168.10.209 instead (on interface wlp2s0)
25/10/06 12:54:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/06 12:54:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
def parse_to_int(some_value):
    if some_value is None:
        return None
    try:
        return int(some_value)
    except ValueError:
        return None

# Consultas propias

### 1. Monto total recaudado por ventas de los 5 productos con más reseñas positivas.

#### Hipótesis

- Se considera que una reseña es positiva cuando el rating de la misma es mayor que 3.
- Si una reseña tiene un valor nulo en el *rating*, no se considera.
- Si un *order_item* tiene valor nulo en *line_total*, el valor puede ser inferido a través del precio unitario y la cantidad comprada.

#### Limpieza

In [3]:
def retain_reviews_columns(row: Row):
    rating = 0 if row.rating is None else row.rating
    return (
        parse_to_int(row.product_id),
        rating,
    )
    
reviewsIdx = {
    "product_id": 0,
    "rating": 1,
}

In [4]:
reviews = sqlContext.read.csv(
    'data/reviews.csv',
    header=True, inferSchema=True
)
reviewsRDD = reviews.rdd.map(retain_reviews_columns).cache()

                                                                                

In [5]:
def retain_products_columns(row: Row):
    product_name = "UNDEFINED" if row.product_name is None else row.product_name
    brand = "UNDEFINED" if row.brand is None else row.brand
    brand = brand.strip().upper()
    return (
        parse_to_int(row.product_id),
        product_name,
        brand,
        parse_to_int(row.category_id),
    )
    
productsIdx = {
    "id": 0,
    "name": 1,
    "brand": 2,
    "category_id": 3,
}

In [6]:
products = sqlContext.read.csv(
    'data/products.csv',
    header=True, inferSchema=True
)
productsRDD = products.rdd.map(retain_products_columns).cache()

                                                                                

In [7]:
def retain_items_columns(row: Row):
    qty = parse_to_int(row.quantity)
    quantity = 0 if qty is None else qty
    line_total = row.line_total if row.line_total is not None else infer_line_total(row)
    return (
        parse_to_int(row.product_id),
        quantity,
        line_total
    )
    
def infer_line_total(row: Row):
    u_price = parse_to_int(row.unit_price)
    qty = parse_to_int(row.quantity)
    if (
        u_price is not None
        and qty is not None
    ):
        return u_price * qty
    else:
        return 0.0
    
itemsIdx = {
    "product_id": 0,
    "quantity": 1,
    "line_total": 2,
}

In [8]:
items = sqlContext.read.csv(
    'data/order_items.csv',
    header=True, inferSchema=True
)
itemsRDD = items.rdd.map(retain_items_columns).cache()

#### Resolución

In [9]:
# no hay missing values en reviews.rating
top_5_products = reviewsRDD.filter(lambda row: row[reviewsIdx["rating"]] > 3) \
    .map(lambda row: (row[reviewsIdx["product_id"]], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .takeOrdered(5, key=lambda x: -x[1])

25/10/06 12:54:57 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , review_id, customer_id, product_id, rating, title, comment, is_verified_purchase, helpful_votes, created_at
 Schema: _c0, review_id, customer_id, product_id, rating, title, comment, is_verified_purchase, helpful_votes, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/reviews.csv
                                                                                

In [10]:
top_5_products_ids = [prod[0] for prod in top_5_products]
top_5_products_sells = itemsRDD.filter(lambda row: row[itemsIdx["product_id"]] in top_5_products_ids) \
    .map(lambda row: (row[itemsIdx["product_id"]], row[itemsIdx["line_total"]])) \
    .reduceByKey(lambda a, b: a + b) \
    .collect()

25/10/06 12:54:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_item_id, order_id, product_id, quantity, unit_price, line_total, discount_amount
 Schema: _c0, order_item_id, order_id, product_id, quantity, unit_price, line_total, discount_amount
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/order_items.csv
                                                                                

In [11]:
top_5_products_names = productsRDD.filter(lambda row: row[productsIdx["id"]] in top_5_products_ids) \
    .map(lambda row: (row[productsIdx["id"]], row[productsIdx["name"]])) \
    .collectAsMap()

25/10/06 12:55:00 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
 Schema: _c0, product_id, product_name, category_id, brand, price, cost, stock_quantity, weight_kg, dimensions, description, is_active, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/products.csv
                                                                                

In [12]:
print("Top 5 productos con más reseñas positivas y monto de sus ventas totales:")
for prod in top_5_products_sells:
    print(f"Producto {top_5_products_names[prod[0]]}: ${prod[1]:.2f}")

Top 5 productos con más reseñas positivas y monto de sus ventas totales:
Producto Fully-configurable high-level circuit: $8320.50
Producto Persevering logistical help-desk: $12220.00
Producto Innovative solution-oriented installation: $291.20
Producto Seamless radical architecture: $13547.28
Producto Robust cohesive utilization: $271.51


### 2. Durante 2024 ¿Qué porcentaje de las órdenes `REFUNDED` fueron órdenes con descuento? ¿La mayoría eran de usuarios activos? ¿Qué segmento de usuario realizó la mayor cantidad de reembolsos? 

#### Hipótesis

- Si el valor del campo *discount_amount* en *orders* es nulo, se asume que la órden no tuvo descuento.
- Si el usuario de una órden no está en la tabla de *customers*, se asume que no es usuario activo.

#### Limpieza

In [93]:
status_dict = {}
with open("status.txt", "r") as f:
    valid_statuses = [line.strip() for line in f.readlines()]
    id = 0
    for status in valid_statuses:
        status_dict[status] = id
        id += 1

statuses = sc.broadcast(status_dict)

In [94]:
import pandas as pd
def retain_orders_columns(row: Row):
    id = parse_to_int(row.customer_id)
    datetime = get_orders_datetime(row)
    year = datetime.year if datetime is not None else None
    status_str = "UNDEFINED" if row.status is None else row.status.strip().upper()
    status = statuses.value[status_str]
    discount = 0.0 if row.discount_amount is None else row.discount_amount
    return (
        id,
        discount,
        status,
        year,
    )
    
def get_orders_datetime(row: Row):
    return pd.to_datetime(row.order_date, format="%Y-%m-%dT%H:%M:%S.%f", errors="coerce")
    
ordersIdx = {
    "customer_id": 0,
    "discount_amount": 1,
    "status": 2,
    "year": 3,
}

In [95]:
orders = sqlContext.read.csv(
    'data/orders.csv',
    header=True, inferSchema=True
)
ordersRDD = orders.rdd.map(retain_orders_columns)

                                                                                

In [104]:
segments_dict = {}
with open("segments.txt", "r") as f:
    valid_segments = [line.strip() for line in f.readlines()]
    id = 0
    for segments in valid_segments:
        segments_dict[segments] = id
        id += 1

segments = sc.broadcast(segments_dict)

segment_id_to_name = {v: k for k, v in segments_dict.items()}

In [97]:
def retain_customers_columns(row: Row):
    id = parse_to_int(row.customer_id)
    segment_str = "UNDEFINED" if row.customer_segment is None else row.customer_segment.strip().upper()
    segment = segments.value[segment_str]
    is_active = False if row.is_active is None else row.is_active
    return (
        id,
        segment,
        is_active,
    )
    
customersIdx = {
    "id": 0,
    "segment": 1,
    "is_active": 2,
}

In [98]:
customers = sqlContext.read.csv(
    'data/customers.csv',
    header=True, inferSchema=True
)
customersRDD = customers.rdd.map(retain_customers_columns)

#### Resolución

In [99]:
orders_user_active_and_segment = ordersRDD \
    .filter(lambda row: row[ordersIdx["status"]] == statuses.value["REFUNDED"] and row[ordersIdx["year"]] == 2024) \
    .map(lambda row: (row[ordersIdx["customer_id"]], row)) \
    .leftOuterJoin(customersRDD.map(lambda row: (row[customersIdx["id"]], row))) \
    .map(lambda row: (
        row[1][0][ordersIdx["discount_amount"]],
        row[1][1][customersIdx["segment"]] if row[1][1] is not None else "UNDEFINED",
        row[1][1][customersIdx["is_active"]] if row[1][1] is not None else False,
    )).cache()


In [100]:
discount_total_and_active_users = orders_user_active_and_segment \
    .map(
        lambda row: (
            1 if row[0] > 0 else 0, # tiene descuento
            1 if row[2] else 0, # es de usuario activo
            1,  # ordenes totales
        )
    ).reduce(lambda a, b: (a[0] + b[0], a[1] + b[1], a[2] + b[2]))

discount_refunded_orders_percentaje = (discount_total_and_active_users[0] / discount_total_and_active_users[2]) * 100
active_user_percentaje = (discount_total_and_active_users[1] / discount_total_and_active_users[2]) * 100
    

25/10/06 13:14:23 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , order_id, customer_id, order_date, status, payment_method, shipping_address, billing_address, discount_amount, tax_amount, shipping_cost, total_amount, currency, created_at, updated_at, subtotal
 Schema: _c0, order_id, customer_id, order_date, status, payment_method, shipping_address, billing_address, discount_amount, tax_amount, shipping_cost, total_amount, currency, created_at, updated_at, subtotal
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/orders.csv
25/10/06 13:15:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , customer_id, email, first_name, last_name, phone, date_of_birth, gender, country, city, postal_code, address, registration_date, last_login, is_active, customer_segment, marketing_consent
 Schema: _c0, customer_id, email, first_name, last_name, phone, date_of_birth, gender, country, city, postal_code, address

In [101]:
most_refunded_segment = orders_user_active_and_segment \
    .map(lambda row: (row[1], 1)) \
    .reduceByKey(lambda a, b: a + b) \
    .reduce(lambda a, b: a if a[1] > b[1] else b)

In [102]:
active_user_percentaje

90.21286097052695

In [105]:
refunded_segment = segment_id_to_name[most_refunded_segment[0]]
print(f"El {discount_refunded_orders_percentaje:.2f}% de las órdenes REFUNDED durante 2024 fueron órdenes con descuento.")
print("La mayoría eran de usuarios activos." if active_user_percentaje > 50 else "La mayoría no eran de usuarios activos.")
print(f"El segmento que más órdenes REFUNDED tuvo fue {refunded_segment} con {most_refunded_segment[1]} órdenes.")

El 21.29% de las órdenes REFUNDED durante 2024 fueron órdenes con descuento.
La mayoría eran de usuarios activos.
El segmento que más órdenes REFUNDED tuvo fue REGULAR con 7284 órdenes.


### 3. ¿Cuáles son las 3 marcas que vendieron menos unidades de productos durante 2025? Mostrar los nombres de los productos que más ingresos generaron de esas marcas.

#### Hipótesis

- No se tienen en cuenta para este análisis las ventas cuyo producto no está registrado en la tabla de *products*

#### Resolución

In [None]:
items_products_joined = itemsRDD \
    .map(
        lambda row: (
            row[itemsIdx["product_id"]], 
            (row[itemsIdx["quantity"]], row[itemsIdx["line_total"]])
        )
    ).join(productsRDD.map( # como me interesa la información de marca, hago inner join
        lambda row: (row[productsIdx["id"]], (row[productsIdx["brand"]]))
    ))

In [None]:
less_sells_brands = items_products_joined.map(
        lambda row: (
            row[1][1], # brand
            row[1][0][0], # quantity
        )
    ).reduceByKey(lambda a, b: a + b) \
    .takeOrdered(3, key=lambda x: x[1])
less_sells_brands_names = [brand[0] for brand in less_sells_brands]

                                                                                

In [None]:
brand_sells = itemsRDD.map(
    lambda row: (row[itemsIdx["product_id"]], row[itemsIdx["line_total"]])
).join(productsRDD.map(
    lambda row: (row[productsIdx["id"]], row[productsIdx["brand"]]))
)

In [None]:
top_products_per_brand = brand_sells.filter(
    lambda row: row[1][1] in less_sells_brands_names
).map(
    lambda row: ((row[1][1], row[0]), row[1][0])  # ((brand, product_id), line_total)
).reduceByKey(lambda a, b: a + b) \
.map(
    lambda row: (row[0][0], (row[0][1], row[1]))  # (brand, (product_id, total_line))
).reduceByKey(lambda a, b: a if a[1] > b[1] else b) \
.collect()

                                                                                

In [None]:
top_products_per_brand_ids = [prod[1][0] for prod in top_products_per_brand]
top_products_per_brand_names = productsRDD.filter(
    lambda row: row[productsIdx["id"]] in top_products_per_brand_ids
).map(
    lambda row: (row[productsIdx["id"]], row[productsIdx["name"]])
).collectAsMap()

In [None]:
print("Las marcas con menos unidades vendidas en 2025 son:")
for brand, amount in less_sells_brands:
    print(f" - {brand}: {amount} unidades vendidas")
    
print("\nEl producto más vendido de cada una de esas marcas es:")
for row in top_products_per_brand:
    brand = row[0]
    product_id = row[1][0]
    total_line = row[1][1]
    product_name = top_products_per_brand_names[product_id]
    print(f" - {brand}: {product_name} con ${total_line:.2f} en ventas")

Las marcas con menos unidades vendidas en 2025 son:
 - APPLE: 4942 unidades vendidas
 - ASHLEY FURNITURE: 5132 unidades vendidas
 - CASTROL: 5135 unidades vendidas

El producto más vendido de cada una de esas marcas es:
 - CASTROL: Grass-roots directional success  con $83496.97 en ventas
 - ASHLEY FURNITURE: Distributed interactive neural-net con $30814.72 en ventas
 - APPLE: FACE-TO-FACE TANGIBLE STRATEGY con $133905.24 en ventas


### 4. Rating promedio de los productos pertenecientes a la categoría más vendida durante 2024.

#### Hipótesis

#### Limpieza

In [None]:
def retain_categories_columns(row: Row):
    id = parse_to_int(row.category_id)
    name = "UNDEFINED" if row.category_name is None else row.category_name.strip().upper()
    parent = "UNDEFINED" if row.parent_category is None else row.parent_category.strip().upper()
    return (
        id,
        name,
        parent
    )
    
categoriesIdx = {
    "id": 0,
    "name": 1,
    "parent": 2,
}

In [None]:
categories = sqlContext.read.csv(
    'data/categories.csv',
    header=True, inferSchema=True
)
categoriesRDD = categories.rdd.map(retain_categories_columns).cache()

#### Resolución

In [None]:
category_sells = itemsRDD.map(
    lambda row: (row[itemsIdx["product_id"]], row[itemsIdx["line_total"]])
).join(
    productsRDD.filter(
        lambda row: row[productsIdx["category_id"]] is not None
    ).map(
        lambda row: (row[productsIdx["id"]], row[productsIdx["category_id"]])
    )
)

In [None]:
most_selled_category = category_sells.map(
    lambda row: (row[1][1], row[1][0])  # (category_id, line_total)
).reduceByKey(lambda a, b: a + b) \
.reduce(lambda a, b: a if a[1] > b[1] else b)

                                                                                

In [None]:
most_selled_category_name = categoriesRDD.filter(
    lambda row: row[categoriesIdx["id"]] == most_selled_category[0]
).first()[1]

25/10/06 12:56:44 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , category_id, category_name, parent_category, created_at
 Schema: _c0, category_id, category_name, parent_category, created_at
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/categories.csv


In [None]:
category_products = productsRDD.filter(
        lambda row: (
            row[productsIdx["category_id"]] is not None
            and row[productsIdx["category_id"]] == most_selled_category[0]
        )
    ).map(
        lambda row: row[productsIdx["id"]]
    ).collect()

In [None]:
category_mean_rating = reviewsRDD.filter(
    lambda row: row[reviewsIdx["product_id"]] in category_products
).map(
    lambda row: row[reviewsIdx["rating"]]
).mean()

                                                                                

### 5. Obtener los 3 productos `ELECTRONICS` con más movimientos por daños, y el promedio de cambios en la cantidad para los movimientos dañados.

#### Hipótesis

#### Limpieza

In [None]:
reasons_dict = {}
with open("reasons.txt", "r") as f:
    reasons = [line.strip() for line in f.readlines()]
    id = 0
    for reason in reasons:
        reasons_dict[reason] = id
        id += 1

reasons = sc.broadcast(reasons_dict)

In [None]:
def retain_inventory_columns(row: Row):
    product_id = parse_to_int(row.product_id)
    reason_str = "UNDEFINED" if row.reason is None else row.reason.strip().upper()
    reason = reasons.value[reason_str]
    quantity = parse_to_int(row.quantity_change)
    return (
        product_id,
        reason,
        0 if quantity is None else quantity,
    )
    
inventoryIdx = {
    "product_id": 0,
    "reason": 1,
    "quantity": 2,
}

In [None]:
inventory = sqlContext.read.csv(
    'data/inventory_logs.csv',
    header=True, inferSchema=True
)
inventoryRDD = inventory.rdd.map(retain_inventory_columns)

#### Resolución

In [None]:
electronics_categories_id = categoriesRDD.filter(
    lambda row: row[categoriesIdx["parent"]] == "ELECTRONICS"
).map(
    lambda row: row[categoriesIdx["id"]]
).collect()

electronics_products_ids = productsRDD.filter(
    lambda row: (
        row[productsIdx["category_id"]] is not None
        and row[productsIdx["category_id"]] in electronics_categories_id
    )
).map(
    lambda row: row[productsIdx["id"]]
).collect()

In [None]:
damaged_movements = inventoryRDD.filter(
    lambda row: row[inventoryIdx["reason"]] == reasons.value["DAMAGE"]
).cache()

In [None]:
damaged_electronics_movements_by_product = damaged_movements \
.filter(lambda row: row[inventoryIdx["product_id"]] in electronics_products_ids) \
.map(
    lambda row:
        (row[inventoryIdx["product_id"]], (1, row[inventoryIdx["quantity"]]))  # (product_id, (total, quantity_change))
).reduceByKey(lambda a, b: (a[0] + b[0], a[1] + b[1]))

In [None]:
most_damaged_products = damaged_electronics_movements_by_product \
    .takeOrdered(3, key=lambda x: -x[1][0])

most_damaged_products_ids = [prod[0] for prod in most_damaged_products]

most_damaged_products_names = productsRDD.filter(
    lambda row: row[productsIdx["id"]] in most_damaged_products_ids
).map(
    lambda row: (row[productsIdx["id"]], row[productsIdx["name"]])
).collectAsMap()

25/10/06 13:02:38 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , log_id, product_id, movement_type, quantity_change, reason, timestamp, reference_id, notes
 Schema: _c0, log_id, product_id, movement_type, quantity_change, reason, timestamp, reference_id, notes
Expected: _c0 but found: 
CSV file: file:///home/pat/Documents/GitHub/datos-tp2/data/inventory_logs.csv
                                                                                

In [None]:
damaged_movements_quantity_mean = damaged_movements.map(
    lambda row: row[inventoryIdx["quantity"]]
).mean()

In [None]:
print("Los 3 productos ELECTRONICS con más movimientos por daños son:")
print("ID\tTotal Movs.\tCambio tot. en Cant.\tNombre")
for prod in most_damaged_products:
    product_id = prod[0]
    total_damages = prod[1][0]
    quantity_change = prod[1][1]
    product_name = most_damaged_products_names[product_id].strip()
    print(f"{product_id}\t{total_damages}\t\t{quantity_change}\t\t\t{product_name}")
    
print(f"\nEl promedio de cambios en la cantidad para los movimientos dañados es de {damaged_movements_quantity_mean:.2f} unidades.")

Los 3 productos ELECTRONICS con más movimientos por daños son:
ID	Total Movs.	Cambio tot. en Cant.	Nombre
909144	4		349			Self-enabling discrete open system
986689	4		327			UNDEFINED
969306	4		-647			Pre-emptive zero tolerance encryption

El promedio de cambios en la cantidad para los movimientos dañados es de -0.00 unidades.
