In [1]:
import logging
# Raise the root logger's threshold so only CRITICAL records are emitted.
logger = logging.getLogger()
logger.setLevel(logging.CRITICAL)

import findspark
# Called before the pyspark imports below; presumably this makes the local
# Spark installation importable -- confirm against the findspark docs.
findspark.init()
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql import SparkSession
# NOTE: `sum`, `max` and `min` imported here shadow the Python built-ins of the
# same name for the rest of the notebook.
from pyspark.sql.functions import count, sum, when, split, col, lit, max, min, expr
from pyspark.sql.functions import to_date, var_samp, variance, var_pop, month, to_timestamp, dayofweek
from pyspark.sql.types import NumericType, IntegerType, FloatType
# NOTE: `round` below also shadows the Python built-in; `col` and `when` are
# imported a second time here (already imported above).
from pyspark.sql.functions import col, round, concat_ws, desc, when, concat
from pyspark.sql import functions as F

In [2]:
# Spark settings passed to the session builder (resources, shuffle
# partitioning and Arrow-based pandas conversion).
session_settings = {
    "spark.driver.memory": "64g",
    "spark.executor.memory": "32g",
    "spark.sql.shuffle.partitions": "32",
    "spark.default.parallelism": "16",
    "spark.driver.maxResultSize": "16g",
    "spark.sql.execution.arrow.pyspark.enabled": "true",
}

builder = SparkSession.builder.appName("Maestria_evidencia1")
for setting_name, setting_value in session_settings.items():
    builder = builder.config(setting_name, setting_value)
spark = builder.getOrCreate()

# Eager evaluation makes DataFrames render as tables in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)
spark

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/11 15:07:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/11 15:07:18 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
import os

import kagglehub
from kagglehub import KaggleDatasetAdapter

# The dataset has already been cleaned and modified at this point.
#
# The location used to be hard-coded to one user's home directory, which makes
# the notebook fail on any other machine. It can now be overridden with the
# ECOMMERCE_DATA_PATH environment variable; when the variable is not set the
# original path is used, so existing runs behave exactly as before.
DEFAULT_FILE_PATH = "/Users/pauescalante/Documents/Maestria/Trimestre 7/BigData/big-data-act/DataModified/expanded_database_ecommerce"
file_path = os.environ.get("ECOMMERCE_DATA_PATH", DEFAULT_FILE_PATH)

# header=True takes column names from the first line; inferSchema=True lets
# Spark derive the column types from the data.
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show the first row as a quick sanity check of the load.
df.head()

  from .autonotebook import tqdm as notebook_tqdm
25/05/11 15:07:33 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors
                                                                                

Row(event_time=datetime.date(2019, 11, 17), event_type='view', product_id=5300440, category_id=2053013563173241677, brand='vitek', price=17.76, user_id=513341639, user_session='d9544029-2739-4d16-9cac-79650460d9f0', event_time_ts=datetime.datetime(2019, 11, 17, 5, 35, 32), parent_category='None', subcategory='None', price_bucket='low', day_of_week=1, is_weekend=True)

In [4]:
# Target Spark SQL type for every column, listed in the output column order.
column_types = [
    ("event_time", "timestamp"),
    ("event_type", "string"),
    ("product_id", "int"),
    ("category_id", "long"),
    ("brand", "string"),
    ("price", "float"),
    ("user_id", "int"),
    ("user_session", "string"),
    ("parent_category", "string"),
    ("subcategory", "string"),
    ("price_bucket", "string"),
    ("event_time_ts", "timestamp"),
    ("day_of_week", "int"),
    ("is_weekend", "boolean"),
]

# Each entry becomes "cast(<column> as <type>) <column>", i.e. the column is
# cast and keeps its original name.
cast_expressions = [
    f"cast({column_name} as {sql_type}) {column_name}"
    for column_name, sql_type in column_types
]
df = df.selectExpr(*cast_expressions)

df.printSchema()

root
 |-- event_time: timestamp (nullable = true)
 |-- event_type: string (nullable = true)
 |-- product_id: integer (nullable = true)
 |-- category_id: long (nullable = true)
 |-- brand: string (nullable = true)
 |-- price: float (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- user_session: string (nullable = true)
 |-- parent_category: string (nullable = true)
 |-- subcategory: string (nullable = true)
 |-- price_bucket: string (nullable = true)
 |-- event_time_ts: timestamp (nullable = true)
 |-- day_of_week: integer (nullable = true)
 |-- is_weekend: boolean (nullable = true)



In [5]:
# Build the DataFrame holding only the characterization variables.
characterization_columns = ["event_type", "price_bucket"]
df_characterization = df.select(*characterization_columns)

In [6]:
# Total number of rows; used as the denominator when the probabilities are
# computed in the following cells.
total_count = df_characterization.count()
print(total_count)

                                                                                

109950731


In [7]:
# Probabilities of event_type: the row count of each value divided by the
# total number of rows.
event_type_counts = df_characterization.groupBy("event_type").count()
event_type_probs = event_type_counts.withColumn(
    "probabilidad", F.col("count") / total_count
)

event_type_probs.show()



+----------+---------+--------------------+
|event_type|    count|        probabilidad|
+----------+---------+--------------------+
|      cart|  3955434|0.035974603934193036|
|  purchase|  1659788|0.015095743201561799|
|      view|104335509|  0.9489296528642451|
+----------+---------+--------------------+



                                                                                

In [8]:
# Probabilities of price_bucket: the row count of each value divided by the
# total number of rows.
price_bucket_counts = df_characterization.groupBy("price_bucket").count()
price_bucket_probs = price_bucket_counts.withColumn(
    "probabilidad", F.col("count") / total_count
)

price_bucket_probs.show()



+------------+--------+-------------------+
|price_bucket|   count|       probabilidad|
+------------+--------+-------------------+
|         low|37649697| 0.3424233441431144|
|      medium|39579818| 0.3599777613120189|
|        high|32721216|0.29759889454486665|
+------------+--------+-------------------+



                                                                                

In [9]:
# Alternative method: compute the joint probabilities of
# (event_type, price_bucket) in a single aggregation, rounded to 4 decimals.
rows_in_group = F.count("*")
prob_df_characterization = (
    df_characterization
    .groupBy("event_type", "price_bucket")
    .agg(
        rows_in_group.alias("conteo"),
        F.round(rows_in_group / F.lit(total_count), 4).alias("probabilidad"),
    )
)

In [10]:
# Sort by event_type, descending. event_type is the only sort key, so rows
# that share an event_type come in no guaranteed order.
prob_df_characterization = prob_df_characterization.orderBy(
    F.col("event_type").desc()
)
prob_df_characterization.show()



+----------+------------+--------+------------+
|event_type|price_bucket|  conteo|probabilidad|
+----------+------------+--------+------------+
|      view|      medium|37219285|      0.3385|
|      view|         low|36051044|      0.3279|
|      view|        high|31065180|      0.2825|
|  purchase|      medium|  688662|      0.0063|
|  purchase|         low|  476250|      0.0043|
|  purchase|        high|  494876|      0.0045|
|      cart|         low| 1122403|      0.0102|
|      cart|      medium| 1671871|      0.0152|
|      cart|        high| 1161160|      0.0106|
+----------+------------+--------+------------+



                                                                                

In [11]:
# Add a "combined_col" label that joins event_type and price_bucket with a
# comma (e.g. "view,low"), so each pair can be handled as a single value.
combined_label = concat(col("event_type"), lit(","), col("price_bucket"))
df_characterization = df_characterization.withColumn("combined_col", combined_label)

In [12]:
# Preview the first three rows to check the new combined_col column.
df_characterization.head(3)

[Row(event_type='view', price_bucket='low', combined_col='view,low'),
 Row(event_type='view', price_bucket='high', combined_col='view,high'),
 Row(event_type='view', price_bucket='high', combined_col='view,high')]

In [13]:
# Distinct (event_type, price_bucket) labels, sorted in descending order.
distinct_values = (
    df_characterization
    .select("combined_col")
    .distinct()
    .orderBy(desc("combined_col"))
)

# Use .show() rather than print(): printing a DataFrame renders a table only
# while spark.sql.repl.eagerEval.enabled is on. .show() always prints the rows
# and matches how the other cells display their results.
distinct_values.show()



+---------------+
|   combined_col|
+---------------+
|    view,medium|
|       view,low|
|      view,high|
|purchase,medium|
|   purchase,low|
|  purchase,high|
|    cart,medium|
|       cart,low|
|      cart,high|
+---------------+



                                                                                

In [14]:
# Try random sampling of the full dataset: every row is kept with probability
# `fraction` (without replacement), so the sample size is only approximately
# fraction * total rows.
#
# A fixed seed makes the sample reproducible across kernel restarts; without
# it every run draws a different sample and the numbers below change.
SAMPLE_SEED = 42
sample_df = df.sample(withReplacement=False, fraction=0.0001, seed=SAMPLE_SEED)

# The sample is small and reused by the following cells; caching it avoids
# re-scanning the full dataset for each of those actions.
sample_df = sample_df.cache()

total_count_sample = sample_df.count()
print(total_count_sample)



11161


                                                                                

In [15]:
# Joint probabilities of (event_type, price_bucket) within the sample,
# rounded to 4 decimals, to compare against the full dataset.
sample_rows_in_group = F.count("*")
prob_df_sample = (
    sample_df
    .groupBy("event_type", "price_bucket")
    .agg(
        sample_rows_in_group.alias("conteo"),
        F.round(sample_rows_in_group / F.lit(total_count_sample), 4).alias("probabilidad"),
    )
)

In [16]:
# Attach the probability previously computed on the whole dataset as an extra
# column next to the sample probability.
previous_probability = prob_df_characterization.withColumnRenamed(
    "probabilidad", "previous_calculated_probability"
)

join_keys = ["event_type", "price_bucket"]
full_dataset_probs = previous_probability.select(
    *join_keys, "previous_calculated_probability"
)

# Left join keeps every (event_type, price_bucket) pair found in the sample.
result_df = (
    prob_df_sample
    .join(full_dataset_probs, on=join_keys, how="left")
    .orderBy(desc("event_type"))
)
result_df.show()



+----------+------------+------+------------+-------------------------------+
|event_type|price_bucket|conteo|probabilidad|previous_calculated_probability|
+----------+------------+------+------------+-------------------------------+
|      view|      medium|  3815|      0.3418|                         0.3385|
|      view|         low|  3637|      0.3259|                         0.3279|
|      view|        high|  3113|      0.2789|                         0.2825|
|  purchase|      medium|    82|      0.0073|                         0.0063|
|  purchase|         low|    47|      0.0042|                         0.0043|
|  purchase|        high|    66|      0.0059|                         0.0045|
|      cart|      medium|   165|      0.0148|                         0.0152|
|      cart|        high|   130|      0.0116|                         0.0106|
|      cart|         low|   106|      0.0095|                         0.0102|
+----------+------------+------+------------+-------------------

                                                                                

In [18]:
# Check the 5 most frequent parent_category values.
parent_category_counts = df.groupBy("parent_category").count()
parent_category_counts.orderBy(desc("count")).show(5, truncate=False)



+---------------+--------+
|parent_category|count   |
+---------------+--------+
|electronics    |40141700|
|None           |35413777|
|appliances     |13457119|
|computers      |6505575 |
|apparel        |4554025 |
+---------------+--------+
only showing top 5 rows



                                                                                

In [19]:
# Check the 5 most frequent subcategory values.
subcategory_counts = df.groupBy("subcategory").count()
subcategory_counts.orderBy(desc("count")).show(5, truncate=False)



+-----------+--------+
|subcategory|count   |
+-----------+--------+
|None       |35413777|
|smartphone |27882227|
|clocks     |3397998 |
|video.tv   |3321794 |
|notebook   |3318177 |
+-----------+--------+
only showing top 5 rows



                                                                                