In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, sum, when, col, lit

spark = SparkSession.builder \
    .appName("Maestria_tarea3") \
    .config("spark.driver.memory", "64g") \
    .config("spark.executor.memory", "32g") \
    .config("spark.sql.shuffle.partitions", "32") \
    .config("spark.default.parallelism", "16") \
    .config("spark.driver.maxResultSize", "16g") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "true").getOrCreate()

In [17]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

####
#### Opcion 1 para conseguir datos
#### Bajar el archivo de Kaggle
####
####
file_path = "/Users/pauescalante8/Documents/BigData/Datos/2019-Nov.csv"
df_pandas = spark.read.csv(file_path, header=True, inferSchema=True)
spark_df = df_pandas[['event_type', 'user_id', 'product_id', 'category_id', 'price']]

####
####

####
#### Opcion 2 para conseguir datos
#### Bajar el archivo de Kaggle
####
####

#file_path = "2019-Nov.csv"

#df_pandas = kagglehub.load_dataset(
#    KaggleDatasetAdapter.PANDAS,
#    "mkechinov/ecommerce-behavior-data-from-multi-category-store",
#    file_path
#)

#spark_df = spark.createDataFrame(
#    df_pandas[['event_type', 'user_id', 'product_id', 'category_id', 'price']]
#)

####
####

df_pandas.head()

                                                                                

Row(event_time=datetime.datetime(2019, 11, 1, 9, 0), event_type='view', product_id=1003461, category_id=2053013555631882655, category_code='electronics.smartphone', brand='xiaomi', price=489.07, user_id=520088904, user_session='4d3b30da-a5e4-49df-b1a8-ba5943f1dd33')

In [18]:
spark_df.createOrReplaceTempView("events")

In [19]:
resultado = spark_df.groupBy("event_type").count()
resultado.show()



+----------+--------+
|event_type|   count|
+----------+--------+
|      cart| 3028930|
|  purchase|  916939|
|      view|63556110|
+----------+--------+



                                                                                

### Cuantos eventos de cada tipo tenemos?

In [20]:
%%time
query = """
SELECT event_type, COUNT(event_type)
FROM events
GROUP BY event_type
"""
event_resume = spark.sql(query)
event_resume.show()



+----------+-----------------+
|event_type|count(event_type)|
+----------+-----------------+
|      cart|          3028930|
|  purchase|           916939|
|      view|         63556110|
+----------+-----------------+

CPU times: user 11.7 ms, sys: 9.9 ms, total: 21.6 ms
Wall time: 13 s


                                                                                

### Resumen por usuario

In [21]:
# Tabla de los primeros 20 clientes con información de las columnas de views + purchases
query = """
SELECT user_id, sum(if(event_type='view', 1, 0)) as views, sum(if(event_type='purchase', 2, 0)) as purchases
FROM events
GROUP BY user_id
"""
user_resume = spark.sql(query)
user_resume.show()



+---------+-----+---------+
|  user_id|views|purchases|
+---------+-----+---------+
|556727865|   41|        2|
|565921426|    1|        0|
|566138387|    4|        0|
|516426931|   59|       22|
|533312397|   47|        0|
|560134869|  382|       26|
|559199036|   14|        0|
|555365433|    6|        0|
|566281616|    1|        0|
|566281584|    1|        0|
|566281648|    1|        0|
|543906197|   49|        0|
|518691953|  457|        0|
|560796792|    2|        0|
|514967216|    6|        0|
|566282420|    1|        0|
|562855774|   47|        0|
|566282561|    3|        0|
|554869204|    1|        0|
|547211786|  168|       26|
+---------+-----+---------+
only showing top 20 rows



                                                                                

### ¿Que % de usuarios compran?

In [34]:
total_users = user_resume.count()

# Número de usuarios con purchases > 0
users_with_purchases = user_resume.filter(col("purchases") > 0).count()

# Porcentaje
purchase_percentage = (users_with_purchases / total_users) * 100

print(f"Porcentaje de usuarios con al menos una compra: {purchase_percentage:.2f}%")



Porcentaje de usuarios con al menos una compra: 11.95%


                                                                                

### Pandas Analisis

In [24]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
df_pd = pd.read_csv('/Users/pauescalante8/Documents/BigData/Datos/2019-Nov.csv')

In [25]:
df_pd.head(5)

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-11-01 00:00:00 UTC,view,1003461,2053013555631882655,electronics.smartphone,xiaomi,489.07,520088904,4d3b30da-a5e4-49df-b1a8-ba5943f1dd33
1,2019-11-01 00:00:00 UTC,view,5000088,2053013566100866035,appliances.sewing_machine,janome,293.65,530496790,8e5f4f83-366c-4f70-860e-ca7417414283
2,2019-11-01 00:00:01 UTC,view,17302664,2053013553853497655,,creed,28.31,561587266,755422e7-9040-477b-9bd2-6a6e8fd97387
3,2019-11-01 00:00:01 UTC,view,3601530,2053013563810775923,appliances.kitchen.washer,lg,712.87,518085591,3bfb58cd-7892-48cc-8020-2f17e6de6e7f
4,2019-11-01 00:00:01 UTC,view,1004775,2053013555631882655,electronics.smartphone,xiaomi,183.27,558856683,313628f1-68b8-460d-84f6-cec7a8796ef2


In [26]:
df_pd.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,67501980.0,67501980.0,67501980.0,67501980.0
mean,12514060.0,2.057898e+18,292.4593,538639700.0
std,17257410.0,2.012549e+16,355.6745,22885160.0
min,1000365.0,2.053014e+18,0.0,10300220.0
25%,1305977.0,2.053014e+18,69.24,516476200.0
50%,5100568.0,2.053014e+18,165.77,535057300.0
75%,17300750.0,2.053014e+18,360.34,561079400.0
max,100028600.0,2.187708e+18,2574.07,579969900.0


In [27]:
df_pd.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67501979 entries, 0 to 67501978
Data columns (total 9 columns):
 #   Column         Dtype  
---  ------         -----  
 0   event_time     object 
 1   event_type     object 
 2   product_id     int64  
 3   category_id    int64  
 4   category_code  object 
 5   brand          object 
 6   price          float64
 7   user_id        int64  
 8   user_session   object 
dtypes: float64(1), int64(3), object(5)
memory usage: 4.5+ GB


In [28]:
df_pd.describe()

Unnamed: 0,product_id,category_id,price,user_id
count,67501980.0,67501980.0,67501980.0,67501980.0
mean,12514060.0,2.057898e+18,292.4593,538639700.0
std,17257410.0,2.012549e+16,355.6745,22885160.0
min,1000365.0,2.053014e+18,0.0,10300220.0
25%,1305977.0,2.053014e+18,69.24,516476200.0
50%,5100568.0,2.053014e+18,165.77,535057300.0
75%,17300750.0,2.053014e+18,360.34,561079400.0
max,100028600.0,2.187708e+18,2574.07,579969900.0


In [29]:
df_pd.isnull().sum()

event_time              0
event_type              0
product_id              0
category_id             0
category_code    21898171
brand             9224078
price                   0
user_id                 0
user_session           10
dtype: int64

In [30]:
df_pd.count

<bound method DataFrame.count of                        event_time event_type  product_id          category_id  \
0         2019-11-01 00:00:00 UTC       view     1003461  2053013555631882655   
1         2019-11-01 00:00:00 UTC       view     5000088  2053013566100866035   
2         2019-11-01 00:00:01 UTC       view    17302664  2053013553853497655   
3         2019-11-01 00:00:01 UTC       view     3601530  2053013563810775923   
4         2019-11-01 00:00:01 UTC       view     1004775  2053013555631882655   
...                           ...        ...         ...                  ...   
67501974  2019-11-30 23:59:58 UTC       view    15700137  2053013559733912211   
67501975  2019-11-30 23:59:58 UTC       view    28719425  2053013565639492569   
67501976  2019-11-30 23:59:59 UTC       view     1004833  2053013555631882655   
67501977  2019-11-30 23:59:59 UTC       view     2701706  2053013563911439225   
67501978  2019-11-30 23:59:59 UTC       view     1004233  205301355563188265

In [31]:
print(df_pd.apply(lambda col: col.unique()))

event_time       [2019-11-01 00:00:00 UTC, 2019-11-01 00:00:01 ...
event_type                                  [view, cart, purchase]
product_id       [1003461, 5000088, 17302664, 3601530, 1004775,...
category_id      [2053013555631882655, 2053013566100866035, 205...
category_code    [electronics.smartphone, appliances.sewing_mac...
brand            [xiaomi, janome, creed, lg, hp, rondell, miche...
price            [489.07, 293.65, 28.31, 712.87, 183.27, 360.09...
user_id          [520088904, 530496790, 561587266, 518085591, 5...
user_session     [4d3b30da-a5e4-49df-b1a8-ba5943f1dd33, 8e5f4f8...
dtype: object


In [32]:
pd.Series({col:df_pd[col].nunique() for col in df_pd})

event_time        2549559
event_type              3
product_id         190662
category_id           684
category_code         129
brand                4200
price               60435
user_id           3696117
user_session     13776050
dtype: int64

In [None]:
spark.stop()