<a href="https://colab.research.google.com/github/luismiguelmartinluengo/PySpark_Demos/blob/main/Labo_RDD_Grosery_store.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
from pyspark.sql import SparkSession

In [None]:
# Path of the grocery store CSV file under the mounted Google Drive folder.
dataPath = '/content/drive/MyDrive/Colab Notebooks/data/grocery_store_data.csv'

In [None]:
# Get the active Spark session, or create one registered under the
# application name 'Grocery Store Data'.
sparkSession = (
    SparkSession.builder
    .appName('Grocery Store Data')
    .getOrCreate()
)

In [None]:
# Load the CSV into a DataFrame (the first line holds the column names and
# the column types are inferred), then expose it as an RDD of Row objects.
dataFrame = sparkSession.read.csv(
    path=dataPath,
    header=True,
    inferSchema=True,
)
rdd = dataFrame.rdd
# Show the first record as a quick sanity check of the load.
rdd.first()

Row(ItemID=1001, ItemName='Cheese', Category='Fruits', Quantity=15, PricePerUnit=1.64, TotalSales=24.599999999999998)

In [None]:
# Project every record to an (ItemName, Quantity) pair so quantities can be
# analysed per product.
rddItemQuantity = rdd.map(lambda row: (row['ItemName'], row['Quantity']))
# Sample 5 pairs without replacement. The fixed seed keeps the displayed
# sample repeatable across runs over the same data; without it every run
# shows different rows and the saved output cannot be reproduced.
rddItemQuantity.takeSample(False, 5, seed=42)

[('Bread', 38), ('Beef', 60), ('Milk', 23), ('Bread', 40), ('Milk', 12)]

In [None]:
# Keep only the sales records with more than 10 units sold.
rddHighSales = rddItemQuantity.filter(lambda pair: pair[1] > 10)
# Negating the quantity in the key makes top() return the five records with
# the smallest quantities that passed the filter, which shows the lower bound.
rddHighSales.top(5, key=lambda pair: -pair[1])

[('Apple', 11), ('Bread', 11), ('Milk', 11), ('Orange', 11), ('Bread', 11)]

In [None]:
# Extract the distinct category names that appear in the data.
rddCategories = (
    rdd
    .map(lambda row: row['Category'])
    .distinct()
)
rddCategories.collect()

['Fruits', 'Bakery', 'Meat', 'Dairy']

In [None]:
# Compare the number of records before and after dropping duplicate rows
# to check whether the source data contains repeated records.
rddDistinct = rdd.distinct()
print('El RDD original tiene ', rdd.count(), 'registros')
print('El RDD distinct tiene ', rddDistinct.count(), 'registros')
# No unpersist() call here: rddDistinct is never persisted or cached in this
# cell, so there is nothing to release, and leaving the call as the last
# expression only printed a stray RDD repr as cell output.

El RDD original tiene  1000 registros
El RDD distinct tiene  1000 registros


PythonRDD[103] at RDD at PythonRDD.scala:53

In [None]:
# Keep only the records whose Category is 'Dairy'.
rddDairy = rdd.filter(
    lambda row: row['Category'] == 'Dairy'
)
rddDairy.take(5)

[Row(ItemID=1016, ItemName='Bread', Category='Dairy', Quantity=8, PricePerUnit=18.7, TotalSales=149.6),
 Row(ItemID=1020, ItemName='Banana', Category='Dairy', Quantity=62, PricePerUnit=15.82, TotalSales=980.84),
 Row(ItemID=1022, ItemName='Apple', Category='Dairy', Quantity=62, PricePerUnit=8.49, TotalSales=526.38),
 Row(ItemID=1025, ItemName='Chicken', Category='Dairy', Quantity=84, PricePerUnit=11.65, TotalSales=978.6),
 Row(ItemID=1028, ItemName='Orange', Category='Dairy', Quantity=6, PricePerUnit=7.56, TotalSales=45.36)]

In [None]:
# Compute the sales amount of each record as Quantity * PricePerUnit and keep
# it next to the item id and name. (The source file already carries this
# value in its TotalSales column; it is recomputed here as an exercise.)
rddItemSales = rdd.map(
    lambda row: (
        row['ItemID'],
        row['ItemName'],
        row['Quantity'] * row['PricePerUnit'],
    )
)
rddItemSales.take(5)

[(1001, 'Cheese', 24.599999999999998),
 (1002, 'Milk', 687.9599999999999),
 (1003, 'Butter', 695.64),
 (1004, 'Bread', 700.7),
 (1005, 'Cheese', 1191.0)]

In [None]:
# Total sales per ItemName: re-key each record as (ItemName, sales amount)
# and add up the amounts that share the same name.
rddTotalSalesByItemName = (
    rddItemSales
    .map(lambda sale: (sale[1], sale[2]))
    .reduceByKey(lambda total, amount: total + amount)
)
rddTotalSalesByItemName.take(5)

[('Cheese', 44547.240000000005),
 ('Milk', 47475.599999999984),
 ('Butter', 44820.42),
 ('Bread', 51383.13),
 ('Beef', 45694.43000000002)]

In [None]:
# Top 5 items by total sales, obtained in three equivalent ways.
# 1) top() returns the largest elements according to the key.
print(rddTotalSalesByItemName.top(5, key=lambda pair: pair[1]))
# 2) takeOrdered() returns elements in ascending order by default, so the
#    key is negated to get the largest totals first.
print(rddTotalSalesByItemName.takeOrdered(5, key=lambda pair: -pair[1]))
# 3) Sort the whole RDD in descending order and take the first five.
print(
    rddTotalSalesByItemName
    .sortBy(lambda pair: pair[1], ascending=False)
    .take(5)
)

[('Eggs', 54960.07), ('Apple', 54690.859999999986), ('Orange', 53005.270000000026), ('Bread', 51383.13), ('Milk', 47475.599999999984)]
[('Eggs', 54960.07), ('Apple', 54690.859999999986), ('Orange', 53005.270000000026), ('Bread', 51383.13), ('Milk', 47475.599999999984)]
[('Eggs', 54960.07), ('Apple', 54690.859999999986), ('Orange', 53005.270000000026), ('Bread', 51383.13), ('Milk', 47475.599999999984)]


In [None]:
# Total sales per category, using the TotalSales column that already exists
# in the source table instead of recomputing it.
rddTotalSalesByCategory = (
    rdd
    .map(lambda row: (row['Category'], row['TotalSales']))
    .reduceByKey(lambda total, amount: total + amount)
)
rddTotalSalesByCategory.take(5)

[('Fruits', 133819.93000000002),
 ('Bakery', 117350.73000000001),
 ('Meat', 120920.83999999997),
 ('Dairy', 112181.76000000008)]