# Data analysis with Apache Spark - Dataframe 1

In [1]:
import findspark

In [2]:
# Point findspark at the local Spark distribution; this has to run before the
# pyspark imports below so that the package can be found.
findspark.init("/home/alumno/spark-3.2.2-bin-hadoop2.7")
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

# Single local master, application name "intro".
conf = SparkConf().setAppName("intro").setMaster("local")

# getOrCreate() reuses the session (and its SparkContext) when one is already
# running. Constructing SparkContext(conf=conf) directly fails with
# "Cannot run multiple SparkContexts at once" whenever this cell is re-executed
# in a live kernel.
spark = SparkSession.builder.config(conf=conf).getOrCreate()
sc = spark.sparkContext


In [3]:
# Load the customers CSV: the first line holds the column names and the column
# types are inferred from the data.
c = spark.read.csv(
    "/home/alumno/Descargas/customers.csv",
    header=True,
    inferSchema=True,
)

In [4]:
# Print the inferred column types, then preview the first five rows.
c.printSchema()
c.show(n=5)

root
 |-- date: string (nullable = true)
 |-- time: string (nullable = true)
 |-- customer: integer (nullable = true)
 |-- product: integer (nullable = true)
 |-- quantity: integer (nullable = true)
 |-- price: integer (nullable = true)

+----------+--------+--------+-------+--------+-----+
|      date|    time|customer|product|quantity|price|
+----------+--------+--------+-------+--------+-----+
|05/10/2018| 2:20 PM|     100|      1|      10|  816|
|06/10/2018| 3:30 PM|     100|      1|      10|    1|
|07/10/2018| 5:20 PM|     100|      1|      10|   10|
|04/08/2018|11:38 PM|     100|      2|       8|   79|
|25/03/2018| 3:52 AM|     100|      3|       1|   91|
+----------+--------+--------+-------+--------+-----+
only showing top 5 rows



## 1. Easy questions with select and expressions

Which products are in my dataset?

In [5]:
from pyspark.sql.functions import expr, desc, asc

In [6]:
# select() keeps one or more columns, referenced by name
c.select("customer", "product").show(10)

# expr() parses a SQL fragment, so "as" can rename a column in the result
renamed_columns = [expr("customer"), expr("product"), expr("quantity as q")]
c.select(*renamed_columns).show()

+--------+-------+
|customer|product|
+--------+-------+
|     100|      1|
|     100|      1|
|     100|      1|
|     100|      2|
|     100|      3|
|     100|      4|
|     100|      5|
|     100|      6|
|     100|      7|
|     100|      8|
+--------+-------+
only showing top 10 rows

+--------+-------+---+
|customer|product|  q|
+--------+-------+---+
|     100|      1| 10|
|     100|      1| 10|
|     100|      1| 10|
|     100|      2|  8|
|     100|      3|  1|
|     100|      4|  3|
|     100|      5|  8|
|     100|      6|  8|
|     100|      7|  4|
|     100|      8|  5|
|     100|      9|  9|
|     100|     10|  9|
|     100|      1|  3|
|     100|      2|  6|
|     100|      3|  1|
|     100|      4|  7|
|     100|      5|  3|
|     100|      6|  5|
|     100|      7|  7|
|     100|      8|  0|
+--------+-------+---+
only showing top 20 rows



Which are my products and quantities in my dataset?

In [7]:
# Keep only the product and quantity columns
c.select(c["product"], c["quantity"]).show()

# expr("*") expands to every column, i.e. the whole table
every_column = expr("*")
c.select(every_column).show()

+-------+--------+
|product|quantity|
+-------+--------+
|      1|      10|
|      1|      10|
|      1|      10|
|      2|       8|
|      3|       1|
|      4|       3|
|      5|       8|
|      6|       8|
|      7|       4|
|      8|       5|
|      9|       9|
|     10|       9|
|      1|       3|
|      2|       6|
|      3|       1|
|      4|       7|
|      5|       3|
|      6|       5|
|      7|       7|
|      8|       0|
+-------+--------+
only showing top 20 rows

+----------+--------+--------+-------+--------+-----+
|      date|    time|customer|product|quantity|price|
+----------+--------+--------+-------+--------+-----+
|05/10/2018| 2:20 PM|     100|      1|      10|  816|
|06/10/2018| 3:30 PM|     100|      1|      10|    1|
|07/10/2018| 5:20 PM|     100|      1|      10|   10|
|04/08/2018|11:38 PM|     100|      2|       8|   79|
|25/03/2018| 3:52 AM|     100|      3|       1|   91|
|24/07/2018|11:37 AM|     100|      4|       3|   59|
|10/01/2018| 9:17 PM|     100|  

Which products have been purchased more than 1 time?

In [8]:
# A comparison inside expr() yields a boolean column with one value per row;
# it flags the rows, it does not filter them.
more_than_one = expr("quantity > 1")
c.select(more_than_one).show(5)
# the same flag next to the product id
c.select(expr("product"), more_than_one).show(5)
# the same flag appended to all the original columns
c.select(expr("*"), more_than_one).show(5)

+--------------+
|(quantity > 1)|
+--------------+
|          true|
|          true|
|          true|
|          true|
|         false|
+--------------+
only showing top 5 rows

+-------+--------------+
|product|(quantity > 1)|
+-------+--------------+
|      1|          true|
|      1|          true|
|      1|          true|
|      2|          true|
|      3|         false|
+-------+--------------+
only showing top 5 rows

+----------+--------+--------+-------+--------+-----+--------------+
|      date|    time|customer|product|quantity|price|(quantity > 1)|
+----------+--------+--------+-------+--------+-----+--------------+
|05/10/2018| 2:20 PM|     100|      1|      10|  816|          true|
|06/10/2018| 3:30 PM|     100|      1|      10|    1|          true|
|07/10/2018| 5:20 PM|     100|      1|      10|   10|          true|
|04/08/2018|11:38 PM|     100|      2|       8|   79|          true|
|25/03/2018| 3:52 AM|     100|      3|       1|   91|         false|
+----------+--------

In [9]:
# selectExpr(...) takes SQL fragments directly, so expr() is not needed.
flag_examples = [
    # shorter way of writing the previous cell's last select
    ("*", "quantity > 1"),
    # several column expressions can be combined in one call
    ("*", "customer = 100", "price > 10"),
    # columns can be compared with each other using arithmetic operations
    ("*", "price > quantity*10"),
]
for sql_columns in flag_examples:
    c.selectExpr(*sql_columns).show(5)

+----------+--------+--------+-------+--------+-----+--------------+
|      date|    time|customer|product|quantity|price|(quantity > 1)|
+----------+--------+--------+-------+--------+-----+--------------+
|05/10/2018| 2:20 PM|     100|      1|      10|  816|          true|
|06/10/2018| 3:30 PM|     100|      1|      10|    1|          true|
|07/10/2018| 5:20 PM|     100|      1|      10|   10|          true|
|04/08/2018|11:38 PM|     100|      2|       8|   79|          true|
|25/03/2018| 3:52 AM|     100|      3|       1|   91|         false|
+----------+--------+--------+-------+--------+-----+--------------+
only showing top 5 rows

+----------+--------+--------+-------+--------+-----+----------------+------------+
|      date|    time|customer|product|quantity|price|(customer = 100)|(price > 10)|
+----------+--------+--------+-------+--------+-----+----------------+------------+
|05/10/2018| 2:20 PM|     100|      1|      10|  816|            true|        true|
|06/10/2018| 3:30 

## 2. Aggregation values of our DataFrame with selectExpr

In [10]:
# An aggregate function in selectExpr() is applied over the entire DataFrame
# and produces a single-row result.
for aggregate in ("sum(price)", "avg(price)"):
    c.selectExpr(aggregate).show()

+----------+
|sum(price)|
+----------+
|     51052|
+----------+

+----------------+
|      avg(price)|
+----------------+
|50.9500998003992|
+----------------+



What is the main difference between these two?

In [11]:
# count(customer) counts every row that has a customer value, whereas
# count(distinct customer) counts each customer id only once.
customer_rows = "count(customer)"
c.selectExpr(customer_rows).show()
c.selectExpr("count(distinct customer)").show()

# several aggregations can be requested in the same call
c.selectExpr("avg(price)", customer_rows).show()

+---------------+
|count(customer)|
+---------------+
|           1002|
+---------------+

+------------------------+
|count(DISTINCT customer)|
+------------------------+
|                      31|
+------------------------+

+----------------+---------------+
|      avg(price)|count(customer)|
+----------------+---------------+
|50.9500998003992|           1002|
+----------------+---------------+



## 4. Sorting elements of the Dataset

In [12]:
# Sort by a single column (ascending, as the output below shows)
c.orderBy("price").show(5)
# Sort by price first; rows with the same price are then ordered by customer
c.orderBy("price", "customer").show(5)

# Ascending and descending order can be mixed per column. The secondary key has
# to be a different column than the primary one, otherwise it never takes
# effect: customers from highest to lowest id, cheapest purchase first.
c.orderBy(desc("customer"), asc("price")).show(5)

# `date` was read as a dd/MM/yyyy string, so comparing the raw strings would be
# lexicographic (day first) rather than chronological. Parse both sides with
# to_date() so that the filter really keeps purchases made after 16/09/2018.
after_cutoff = expr("to_date(date, 'dd/MM/yyyy') > to_date('16/09/2018', 'dd/MM/yyyy')")
c.where(after_cutoff).orderBy(desc("customer"), desc("price")).show(5)


+----------+--------+--------+-------+--------+-----+
|      date|    time|customer|product|quantity|price|
+----------+--------+--------+-------+--------+-----+
|23/06/2018|12:49 AM|     110|      8|       8|    0|
|22/08/2018| 3:38 PM|     124|      5|       4|    0|
|07/07/2018| 6:23 PM|     112|      6|       4|    0|
|16/03/2018|12:21 PM|     107|      3|       7|    0|
|03/12/2017| 2:38 PM|     113|      9|       3|    0|
+----------+--------+--------+-------+--------+-----+
only showing top 5 rows

+----------+--------+--------+-------+--------+-----+
|      date|    time|customer|product|quantity|price|
+----------+--------+--------+-------+--------+-----+
|24/04/2018| 2:01 PM|     106|      4|       0|    0|
|16/03/2018|12:21 PM|     107|      3|       7|    0|
|23/06/2018|12:49 AM|     110|      8|       8|    0|
|07/07/2018| 6:23 PM|     112|      6|       4|    0|
|03/12/2017| 2:38 PM|     113|      9|       3|    0|
+----------+--------+--------+-------+--------+-----+
onl

## 5. Generating groups of data elements

In [13]:
# All the examples below group the rows by customer id
by_customer = c.groupBy("customer")

# How many purchases (rows), regardless of product, we have for each customer
by_customer.count().show(3)

# Total number of products each customer has bought
by_customer.agg(expr("sum(quantity)")).show(3)

# avg() without arguments averages every numeric column of the group
by_customer.avg().show(3)

# agg() accepts several aggregate expressions at once
by_customer.agg(expr("avg(quantity)"), expr("stddev_pop(quantity)")).show(3)

by_customer.agg(expr("avg(quantity)"), expr("max(price)")).show(3)

+--------+-----+
|customer|count|
+--------+-----+
|     108|   33|
|     101|   33|
|     115|   33|
+--------+-----+
only showing top 3 rows

+--------+-------------+
|customer|sum(quantity)|
+--------+-------------+
|     108|          129|
|     101|          196|
|     115|          143|
+--------+-------------+
only showing top 3 rows

+--------+-------------+-----------------+------------------+------------------+
|customer|avg(customer)|     avg(product)|     avg(quantity)|        avg(price)|
+--------+-------------+-----------------+------------------+------------------+
|     108|        108.0|5.545454545454546| 3.909090909090909| 49.93939393939394|
|     101|        101.0|5.454545454545454|5.9393939393939394| 49.27272727272727|
|     115|        115.0|5.636363636363637| 4.333333333333333|49.666666666666664|
+--------+-------------+-----------------+------------------+------------------+
only showing top 3 rows

+--------+------------------+--------------------+
|customer|   

## 6. Joining Dataframes

In [14]:
# Both tables are CSV files with a header row; column types are inferred.
products = spark.read.csv("products.csv", header=True, inferSchema=True)

stock = spark.read.csv("stock.csv", header=True, inferSchema=True)


Show which elements in stock could be a target for a discount

In [15]:
# Inner join: keep only the rows whose id is present in both tables.
# Each table contributes its own ID column to the result.
same_id = products["id"] == stock["id"]
joined = products.join(stock, on=same_id, how="inner")
joined.show()

+----+-----+-----+----+-----+--------+
|  ID| NAME|COLOR|  ID|PRICE|STOCKNUM|
+----+-----+-----+----+-----+--------+
|1234|chair| blue|1234|  125|    1000|
|   1|table|black|   1|  816|     100|
|   2|  jar|white|   2|   46|       1|
|   3|  pan|  red|   3|   54|      22|
+----+-----+-----+----+-----+--------+



- products is our base table 
- stock is our joining table 
- products["id"] == stock["id"] is our joining condition. That is, all joined product ids must exist in both the products and stock tables
- "inner" is the kind of join operation we are using for the operation 

We have many kinds of join operations, some of them are: 

- natural joins: join by matching the columns between left and right with the same names 
- inner joins: keep rows with keys in the left and right
- outer joins: keep rows with keys in either the left or the right 
- left outer/right outer: keep rows in the left (or right) when keys are in the left (or right) dataset 
- left_anti: keep rows in the left where they do not appear in the right

## Questions to solve with customers.csv dataset

Use any of the previous transformations to find the answer to these questions. You can export your Jupyter notebook final version as part of the deliverable. For each question, you must provide the following information: 
- What command are you going to use? Why? 
- Which is your Spark operation to solve the question? 
- What output does your Spark command produce? (3 lines max.)

### Question 1:
### How many elements can we find (in our DataFrame)? 

We will use the command count() to find how many elements we have in the DataFrame.

The operation is c.count()

In [16]:
# count() returns the number of rows in the DataFrame
elements_count = c.count()
print("Number of elements in the DataFrame:", elements_count)

Number of elements in the DataFrame: 1002


### Question 2:
### How many unique customers? 


We will use the command distinct() and count()

In [17]:
# Reduce the frame to one row per customer id, then count those rows
distinct_customer_ids = c.select("customer").distinct()
unique_customers = distinct_customer_ids.count()

print(f"Unique customers: {unique_customers}")

Unique customers: 31


### Question 3:
### How many products were purchased by each customer? 

We will use the command groupBy() and agg()

In [18]:
# Sum the purchased quantities of each customer under a readable column name
total_quantity = expr("sum(quantity) as total_quantity")
products_per_customer = c.groupBy("customer").agg(total_quantity)
products_per_customer.show(3)


+--------+--------------+
|customer|total_quantity|
+--------+--------------+
|     108|           129|
|     101|           196|
|     115|           143|
+--------+--------------+
only showing top 3 rows



### Question 4:
### Sort customers by quantity 

In [19]:
# Largest total quantity first; reuses the per-customer totals from Question 3
sorted_customers_by_quantity = products_per_customer.orderBy(
    "total_quantity", ascending=False
)

sorted_customers_by_quantity.show(3)

+--------+--------------+
|customer|total_quantity|
+--------+--------------+
|     101|           196|
|     122|           179|
|     117|           176|
+--------+--------------+
only showing top 3 rows



### Question 5:
### How many times has customer id number 100 purchased more than 5 items?

In [20]:
# Purchases made by customer 100 in which more than 5 items were bought
customer100_5 = c.where(expr("customer = 100 AND quantity > 5"))
customer100_5.show(3)

# Each remaining row is one such purchase
counter = customer100_5.count()
print(f"Customer with id 100 has purchased {counter} times more than 5 items.")


+----------+-------+--------+-------+--------+-----+
|      date|   time|customer|product|quantity|price|
+----------+-------+--------+-------+--------+-----+
|05/10/2018|2:20 PM|     100|      1|      10|  816|
|06/10/2018|3:30 PM|     100|      1|      10|    1|
|07/10/2018|5:20 PM|     100|      1|      10|   10|
+----------+-------+--------+-------+--------+-----+
only showing top 3 rows

Customer with id 100 has purchased 16 times more than 5 items.


### Question 6:
### Which were the products bought by customer with the largest number of transactions? We are interested in the customer that has done more purchases. You do not need to consider quantities of products, just how many times a customer has done a transaction.

In [21]:
# One row per customer with its number of transactions, busiest customer first
transactions = c.groupBy("customer").count().sort("count", ascending=False)
transactions.show(3)

# The first row of the sorted result belongs to the customer with most transactions
top_row = transactions.head(1)[0]
first_customer_transactions = top_row["customer"]
print(f"The customer with the largest number of transactions is: {first_customer_transactions}")

# Distinct products that appear in that customer's purchases
top_customer_purchases = c.where(c["customer"] == first_customer_transactions)
prod = top_customer_purchases.select("product").distinct()
prod.show()

+--------+-----+
|customer|count|
+--------+-----+
|     100|   35|
|     108|   33|
|     115|   33|
+--------+-----+
only showing top 3 rows

The customer with the largest number of transactions is: 100
+-------+
|product|
+-------+
|      1|
|      6|
|      3|
|      5|
|      9|
|      4|
|      8|
|      7|
|     10|
|      2|
+-------+

