In [1]:
import pyspark
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.sql.functions import *
from pyspark.sql import DataFrameStatFunctions, DataFrame
from pyspark.sql.types import *
import time

In [2]:
# https://spark.apache.org/docs/latest/configuration.html
conf = SparkConf()
conf.set("spark.driver.memory", "16g")
conf.set("spark.driver.cores", 4)
conf.set("spark.driver.memoryOverhead", 0.9)
conf.set("spark.executor.memory", "32g")
conf.set("spark.executor.cores", 11)

<pyspark.conf.SparkConf at 0x7f0980a83c50>

In [3]:
sc = SparkContext(master = "local", sparkHome="/usr/local/spark/", 
                  appName="examen-ma-1-3", conf=conf)
sql = SQLContext(sc)

In [4]:
ratings = sql.read.csv("movielens/ratings.csv", header=True, inferSchema=True)
ratings.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



In [5]:
ratings_hll = ratings.select("userId", "timestamp").\
    orderBy(col("timestamp")).limit(1000000)
ratings_hll.printSchema()
ratings_hll.show(5)

root
 |-- userId: integer (nullable = true)
 |-- timestamp: integer (nullable = true)

+------+---------+
|userId|timestamp|
+------+---------+
| 28507|789652004|
|131160|789652009|
|131160|789652009|
|131160|789652009|
| 20821|822873600|
+------+---------+
only showing top 5 rows



## Conteo Probabilístico

In [6]:
start = time.time()
ratings_hll.select(approxCountDistinct("userId", rsd=0.05).alias("Conteo_Aprox")).show()
end = time.time()
print("Tiempo que tomó el cálculo: %d segundos"% (end-start))

+------------+
|Conteo_Aprox|
+------------+
|       16262|
+------------+

Tiempo que tomó el cálculo: 47 segundos


## Cuantiles

In [7]:
start = time.time()
ratings_hll.approxQuantile("userId", [0.1, 0.5, 0.9], 0.05)
end = time.time()
print("Tiempo que tomó el cálculo: %d segundos"% (end-start))

Tiempo que tomó el cálculo: 41 segundos


In [8]:
sc.stop()

## Referencias

* <https://databricks.com/blog/2016/05/19/approximate-algorithms-in-apache-spark-hyperloglog-and-quantiles.html>
* [PySpark Cheat Sheet](https://s3.amazonaws.com/assets.datacamp.com/blog_assets/PySpark_Cheat_Sheet_Python.pdf)
* <https://blog.insightdatascience.com/using-jupyter-on-apache-spark-step-by-step-with-a-terabyte-of-reddit-data-ef4d6c13959a>
* <https://spark.apache.org/docs/1.6.1/sql-programming-guide.html>
* <https://stackoverflow.com/questions/45287832/pyspark-approxquantile-function>
* [Holden Karau, Rachel Warren. June 2017. _High Performance Spark_: O'Reilly Media](http://liuchengxu.org/books/src/Spark/High-Performance-Spark.pdf)