## RDD

#### Transformations

https://spark.apache.org/docs/latest/rdd-programming-guide.html#transformations

#### Actions

https://spark.apache.org/docs/latest/rdd-programming-guide.html#actions

In [1]:
from pyspark import RDD
from pyspark.sql import SparkSession

# Obtain the SparkSession for this notebook (application name "Sesion_3",
# master "local[*]") and keep a handle to its SparkContext, which is the
# entry point for the RDD API used in the cells below.
spark = (
    SparkSession.builder
    .appName("Sesion_3")
    .master("local[*]")
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Bare expression: presumably left as the last statement of the notebook cell
# so the session object is echoed as the cell's output.
spark

In [3]:
# Load the Pokémon CSV as an RDD of raw text lines.
pokemon_rdd: RDD[str] = sc.textFile(
    name="../../resources/data/csv/pokemon.csv",
)
# Disabled experiment: emit ten copies of every line, each suffixed with
# "," and an index from 0 to 9.
#   .flatMap(lambda line: [line + "," + str(i) for i in range(0, 10)])

# Print every raw line (action: triggers the actual read).
pokemon_rdd.foreach(print)

In [9]:
import re

# `re` supplies the regular-expression check used to recognise data rows.
#
# Pipeline:
#   1. split every CSV line on commas;
#   2. keep only rows whose first field consists solely of ASCII digits
#      (presumably this is what drops the header line);
#   3. cast the 13 columns: column 0 -> int, columns 1-3 -> str,
#      columns 4-11 -> int, column 12 -> bool (True only for the text "True").
clean_pokemon_rdd = (
    pokemon_rdd
    .map(lambda line: line.split(","))
    .filter(lambda fields: bool(re.match("^[0-9]+$", fields[0])))
    .map(lambda fields: [
        int(fields[0]),
        *map(str, fields[1:4]),
        *map(int, fields[4:12]),
        fields[12] == "True",
    ])
)

# Print every parsed row.
clean_pokemon_rdd.foreach(print)

In [5]:
# How many legendary Pokémon are there?  Three equivalent ways to count them.

# 1) Tag every row as (legendary_flag, 1), keep the flagged pairs,
#    then add up the ones with a plain reduce.
count_legendary = (
    clean_pokemon_rdd
    .map(lambda row: (row[12], 1))
    .filter(lambda pair: pair[0])
    .map(lambda pair: pair[1])
    .reduce(lambda left, right: left + right)
)
print(count_legendary)

# 2) Add up the ones per flag value with reduceByKey, then read the total
#    stored under the True key.
count_legendary = (
    clean_pokemon_rdd
    .map(lambda row: (row[12], 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda pair: pair[0])
    .collect()[0][1]
)
print(count_legendary)

# 3) Same result as above, simplified: filter on the flag and count.
count_legendary = (
    clean_pokemon_rdd
    .filter(lambda row: row[12])
    .count()
)
print(count_legendary)

65
65
65


In [6]:
# Average attack value for each type_1, computed three different ways.

# 1) groupByKey + mapValues: gather the attack values per type, reduce each
#    group to (sum, count), then divide.
(
    clean_pokemon_rdd
    .map(lambda row: (row[2], row[6]))
    .groupByKey()
    .mapValues(list)
    .mapValues(lambda attacks: (sum(attacks), len(attacks)))
    .mapValues(lambda total_and_count: total_and_count[0] / total_and_count[1])
    .foreach(print)
)

# 2) groupByKey + map: same idea, but the average is computed in a single
#    step over the whole (type, attacks) pair.
(
    clean_pokemon_rdd
    .map(lambda row: (row[2], row[6]))
    .groupByKey()
    .mapValues(list)
    .map(lambda grouped: (grouped[0], sum(grouped[1]) / len(grouped[1])))
    .foreach(print)
)

# 3) reduceByKey: carry a running (attack_sum, row_count) pair per type and
#    divide at the end, without materialising the groups.
(
    clean_pokemon_rdd
    .map(lambda row: (row[2], (row[6], 1)))
    .reduceByKey(lambda acc, nxt: (acc[0] + nxt[0], acc[1] + nxt[1]))
    .map(lambda keyed: (keyed[0], keyed[1][0] / keyed[1][1]))
    .foreach(print)
)

In [7]:
# Pokémon with the best Total score for each (Type 1, generation) pair,
# excluding legendaries.
(
    clean_pokemon_rdd
    # Use `not`, never `~`: on a Python bool `~` is integer bitwise inversion
    # (~True == -2, ~False == -1) and both results are truthy, so a `~` filter
    # keeps every row, legendaries included.
    .filter(lambda row: not row[12])
    # Key: (row[2], row[11]); value: (row[1], row[4]), whose second item is the
    # score being compared.
    .map(lambda row: ((row[2], row[11]), (row[1], row[4])))
    # Keep the pair with the strictly larger score; on a tie the second
    # operand wins.
    .reduceByKey(lambda best, candidate: best if best[1] > candidate[1] else candidate)
    .foreach(lambda item: print(item))
)

In [17]:
# Top 3 Pokémon by Total score for each (Type 1, generation) pair,
# excluding legendaries.
(
    clean_pokemon_rdd
    # Use `not`, never `~`: on a Python bool `~` is integer bitwise inversion
    # (~True == -2, ~False == -1) and both results are truthy, so a `~` filter
    # keeps every row, legendaries included.
    .filter(lambda row: not row[12])
    # Key: (row[2], row[11]); value: (row[1], row[4]), whose second item is the
    # score used for ranking.
    .map(lambda row: ((row[2], row[11]), (row[1], row[4])))
    .groupByKey()
    # Sort each group by score, highest first, and keep at most three entries.
    .mapValues(lambda entries: sorted(entries, key=lambda entry: entry[1], reverse=True)[:3])
    # Flatten back to one (key, entry) record per retained Pokémon.
    .flatMapValues(lambda top_entries: top_entries)
    .foreach(lambda item: print(item))
)