# Spark RDD Data Operations

In [None]:
from pyspark.sql import SparkSession
# Reuse the active SparkSession if there is one, otherwise create it.
spark = SparkSession.builder.getOrCreate()
# The SparkContext is the handle used for the RDD API in the cells below.
sc = spark.sparkContext

## 1. Word Count from File

In [None]:
# Load the text file as an RDD (one element per line of text) and show how
# many partitions it was split into.
input_data_rdd = sc.textFile('./big_data_intro.txt')
print(input_data_rdd.getNumPartitions())

In [None]:
# Number of elements in the RDD, i.e. the number of lines read from the file.
print(input_data_rdd.count())

In [None]:
# The whole pipeline as a single expression, kept for reference:
# word_count_rdd = input_data_rdd.flatMap(lambda line: line.split()).map(lambda word: (word, 1)).reduceByKey(lambda a,b:a+b)

# Step 1: break every line into whitespace-separated tokens. flatMap merges
# the per-line token lists into one flat RDD of words.
splitted_word_rdd = input_data_rdd.flatMap(
    lambda text_line: text_line.split()
)
print(splitted_word_rdd.count())
print(splitted_word_rdd.collect())

In [None]:
# Step 2: pair every word with an initial count of 1.
word_map_rdd = splitted_word_rdd.map(
    lambda token: (token, 1)
)
print(word_map_rdd.count())
print(word_map_rdd.collect())

In [None]:
# Step 3: add up the 1s that share the same word to get each word's frequency.
word_count_rdd = word_map_rdd.reduceByKey(
    lambda left, right: left + right
)

# After the reduction there is one (word, count) pair per distinct word.
print(word_count_rdd.count())
print(word_count_rdd.collect())

In [None]:
# Word count over a larger corpus. Put Shakespeare.txt in the notebook's
# folder before running this cell.
# The no-argument split() is used (as in the step-by-step cells above):
# split(" ") would emit empty-string "words" for repeated spaces and blank
# lines, and would not split on tabs, which pollutes the counts.
word_to_count_pair_rdd = (
    sc.textFile('./Shakespeare.txt')
    .flatMap(lambda line: line.split())
    .map(lambda word: (word, 1))
    .reduceByKey(lambda a, b: a + b)
)

print(word_to_count_pair_rdd.take(10))

## 2. Data Operations on Rio Olympics Player Dataset

In [None]:
# Read the athletes CSV into a DataFrame, treating the first line as the
# header, keep an RDD-of-Row view of it for the RDD examples, and preview
# ten rows.
df_olympia = spark.read.csv('./olympia2016_athletes.csv', header=True)
rdd_olympia = df_olympia.rdd
df_olympia.show(10)

In [None]:
# First ten elements of the RDD view of the DataFrame.
rdd_olympia.take(10)

### Transformations

#### .map(...)

The method is applied to each element of the RDD: you can think of this as a transformation of each row.

In [None]:
# Project every row down to its player-name field (column index 1).
rdd_player = rdd_olympia.map(
    lambda record: record[1]
)
rdd_player.take(10)

You can combine more columns.

In [None]:
# Build (player_name, gold_medal) pairs from column indexes 1 and 8; the
# medal field is converted to int so it can be compared and summed later.
rdd_player_gold = rdd_olympia.map(
    lambda record: (record[1], int(record[8]))
)
rdd_player_gold.take(10)

#### .filter(...)

The `.filter(...)` method allows you to select elements of your dataset that fit specified criteria.

In [None]:
# Keep only the rows whose column 3 is 'female' and whose gold-medal count
# (column 8) is greater than zero, then count them.
rdd_filtered = rdd_olympia.filter(
    lambda record: record[3] == 'female' and int(record[8]) > 0
)
rdd_filtered.count()

In [None]:
# Inspect the first ten rows that passed the filter.
rdd_filtered.take(10)

#### .flatMap(...)

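The `.flatMap(...)` method works similarly to `.map(...)`, but it flattens the results: every item returned by the function becomes a separate element of the output RDD instead of being kept together as one list or tuple.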

In [None]:
# Same projection as the (player_name, gold_medal) map example, but flatMap
# unpacks each tuple, so names and medal counts alternate in one flat RDD.
rdd_flat = rdd_olympia.flatMap(
    lambda record: (record[1], int(record[8]))
)
print(rdd_flat.take(10))

#### .distinct()

This method returns a new RDD containing only the distinct elements of the source RDD. To get the distinct values of one column, map to that column first.

In [None]:
# Unique values found in column index 3, collected to the driver as a list.
distinct_gender = (
    rdd_olympia
    .map(lambda record: record[3])
    .distinct()
    .collect()
)
distinct_gender

#### .sample(...)

The `.sample()` method returns a randomized sample from the dataset.

In [None]:
# Sample without replacement, keeping each row with probability `fraction`;
# the fixed seed makes the sample reproducible.
fraction = 0.1
rdd_sample = rdd_olympia.sample(
    withReplacement=False,
    fraction=fraction,
    seed=666,
)

rdd_sample.take(10)

Let's confirm that we got roughly 10% of all the records (the fraction is an expected proportion, so the sample size is not guaranteed to be exactly 10%).

In [None]:
# Compare the full row count with the sample's row count.
print('Original dataset: {}, sample: {}'.format(rdd_olympia.count(), rdd_sample.count()))

#### .leftOuterJoin(...)

Left outer join, just like the SQL world, joins two RDDs based on the values found in both datasets, and returns records from the left RDD with records from the right one appended where the two RDDs match.

In [None]:
# Two small keyed RDDs: keys 'a' and 'b' occur in both, 'c' only in rdd1 and
# 'd' only in rdd2 (note that 'a' appears twice in rdd2).
rdd1 = sc.parallelize([('a', 1), ('b', 4), ('c',10)])
rdd2 = sc.parallelize([('a', 4), ('a', 1), ('b', '6'), ('d', 15)])

# Every key of rdd1 is kept; 'c' has no match in rdd2, so its right-hand
# value is None. 'd' is dropped because it is not in the left RDD.
rdd3 = rdd1.leftOuterJoin(rdd2)
rdd3.take(5)

In [None]:
# (player_name, silver_medal) pairs built from column indexes 1 and 9,
# mirroring rdd_player_gold.
rdd_player_silver = rdd_olympia.map(
    lambda record: (record[1], int(record[9]))
)

# Restrict each pair RDD to the players with a non-zero count.
rdd_player_gold_only = rdd_player_gold.filter(
    lambda name_count: name_count[1] > 0
)
rdd_player_silver_only = rdd_player_silver.filter(
    lambda name_count: name_count[1] > 0
)

# Every gold winner is kept; the silver side is None for players that do not
# appear in rdd_player_silver_only.
rdd_player_gold_silver = rdd_player_gold_only.leftOuterJoin(
    rdd_player_silver_only
)
rdd_player_gold_silver.take(10)

In [None]:
# Inner join: only keys present in both rdd1 and rdd2 are returned.
rdd4 = rdd1.join(rdd2)
rdd4.collect()

If we used `.join(...)` method instead we would have gotten only the values for `'a'` and `'b'` as these two values intersect between these two RDDs.

In [None]:
# Inner join on player name: only players present in both the gold-only and
# the silver-only RDDs are returned.
rdd_player_gold_silver_2 = rdd_player_gold_only.join(rdd_player_silver_only)
rdd_player_gold_silver_2.take(10)

Another useful method is the `.intersection(...)` that returns the records that are *equal* in both RDDs.

In [None]:
# intersection compares whole elements (key AND value), so only ('a', 1),
# which appears in both rdd1 and rdd2, is returned.
rdd5 = rdd1.intersection(rdd2)
rdd5.collect()

In [None]:
# Whole (name, count) pairs must match, so this returns only players whose
# gold count equals their silver count, not every player with both medals.
rdd_player_gold_silver_3 = rdd_player_gold_only.intersection(rdd_player_silver_only)
rdd_player_gold_silver_3.take(10)

#### .repartition(...)

Repartitioning the dataset changes the number of partitions the dataset is divided into.

In [None]:
# Partition count of the RDD before repartitioning.
rdd_olympia.getNumPartitions()

In [None]:
# Redistribute the rows across 4 partitions.
rdd_re = rdd_olympia.repartition(4)

# glom() turns each partition into one list, so the length of the collected
# result is the partition count. It is printed explicitly because a notebook
# cell only displays its last expression; as a bare expression this value was
# computed (collecting the whole dataset) and then silently discarded.
print(len(rdd_re.glom().collect()))
rdd_re.getNumPartitions()

### Actions

#### .take(...)

The method returns the first `n` rows of the RDD. It scans only as many partitions as it needs to collect them, which is often just the first one.

In [None]:
# take(5) is an action: it returns the first five elements to the driver as a
# Python list.
data_first = rdd_olympia.take(5)
data_first

If you want somewhat randomized records you can use `.takeSample(...)` instead.

In [None]:
# Draw exactly five random rows without replacement; the seed fixes the draw.
data_take_sampled = rdd_olympia.takeSample(
    withReplacement=False,
    num=5,
    seed=667,
)
data_take_sampled

#### .reduce(...)

Another action that processes your data, the `.reduce(...)` method *reduces* the elements of an RDD using a specified method.

In [None]:
# Toy example: extract the value of each pair and add them all up.
rdd1 = sc.parallelize([('a', 1), ('b', 4), ('c',10)])
print(
    rdd1
    .map(lambda pair: pair[1])
    .reduce(lambda acc, value: acc + value)
)

# Same pattern on the dataset: sum column 8 (gold medals) over all rows.
(
    rdd_olympia
    .map(lambda record: int(record[8]))
    .reduce(lambda acc, value: acc + value)
)

The `.reduceByKey(...)` method works in a similar way to the `.reduce(...)` method but performs a reduction on a key-by-key basis.

In [None]:
# Sum of gold medals per country: emit a (country, gold_medal) pair for every
# row, then add up the medal counts that share the same country key.
# (This sums medals; it does not count the players who won gold.)
country_medal = rdd_olympia.map(
    lambda record: (record[2], int(record[8]))
)
country_medal.reduceByKey(lambda acc, value: acc + value).take(10)

#### .count()

The `.count()` method counts the number of elements in the RDD.

In [None]:
# Number of rows in the dataset.
rdd_olympia.count()

`.count()` gives the same result as the method below but does not require moving the data to the driver. When the data is large, you should use `.count()` rather than `len(rdd.collect())`, as the latter returns all the data to the driver, which can overload it.

In [None]:
# Anti-pattern, shown only for contrast: collect() ships every record to the
# driver just so that len() can count them. Use rdd_olympia.count() instead.
len(rdd_olympia.collect())

If your dataset is in the form of *key-value* pairs, you can use the `.countByKey()` method to count the number of elements for each distinct key.

In [None]:
# Number of records (players) per country. countByKey() is an action: the
# per-key counts are returned to the driver as a dictionary.
country_medal.countByKey().items()

#### .saveAsTextFile(...)

As the name suggests, the `.saveAsTextFile(...)` method takes the RDD and saves it to text files: each partition to a separate file.

In [None]:
import shutil

# saveAsTextFile writes a *directory* of part files (one per partition) and
# fails if that directory already exists, so remove the output of any
# previous run first to keep this cell re-runnable.
# NOTE(review): assumes these relative paths resolve to the local filesystem
# (Spark local mode) — confirm before using with HDFS or another default FS.
for output_dir in ('./country_medal.txt', './rdd_re.txt'):
    shutil.rmtree(output_dir, ignore_errors=True)

country_medal.saveAsTextFile('./country_medal.txt')
rdd_re.saveAsTextFile('./rdd_re.txt')

In [None]:
# Read the saved output back. Each element is now a plain string (the text
# form of the original pair), not a tuple.
country_medal_reread = sc.textFile('./country_medal.txt')
country_medal_reread.take(10)

In [None]:
# NOTE(review): this cell repeats the previous one verbatim; possibly
# './rdd_re.txt' was meant to be re-read here — confirm.
country_medal_reread = sc.textFile('./country_medal.txt')
country_medal_reread.take(10)

**.foreach(...)**

A method that applies the same function to each element of the RDD, one element at a time, for its side effects; it does not return a result.

In [None]:
# foreach calls f on every element for its side effect and returns nothing.
# The printed output shows up in the terminal that runs Spark, not in the
# notebook cell.
def f(x):
    """Print one RDD element."""
    print(x)

country_medal.foreach(f)

In [None]:
def k(x):
    """Print only the first item (the key) of a pair element."""
    print(x[0])

# Prints the key of every pair in country_medal, one call per element.
country_medal.foreach(k)

## 3. Your Exercise
3.1 Calculate the average player height by country

3.2 Calculate the ratio of medals for each country (total medals by number of players)

3.3 Obtain the countries where female players won more medals 