In [1]:
import pyspark
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import sys
import os

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

conf = SparkConf().setAppName("Spark - RDD par")
sc = SparkContext(conf=conf)
spark = SparkSession.builder.appName("Spark - RDD par").getOrCreate()

In [2]:
import os
def remove_file(file):
    if os.path.exists(file):
        os.remove(file)
remove_file("ign.csv")

In [3]:
import requests
r = requests.get("https://jankiewicz.pl/bigdata/bigdata-sp/ign.csv", allow_redirects=True)
open('ign.csv', 'wb').write(r.content)

2019628

In [4]:
rawIgn = sc.textFile("ign.csv")
rawIgn.count()

18626

In [5]:
rawIgn.first()

',score_phrase,title,url,platform,score,genre,editors_choice,release_year,release_month,release_day'

In [6]:
import re
tabIgn = rawIgn.map(lambda line: re.split(",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)",line))
tabIgn.count()

18626

In [7]:
tabIgn.first()

['',
 'score_phrase',
 'title',
 'url',
 'platform',
 'score',
 'genre',
 'editors_choice',
 'release_year',
 'release_month',
 'release_day']

In [8]:
PLATFORM = 4
SCORE = 5
GENRE = 6
RELEASE_YEAR = 8

In [9]:
gameInfosRdd = tabIgn.filter(lambda tab: len(tab)== 11 and len(tab[0])>0)

In [10]:
gameInfosRdd.first()

['0',
 'Amazing',
 'LittleBigPlanet PS Vita',
 '/games/littlebigplanet-vita/vita-98907',
 'PlayStation Vita',
 '9.0',
 'Platformer',
 'Y',
 '2012',
 '9',
 '12']

In [14]:
# jakiego typu są g1, g2 i g3?
g1 = gameInfosRdd.groupBy(lambda gi: gi[GENRE])
type(g1)

pyspark.rdd.PipelinedRDD

In [15]:
g2 = gameInfosRdd.map(lambda gi: (gi[GENRE],gi))

In [None]:
g3 = gameInfosRdd.keyBy(lambda gi: gi[GENRE]) # genre

In [16]:
# Co wyliczają poniższe wyrażenia?
# Jakiego typu są wyniki?
# Czy wszystkie dają ten sam wynik?
g4 = gameInfosRdd.keyBy(lambda gi: gi[GENRE]).aggregateByKey(0.0, lambda m,gi : m + float(gi[SCORE]), lambda mx,my: mx + my)

In [19]:
g5 = gameInfosRdd.map(lambda gi: (gi[GENRE],gi)).groupByKey().\
mapValues(lambda gis: sum(list(map(lambda x:float(x[SCORE]), gis))))

In [20]:
g6 = gameInfosRdd.map(lambda gi: (gi[GENRE],float(gi[SCORE]))).reduceByKey(lambda mx,my: mx + my)

In [17]:
%%time
g4.first()

CPU times: total: 31.2 ms
Wall time: 2.39 s


('Platformer', 5914.5000000000055)

In [21]:
%%time
g5.first()

CPU times: total: 31.2 ms
Wall time: 2.41 s


('Platformer', 5914.5000000000055)

In [22]:
%%time
g6.first()

CPU times: total: 0 ns
Wall time: 2.3 s


('Platformer', 5914.5000000000055)

In [26]:
g7 = gameInfosRdd.map(lambda gi: (gi[GENRE],gi)).groupByKey().\
mapValues(lambda gis: sum(list(map(lambda x:float(x[SCORE]), gis))))

In [29]:
g8 = gameInfosRdd.keyBy(lambda gi: gi[GENRE]).countByKey()
{k: g8[k] for k in list(g8)[:10]}

{'Platformer': 823,
 'Puzzle': 776,
 'Sports': 1916,
 'Strategy': 1071,
 'Fighting': 547,
 'RPG': 980,
 '': 36,
 '"Action, Adventure"': 765,
 'Adventure': 1175,
 'Action': 3797}

In [49]:
# countByKey jest akcją – punktem końcowym przetwarzania
# Jak wyglądałaby transformacja wyliczająca dokładnie to samo, ale pozostawiająca dane w postaci RDD?

gameInfosRdd.keyBy(lambda gi: gi[GENRE]).groupByKey().mapValues(lambda values: len(values)).take(10)

[('Platformer', 823),
 ('Puzzle', 776),
 ('Strategy', 1071),
 ('', 36),
 ('"Action, Adventure"', 765),
 ('Action', 3797),
 ('Shooter', 1610),
 ('"Strategy, RPG"', 77),
 ('"Action, Compilation"', 89),
 ('"Educational, Puzzle"', 25)]

In [52]:
# Dokończ poniższy fragment kodu tak, aby wyznaczyć liczbę platform objętych recenzjami gier.
gameInfosRdd.keyBy(lambda gi: gi[PLATFORM]).countByKey()

defaultdict(int,
            {'PlayStation Vita': 155,
             'iPad': 99,
             'Xbox 360': 1631,
             'PlayStation 3': 1356,
             'Macintosh': 81,
             'PC': 3370,
             'iPhone': 842,
             'Nintendo DS': 1045,
             'Nintendo 3DS': 225,
             'Android': 39,
             'Wii': 1366,
             'PlayStation 4': 277,
             'Wii U': 114,
             'Linux': 10,
             'PlayStation Portable': 633,
             'PlayStation': 952,
             'Nintendo 64': 302,
             'Saturn': 6,
             'Lynx': 82,
             'Game Boy': 22,
             'Game Boy Color': 356,
             'NeoGeo Pocket Color': 31,
             'Game.Com': 3,
             'Dreamcast': 286,
             'Dreamcast VMU': 1,
             'WonderSwan': 4,
             'Arcade': 11,
             'Nintendo 64DD': 7,
             'PlayStation 2': 1686,
             'WonderSwan Color': 1,
             'Game Boy Advance': 623,
    

In [63]:
# Do tej pory liczyliśmy sumy ocen... Jak wyglądałoby obliczenie średniej oceny
# Podpowiedź: zastosuj metody: mapValues (może nie raz?), reduceByKey.
result = gameInfosRdd.map(lambda gi: (gi[GENRE],gi)).mapValues(lambda x: (float(x[SCORE]), 1)).reduceByKey(lambda x,y: (x[0]+y[0], x[1]+y[1])).mapValues(lambda x: x[0] / x[1]).collect()

In [64]:
sorted(result, key=lambda v: v[1], reverse=True)[:5]

[('"Compilation, Compilation"', 9.5),
 ('Hardware', 9.15),
 ('"Puzzle, RPG"', 9.1),
 ('"Other, Action"', 9.0),
 ('"Adventure, Episodic"', 8.9)]

In [84]:
gameInfosRdd.filter(lambda gi: gi[PLATFORM] == "PlayStation 4").map(lambda x : (x[RELEASE_YEAR], 1)).keyBy(lambda x : x[0]).take(5)

[('2012', ('2012', 1)),
 ('2013', ('2013', 1)),
 ('2013', ('2013', 1)),
 ('2013', ('2013', 1)),
 ('2013', ('2013', 1))]

In [79]:
ps4rdd = gameInfosRdd.filter(lambda gi: gi[PLATFORM] == "PlayStation 4").\
    keyBy(lambda gi: gi[RELEASE_YEAR])
wiiUrdd = gameInfosRdd.filter(lambda gi: gi[PLATFORM] == "Wii U").\
    keyBy(lambda gi: gi[RELEASE_YEAR])
xbOnerdd = gameInfosRdd.filter(lambda gi: gi[PLATFORM] == "Xbox One").\
    keyBy(lambda gi: gi[RELEASE_YEAR])

ps4rdd.first()

('2012',
 ['192',
  'Amazing',
  'Sound Shapes',
  '/games/sound-shapes-queasy-games/ps4-20007461',
  'PlayStation 4',
  '9.0',
  'Platformer',
  'Y',
  '2012',
  '8',
  '8'])

In [67]:
#Jaki tu będzie typ wyniku?
alljoin = ps4rdd.join(wiiUrdd).join(xbOnerdd)
alljoin.first()

('2014',
 ((['17452',
    'Amazing',
    'Tomb Raider: Definitive Edition',
    '/games/tomb-raider-definitive-edition/ps4-20009692',
    'PlayStation 4',
    '9.1',
    'Action',
    'Y',
    '2014',
    '1',
    '25'],
   ['17489',
    'Good',
    'Dr. Luigi',
    '/games/dr-luigi/wii-u-20010245',
    'Wii U',
    '7.5',
    'Puzzle',
    'N',
    '2014',
    '1',
    '10']),
  ['17451',
   'Amazing',
   'Tomb Raider: Definitive Edition',
   '/games/tomb-raider-definitive-edition/xbox-one-20009691',
   'Xbox One',
   '9.1',
   'Action',
   'Y',
   '2014',
   '1',
   '25']))

In [71]:
ps4rdd.mapValues(lambda gi: 1).reduceByKey(lambda x, y: x + y).\
join(wiiUrdd.mapValues(lambda gi: 1).reduceByKey(lambda x, y: x + y)).\
join(xbOnerdd.mapValues(lambda gi: 1).reduceByKey(lambda x, y: x + y)).\
mapValues(lambda x: (x[0][0], x[0][1], x[1])).take(2)

[('2013', (34, 44, 23)), ('2014', (84, 23, 59))]

In [74]:
rdd1 = sc.parallelize([(1, "Alice"), (2, "Bob"), (3, "Charlie"), (1, "David")])
rdd2 = sc.parallelize([(1, 25), (2, 30), (3, 22)])

# Perform join operation
rdd1.join(rdd2).take(3)

[(1, ('Alice', 25)), (1, ('David', 25)), (2, ('Bob', 30))]