In [0]:
from pyspark import SparkConf, SparkContext
# Build a configuration whose application name is 'average', then obtain the
# SparkContext for this notebook from it.
conf = SparkConf()
conf.setAppName('average')
sc = SparkContext.getOrCreate(conf=conf)

In [0]:
# Read the review file as an RDD of text lines; each line is 'title,rating'.
rdd = sc.textFile('/FileStore/tables/movie_review.csv')
# Display every raw line (the file is small, so collecting all of it is fine).
rdd.collect()

Out[2]: ['The Shawshank Redemption,3',
 'The Matrix,5',
 '12 Angry Men,3',
 '12 Angry Men,4',
 'The Matrix,5',
 'Pulp Fiction,4',
 'The Godfather,5',
 'The Shawshank Redemption,2',
 'Pulp Fiction,4',
 'The Godfather,5',
 '12 Angry Men,2',
 'The Godfather,3',
 'Pulp Fiction,4',
 '12 Angry Men,1',
 'The Shawshank Redemption,2',
 '12 Angry Men,1',
 'The Shawshank Redemption,5',
 'Pulp Fiction,5',
 'Pulp Fiction,2',
 'The Matrix,4']

In [0]:
# Break each line of the data down into a (title, rating) key-value pair.
def getkeyvaluepair(text):
    """Parse one ``'title,rating'`` line into a ``(title, rating)`` pair.

    The line is split on its LAST comma only, so a title that itself contains
    commas (e.g. ``'Monsters, Inc.,4'``) is kept intact instead of failing to
    unpack.

    Args:
        text: A single line of the review file.

    Returns:
        A ``(title, rating)`` tuple where ``rating`` is an ``int``.

    Raises:
        ValueError: If the line has no comma or the rating is not an integer.
    """
    # rsplit with maxsplit=1 always yields at most two fields: title and rating.
    title, rating = text.rsplit(',', 1)
    return (title, int(rating))
# Parse every raw line into a (title, rating) pair.
rdd2 = rdd.map(getkeyvaluepair)

In [0]:
# Display the parsed (title, rating) pairs.
rdd2.collect()

Out[9]: [('The Shawshank Redemption', 3),
 ('The Matrix', 5),
 ('12 Angry Men', 3),
 ('12 Angry Men', 4),
 ('The Matrix', 5),
 ('Pulp Fiction', 4),
 ('The Godfather', 5),
 ('The Shawshank Redemption', 2),
 ('Pulp Fiction', 4),
 ('The Godfather', 5),
 ('12 Angry Men', 2),
 ('The Godfather', 3),
 ('Pulp Fiction', 4),
 ('12 Angry Men', 1),
 ('The Shawshank Redemption', 2),
 ('12 Angry Men', 1),
 ('The Shawshank Redemption', 5),
 ('Pulp Fiction', 5),
 ('Pulp Fiction', 2),
 ('The Matrix', 4)]

In [0]:
# To compute the average we need the number of ratings for each key,
# so we pair every rating with a 1 and later sum those 1s to get the count.

def getkeyvaluepair(text):
    """Parse one ``'title,rating'`` line into ``(title, (rating, 1))``.

    The trailing ``1`` is a per-record counter: summing the tuples
    element-wise per title yields ``(rating_sum, rating_count)``.

    The line is split on its LAST comma only, so a title that itself contains
    commas (e.g. ``'Monsters, Inc.,4'``) is kept intact instead of failing to
    unpack.

    Args:
        text: A single line of the review file.

    Returns:
        A ``(title, (rating, 1))`` tuple where ``rating`` is an ``int``.

    Raises:
        ValueError: If the line has no comma or the rating is not an integer.
    """
    # rsplit with maxsplit=1 always yields at most two fields: title and rating.
    title, rating = text.rsplit(',', 1)
    return (title, (int(rating), 1))
# Rebuild rdd2 so each record is (title, (rating, 1)), then display it.
rdd2 = rdd.map(getkeyvaluepair)
rdd2.collect()

Out[10]: [('The Shawshank Redemption', (3, 1)),
 ('The Matrix', (5, 1)),
 ('12 Angry Men', (3, 1)),
 ('12 Angry Men', (4, 1)),
 ('The Matrix', (5, 1)),
 ('Pulp Fiction', (4, 1)),
 ('The Godfather', (5, 1)),
 ('The Shawshank Redemption', (2, 1)),
 ('Pulp Fiction', (4, 1)),
 ('The Godfather', (5, 1)),
 ('12 Angry Men', (2, 1)),
 ('The Godfather', (3, 1)),
 ('Pulp Fiction', (4, 1)),
 ('12 Angry Men', (1, 1)),
 ('The Shawshank Redemption', (2, 1)),
 ('12 Angry Men', (1, 1)),
 ('The Shawshank Redemption', (5, 1)),
 ('Pulp Fiction', (5, 1)),
 ('Pulp Fiction', (2, 1)),
 ('The Matrix', (4, 1))]

In [0]:
# Inspect the data grouped by title: every title maps to the list of its
# (rating, 1) pairs.
rdd2.groupByKey().mapValues(list).collect()

Out[11]: [('The Shawshank Redemption', [(3, 1), (2, 1), (2, 1), (5, 1)]),
 ('12 Angry Men', [(3, 1), (4, 1), (2, 1), (1, 1), (1, 1)]),
 ('The Godfather', [(5, 1), (5, 1), (3, 1)]),
 ('The Matrix', [(5, 1), (5, 1), (4, 1)]),
 ('Pulp Fiction', [(4, 1), (4, 1), (4, 1), (5, 1), (2, 1)])]

In [0]:
# Add up the ratings and the counters per title, producing
# (title, (rating_sum, rating_count)); the average is derived from this next.
rdd3 = rdd2.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
rdd3.collect()


Out[13]: [('The Shawshank Redemption', (12, 4)),
 ('12 Angry Men', (11, 5)),
 ('The Godfather', (13, 3)),
 ('The Matrix', (14, 3)),
 ('Pulp Fiction', (19, 5))]

In [0]:
# Divide each title's rating sum by its rating count to get the average.
# Only the value changes, so mapValues is used: the key is passed through
# untouched and the existing partitioning is retained.
rdd4 = rdd3.mapValues(lambda sum_count: sum_count[0] / sum_count[1])
rdd4.collect()

Out[19]: [('The Shawshank Redemption', 3.0),
 ('12 Angry Men', 2.2),
 ('The Godfather', 4.333333333333333),
 ('The Matrix', 4.666666666666667),
 ('Pulp Fiction', 3.8)]

In [0]:
# Alternate method: start again from the raw lines of the same review file.
rdd = sc.textFile('/FileStore/tables/movie_review.csv')
# Display the raw 'title,rating' lines.
rdd.collect()


Out[20]: ['The Shawshank Redemption,3',
 'The Matrix,5',
 '12 Angry Men,3',
 '12 Angry Men,4',
 'The Matrix,5',
 'Pulp Fiction,4',
 'The Godfather,5',
 'The Shawshank Redemption,2',
 'Pulp Fiction,4',
 'The Godfather,5',
 '12 Angry Men,2',
 'The Godfather,3',
 'Pulp Fiction,4',
 '12 Angry Men,1',
 'The Shawshank Redemption,2',
 '12 Angry Men,1',
 'The Shawshank Redemption,5',
 'Pulp Fiction,5',
 'Pulp Fiction,2',
 'The Matrix,4']

In [0]:
# Break each line of the data down into a (title, rating) key-value pair.
def getkeyvaluepair(text):
    """Parse one ``'title,rating'`` line into a ``(title, rating)`` pair.

    The line is split on its LAST comma only, so a title that itself contains
    commas (e.g. ``'Monsters, Inc.,4'``) is kept intact instead of failing to
    unpack.

    Args:
        text: A single line of the review file.

    Returns:
        A ``(title, rating)`` tuple where ``rating`` is an ``int``.

    Raises:
        ValueError: If the line has no comma or the rating is not an integer.
    """
    # rsplit with maxsplit=1 always yields at most two fields: title and rating.
    title, rating = text.rsplit(',', 1)
    return (title, int(rating))
# Parse every raw line into a (title, rating) pair and display the result.
rdd2 = rdd.map(getkeyvaluepair)
rdd2.collect()

Out[24]: [('The Shawshank Redemption', 3),
 ('The Matrix', 5),
 ('12 Angry Men', 3),
 ('12 Angry Men', 4),
 ('The Matrix', 5),
 ('Pulp Fiction', 4),
 ('The Godfather', 5),
 ('The Shawshank Redemption', 2),
 ('Pulp Fiction', 4),
 ('The Godfather', 5),
 ('12 Angry Men', 2),
 ('The Godfather', 3),
 ('Pulp Fiction', 4),
 ('12 Angry Men', 1),
 ('The Shawshank Redemption', 2),
 ('12 Angry Men', 1),
 ('The Shawshank Redemption', 5),
 ('Pulp Fiction', 5),
 ('Pulp Fiction', 2),
 ('The Matrix', 4)]

In [0]:
# Gather all ratings of each title into one list: (title, [rating, ...]).
rdd3 = rdd2.groupByKey().mapValues(list)

In [0]:
# Display the per-title rating lists.
rdd3.collect()
    

Out[31]: [('The Shawshank Redemption', [3, 2, 2, 5]),
 ('12 Angry Men', [3, 4, 2, 1, 1]),
 ('The Godfather', [5, 5, 3]),
 ('The Matrix', [5, 5, 4]),
 ('Pulp Fiction', [4, 4, 4, 5, 2])]

In [0]:
def getavg(arr):
    """Return ``(key, mean)`` for a ``(key, values)`` pair.

    ``arr[0]`` is passed through unchanged. ``arr[1]`` must support ``sum()``
    and ``len()`` and must not be empty, otherwise the division raises
    ``ZeroDivisionError``.
    """
    key = arr[0]
    values = arr[1]
    total = sum(values)
    count = len(values)
    return (key, total / count)

# Average each title's list of ratings and display (title, average) pairs.
rdd3.map(getavg).collect()

Out[34]: [('The Shawshank Redemption', 3.0),
 ('12 Angry Men', 2.2),
 ('The Godfather', 4.333333333333333),
 ('The Matrix', 4.666666666666667),
 ('Pulp Fiction', 3.8)]