# Demo Spark Analysis

In [1]:
# import modules
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
import collections

### RDD Approach

In [2]:
# create spark context
conf = SparkConf() \
    .setMaster('local') \
    .setAppName('RatingsHistogram')

sc = SparkContext(conf=conf)

24/10/17 14:43:43 WARN Utils: Your hostname, Nicks-MacBook-Pro-2.local resolves to a loopback address: 127.0.0.1; using 192.168.0.73 instead (on interface en0)
24/10/17 14:43:43 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/10/17 14:43:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# load data
# schema: userId, movieId, ratingValue, timeStamp
lines = sc.textFile('../data/ml-100k/u.data')

In [4]:
# split line on white space and return index 2 (ratingValue) using lambda
ratings = lines.map(lambda line: line.split()[2])
ratings.collect()

                                                                                

['3',
 '3',
 '1',
 '2',
 '1',
 '4',
 '2',
 '5',
 '3',
 '3',
 '2',
 '5',
 '5',
 '3',
 '3',
 '3',
 '5',
 '2',
 '4',
 '2',
 '4',
 '4',
 '4',
 '2',
 '4',
 '2',
 '5',
 '2',
 '4',
 '5',
 '3',
 '4',
 '4',
 '4',
 '3',
 '4',
 '1',
 '5',
 '1',
 '4',
 '4',
 '4',
 '2',
 '3',
 '5',
 '5',
 '5',
 '5',
 '5',
 '3',
 '4',
 '2',
 '4',
 '5',
 '5',
 '2',
 '4',
 '5',
 '5',
 '4',
 '3',
 '1',
 '4',
 '3',
 '4',
 '5',
 '1',
 '5',
 '5',
 '4',
 '4',
 '3',
 '1',
 '3',
 '4',
 '2',
 '4',
 '3',
 '3',
 '5',
 '5',
 '3',
 '5',
 '3',
 '5',
 '4',
 '5',
 '3',
 '4',
 '4',
 '4',
 '4',
 '4',
 '5',
 '2',
 '5',
 '4',
 '3',
 '4',
 '4',
 '3',
 '5',
 '4',
 '3',
 '4',
 '5',
 '4',
 '5',
 '5',
 '4',
 '3',
 '5',
 '5',
 '4',
 '4',
 '4',
 '4',
 '3',
 '3',
 '4',
 '4',
 '3',
 '4',
 '5',
 '5',
 '2',
 '5',
 '5',
 '3',
 '3',
 '3',
 '3',
 '4',
 '3',
 '3',
 '3',
 '5',
 '5',
 '4',
 '4',
 '1',
 '4',
 '2',
 '2',
 '2',
 '4',
 '2',
 '5',
 '2',
 '2',
 '4',
 '3',
 '3',
 '4',
 '3',
 '4',
 '4',
 '3',
 '4',
 '4',
 '3',
 '4',
 '5',
 '3',
 '5',
 '2',
 '3'

In [5]:
# count records per rating
result = ratings.countByValue()

# result is collected as key: value pair
# rating: count
result

defaultdict(int, {'3': 27145, '1': 6110, '2': 11370, '4': 34174, '5': 21201})

In [6]:
# sort results
sortedResults = collections.OrderedDict(sorted(result.items()))
sortedResults

OrderedDict([('1', 6110),
             ('2', 11370),
             ('3', 27145),
             ('4', 34174),
             ('5', 21201)])

In [7]:
# iterate through dict and display results
for key, value in sortedResults.items():
    # %s string, %i int
    print("%s %i" % (key, value))

1 6110
2 11370
3 27145
4 34174
5 21201


### DataFrame Approach

In [8]:
# create spark session
session = SparkSession.builder \
    .appName("RatingsHistogram") \
    .getOrCreate()

In [9]:
# load data
# schema: userId, movieId, ratingValue, timeStamp
lines = sc.textFile('../data/ml-100k/u.data')

In [10]:
# create rdd from lines
lines_rdd = lines.map(lambda line: line.split())
type(lines_rdd)

pyspark.rdd.PipelinedRDD

In [11]:
# create scehma and convert rdd to data frame
schema = ['userId', 'movieId', 'ratingValue', 'timeStamp']
df = session.createDataFrame(lines_rdd, schema)
df.show()

24/10/17 14:43:58 WARN PythonRunner: Detected deadlock while completing task 0.0 in stage 3 (TID 3): Attempting to kill Python Worker
                                                                                

+------+-------+-----------+---------+
|userId|movieId|ratingValue|timeStamp|
+------+-------+-----------+---------+
|   196|    242|          3|881250949|
|   186|    302|          3|891717742|
|    22|    377|          1|878887116|
|   244|     51|          2|880606923|
|   166|    346|          1|886397596|
|   298|    474|          4|884182806|
|   115|    265|          2|881171488|
|   253|    465|          5|891628467|
|   305|    451|          3|886324817|
|     6|     86|          3|883603013|
|    62|    257|          2|879372434|
|   286|   1014|          5|879781125|
|   200|    222|          5|876042340|
|   210|     40|          3|891035994|
|   224|     29|          3|888104457|
|   303|    785|          3|879485318|
|   122|    387|          5|879270459|
|   194|    274|          2|879539794|
|   291|   1042|          4|874834944|
|   234|   1184|          2|892079237|
+------+-------+-----------+---------+
only showing top 20 rows



In [12]:
# return ratings and counts
result = df \
    .groupBy('ratingValue') \
    .count() \
    .sort('ratingValue', ascending=True)

result.show()

[Stage 4:>                                                          (0 + 1) / 1]

+-----------+-----+
|ratingValue|count|
+-----------+-----+
|          1| 6110|
|          2|11370|
|          3|27145|
|          4|34174|
|          5|21201|
+-----------+-----+



                                                                                