In [2]:
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt

In [9]:
def loadMovieNames(path="ml-100k/u.item"):
    """Build a movie ID -> title lookup from the MovieLens item file.

    Each non-blank line is split on '|'; the first field is parsed as the
    integer movie ID and the second field is used as the title.

    Args:
        path: Location of the pipe-delimited item file. The default uses the
            lower-case file name so the lookup also works on case-sensitive
            filesystems.

    Returns:
        dict mapping int movie ID to its title string.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a non-blank line does not start with an integer ID.
        IndexError: if a non-blank line has no '|'-separated title field.
    """
    movieNames = {}
    # latin-1 can decode any byte sequence, so accented titles never raise.
    with open(path, encoding='latin-1') as f:
        for line in f:
            # Tolerate blank / trailing empty lines instead of crashing in int().
            if not line.strip():
                continue
            fields = line.split('|')
            movieNames[int(fields[0])] = fields[1]
    return movieNames

In [27]:
def makePairs(x):
    """Re-key one self-joined record by its movie pair.

    x[1][0] and x[1][1] are each unpacked as a (movie, rating) 2-tuple;
    x[0] is not used.

    Returns ((movie1, movie2), (rating1, rating2)).
    """
    first, second = x[1][0], x[1][1]
    movieA, ratingA = first
    movieB, ratingB = second
    movies = (movieA, movieB)
    scores = (ratingA, ratingB)
    return (movies, scores)

In [25]:
def filterDuplicates(x):
    """Tell whether a self-joined record should be kept.

    x[1][0] and x[1][1] are each unpacked as a (movie, rating) 2-tuple.
    Returns True only when the first movie is strictly less than the
    second, which rejects self-pairs and one of the two orderings of
    every pair.
    """
    left, right = x[1][0], x[1][1]
    movieA, _ratingA = left
    movieB, _ratingB = right
    return movieA < movieB

In [32]:
def computeCosineSimilarity(ratingPairs):
    """Cosine similarity over an iterable of (ratingX, ratingY) pairs.

    The iterable is consumed in a single pass. The score is the dot product
    divided by the product of the two vector magnitudes; when that product
    is zero (including empty input) the score is the integer 0.

    Returns (score, numPairs), where numPairs is the number of pairs seen.
    """
    count = 0
    dot = 0
    normSqX = 0
    normSqY = 0
    for ratingX, ratingY in ratingPairs:
        count += 1
        dot += ratingX * ratingY
        normSqX += ratingX * ratingX
        normSqY += ratingY * ratingY

    magnitude = sqrt(normSqX) * sqrt(normSqY)
    if not magnitude:
        # Nothing to divide by: report a zero score rather than raising.
        return (0, count)

    return (dot / float(magnitude), count)

In [3]:
# Run Spark locally on every available core ("local[*]").
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
# getOrCreate() reuses a context that is already running in this kernel, so
# re-executing the cell does not fail with
# "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf = conf)

In [10]:
# Build the movie ID -> title lookup on the driver; it is used later to
# print human-readable titles for the result rows.
print("\nLoading movie names...")
nameDict = loadMovieNames()


Loading movie names...


In [13]:
# Spot-check the lookup: an integer movie ID should return its title string.
nameDict[4]

'Get Shorty (1995)'

In [18]:
# Load the raw ratings file as an RDD with one string per line.
data = sc.textFile("ml-100k/u.data")

In [19]:
# Peek at the raw lines. Note that top(5) returns the five largest elements in
# descending order (string comparison here), not the first five lines of the file.
data.top(5)

['99\t98\t5\t885679596',
 '99\t978\t3\t885679382',
 '99\t975\t3\t885679472',
 '99\t963\t3\t885679998',
 '99\t931\t2\t886780147']

In [20]:
# Confirm what kind of object textFile() handed back.
type(data)

pyspark.rdd.RDD

In [21]:
# Map ratings to key / value pairs: user ID => (movie ID, rating)
def _parseRating(line):
    """Split one raw line on whitespace into (userID, (movieID, rating))."""
    fields = line.split()
    userID = int(fields[0])
    movieID_ = int(fields[1])
    rating = float(fields[2])
    return (userID, (movieID_, rating))

ratings = data.map(_parseRating)

In [22]:
# Peek at the parsed (userID, (movieID, rating)) records. top() orders by the
# tuples themselves, so the highest user IDs are shown.
ratings.top(5)

[(943, (1330, 3.0)),
 (943, (1228, 3.0)),
 (943, (1188, 3.0)),
 (943, (1074, 4.0)),
 (943, (1067, 2.0))]

In [23]:
# Emit every movie rated together by the same user.
# Self-join on user ID to find every combination: each of a user's ratings is
# paired with each of that same user's ratings.
joinedRatings = ratings.join(ratings)

In [24]:
# At this point our RDD consists of userID => ((movieID, rating), (movieID, rating)).
# The self-join keeps self-pairs (a movie joined with itself) and both orderings
# of every pair; the filter step below removes them.
joinedRatings.top(5)

[(943, ((1330, 3.0), (1330, 3.0))),
 (943, ((1330, 3.0), (1228, 3.0))),
 (943, ((1330, 3.0), (1188, 3.0))),
 (943, ((1330, 3.0), (1074, 4.0))),
 (943, ((1330, 3.0), (1067, 2.0)))]

In [26]:
# Filter out duplicate pairs: keep only records where movie1 < movie2, which
# drops self-pairs and the mirrored copy of each pair.
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)

In [28]:
# Now key by (movie1, movie2) pairs; the user ID is dropped at this point.
moviePairs = uniqueJoinedRatings.map(makePairs)
# We now have (movie1, movie2) => (rating1, rating2)

In [29]:
# Check the object type produced by the chained filter/map transformations.
type(moviePairs)

pyspark.rdd.PipelinedRDD

In [31]:
# Peek at the re-keyed records. top() compares the (movie1, movie2) key first,
# so the pairs with the largest movie IDs are shown.
moviePairs.top(5)

[((1679, 1680), (3.0, 2.0)),
 ((1678, 1680), (1.0, 2.0)),
 ((1678, 1679), (1.0, 3.0)),
 ((1675, 1676), (3.0, 2.0)),
 ((1672, 1681), (2.0, 3.0))]

In [30]:
# Now collect all ratings for each movie pair and compute similarity.
# groupByKey gathers every (rating1, rating2) tuple that shares a
# (movie1, movie2) key into one iterable per pair.
moviePairRatings = moviePairs.groupByKey()

In [33]:
# We now have (movie1, movie2) => (rating1, rating2), (rating1, rating2) ...
# Can now compute similarities: each value becomes (score, numPairs).
# cache() because the result is reused by several later cells.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).cache()

In [34]:
# Peek at the scored pairs (ordered by key, so the largest movie IDs appear).
# A pair backed by a single nonzero co-rating always scores 1.0, which is why
# a co-occurrence threshold is applied before ranking.
moviePairSimilarities.top(5)

[((1679, 1680), (1.0, 1)),
 ((1678, 1680), (1.0, 1)),
 ((1678, 1679), (1.0, 1)),
 ((1675, 1676), (1.0, 1)),
 ((1672, 1681), (1.0, 1))]

In [35]:
# Save the results if desired. sortByKey() returns a new RDD rather than
# sorting in place, so chain the save onto its result:
#moviePairSimilarities.sortByKey().saveAsTextFile("movie-sims")

In [38]:
# Inspect the interpreter arguments. Inside a Jupyter kernel these belong to
# the kernel launcher, so argv[1] is the '-f' flag rather than a movie ID.
sys.argv

['/usr/local/Cellar/python3/3.6.4_2/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/ipykernel_launcher.py',
 '-f',
 '/Users/paolo/Library/Jupyter/runtime/kernel-d55aa5e4-00ab-4be0-b3fd-0d127317e17e.json']

In [45]:
defaultId = 50
# Pick the movie we care about: use the first command-line argument when it is
# an integer, otherwise fall back to defaultId.
movieID = defaultId
if len(sys.argv) > 1:
    try:
        movieID = int(sys.argv[1])
    except ValueError:
        # Not a number (under Jupyter argv[1] is the kernel's '-f' flag).
        # Only ValueError is caught so that KeyboardInterrupt and other
        # unrelated errors are no longer silently swallowed.
        movieID = defaultId

In [46]:
# Quality cut-offs: minimum cosine score and minimum number of co-ratings.
scoreThreshold = 0.97
coOccurenceThreshold = 50


def _isGoodMatch(entry):
    """Keep ((movie1, movie2), (score, numPairs)) entries that involve
    movieID and exceed both quality thresholds."""
    pair, similarity = entry[0], entry[1]
    if not (pair[0] == movieID or pair[1] == movieID):
        return False
    if not similarity[0] > scoreThreshold:
        return False
    return similarity[1] > coOccurenceThreshold


# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(_isGoodMatch)

In [47]:
# Sort by quality score.
results = filteredResults.map(lambda x: (x[1],x[0])).sortByKey(ascending = False).take(10)

print("Top 10 similar movies for " + nameDict[movieID])
for result in results:
        (sim, pair) = result
        # Display the similarity result that isn't the movie we're looking at
        similarMovieID = pair[0]
        if (similarMovieID == movieID):
            similarMovieID = pair[1]
        print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))

Top 10 similar movies for Star Wars (1977)
Empire Strikes Back, The (1980)	score: 0.9895522078385338	strength: 345
Return of the Jedi (1983)	score: 0.9857230861253026	strength: 480
Raiders of the Lost Ark (1981)	score: 0.981760098872619	strength: 380
20,000 Leagues Under the Sea (1954)	score: 0.9789385605497993	strength: 68
12 Angry Men (1957)	score: 0.9776576120448436	strength: 109
Close Shave, A (1995)	score: 0.9775948291054827	strength: 92
African Queen, The (1951)	score: 0.9764692222674887	strength: 138
Sting, The (1973)	score: 0.9751512937740359	strength: 204
Wrong Trousers, The (1993)	score: 0.9748681355460885	strength: 103
Wallace & Gromit: The Best of Aardman Animation (1996)	score: 0.9741816128302572	strength: 58
