In [1]:
import urllib.request as urllib
# u.data -- The full u data set, 100000 ratings by 943 users on 1682 item. 
          # Each user has rated at least 20 movies.  Users and items are numbered consecutively from 1. 
          # The data is randomly ordered. This is a tab separated list of user id | item id | rating | timestamp 
urllib.urlretrieve ("http://files.grouplens.org/datasets/movielens/ml-100k/u.data", "u.data")
# u.item     -- Information about the items (movies); this is a tab separated list of
              # movie id | movie title | release date | video release date | IMDb URL |
              # unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary |# Drama | Fantasy |Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |Thriller | War | Western |
              # The last 19 fields are the genres, a 1 indicates the movie is of that genre, a 0 indicates it is not
              # The movie ids are the ones used in the u.data data set
urllib.urlretrieve ("http://files.grouplens.org/datasets/movielens/ml-100k/u.item", "u.item")

('u.item', <http.client.HTTPMessage at 0x7f33ba8e8e48>)

In [2]:
from pyspark import SparkContext
sc = SparkContext()

### The aim of this exercise is to recommend movies to the users.The exercise is divided into three parts. 
#### In the first part , you will preprocess the data, transform it into a meaningful format and use mathematical calculations to recommend. 
#### In the second part, we will use Machine learning methods to recommend on a much more efficient way.
#### In the third part, you will recommend movies for yourself based on the ratings you supply manually

#### 1 A. Create the ratings and movies RDDs

In [4]:
ratings = sc.textFile('u.data', 20)
movies = sc.textFile('u.item', 20)

#user id | item id | rating | timestamp
print(ratings.take(1))

# movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary |# Drama | Fantasy |Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |Thriller | War | Western |
print(movies.take(1))

['196\t242\t3\t881250949']
['1|Toy Story (1995)|01-Jan-1995||http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)|0|0|0|1|1|1|0|0|0|0|0|0|0|0|0|0|0|0|0']


#### 1 B. Feature Extraction: Extracting the relevant features for our problem

In [30]:
# Write down the code for parsing the ratings of the above generated RDD called ratings
def ratings_parse(x):
    """
    Returns: (user_id, movie_id(item_id), rating)
    """
    
    items = x.split('\t')
    return int(items[0]), int(items[1]), float(items[2])


def movies_parse(x):
    """
    Returns: (movie_id, movie_title)
    """
    items = x.split('|')
    return int(items[0]), items[1]
    


# cache tallentaa muistiin
ratingsRDD = ratings.map(ratings_parse).cache()
print(ratingsRDD.take(5))
print(ratingsRDD.count())

moviesRDD = movies.map(movies_parse).cache()
print(moviesRDD.take(5))
print(moviesRDD.count())

[(196, 242, 3.0), (186, 302, 3.0), (22, 377, 1.0), (244, 51, 2.0), (166, 346, 1.0)]
100000
[(1, 'Toy Story (1995)'), (2, 'GoldenEye (1995)'), (3, 'Four Rooms (1995)'), (4, 'Get Shorty (1995)'), (5, 'Copycat (1995)')]
1682


#### 1 C. First, we will try to recommend movies to the general public and the very basic way is to show all the movies which have high average ratings. We have to display the name, number of ratings, and the average rating of atleast 20 movies with the highest average rating. We should also filter our records based on a specific review threshold i.e. we need only select movies which have total number of reviews above a certain threshold value. 

In [13]:
# You need to implement a helper function which can help in the desired mathematical calculations
def getCountsAndAverages(movieIDandRatingsItem):
    """ Calculate average rating of a movie
    Args:
        movieIDandRatingsItem: (movie_id, (rating1, rating2, ...))
    Returns:
        (movie_id, (total number of ratings, averageRating))
    """
    
    return (movieIDandRatingsItem[0], (len(movieIDandRatingsItem[1]), sum(movieIDandRatingsItem[1]) / len(movieIDandRatingsItem[1])))

#### -> 1 D. Bring all the reviews for a movie together and then using the above helper function calculate the total count of ratings and average rating

In [14]:
# Map the ratingsRDD in such a way that it contains only (movie_id, rating)
# Then bring all the ratings for a particular movie_id together
ratingsRDD_without_userid = ratingsRDD.map(lambda x: (x[1], x[2]))
movieIDsWithRatingsRDD = ratingsRDD_without_userid.groupByKey()
print(movieIDsWithRatingsRDD.mapValues(list).take(1))  # [(movie_id, [rating1, rating2 ....])]


# Use the helper function getCountsAndAverages to get the total number of ratings for a particular movie and the average of of them
movieIDsWithAvgRatingsRDD = movieIDsWithRatingsRDD.map(getCountsAndAverages)
print(movieIDsWithAvgRatingsRDD.take(5))

[(640, [4.0, 2.0, 3.0, 5.0, 3.0, 5.0, 5.0, 5.0, 4.0, 3.0, 5.0, 2.0, 1.0, 3.0, 3.0, 4.0, 2.0, 4.0, 5.0, 4.0, 4.0, 4.0, 1.0, 5.0, 3.0, 4.0, 5.0, 1.0, 3.0, 1.0, 2.0, 2.0, 2.0, 5.0, 2.0, 1.0, 5.0, 4.0, 4.0, 3.0, 2.0, 3.0, 4.0, 1.0, 3.0, 3.0, 4.0, 3.0, 1.0, 2.0, 3.0, 1.0, 1.0, 3.0, 4.0, 3.0, 4.0, 5.0, 1.0, 4.0, 2.0, 2.0, 3.0, 4.0, 3.0, 3.0, 1.0, 2.0, 2.0, 1.0, 1.0, 5.0, 3.0, 4.0, 1.0, 2.0, 3.0, 5.0, 4.0, 4.0, 3.0, 2.0])]
[(640, (82, 3.024390243902439)), (960, (25, 3.24)), (1600, (4, 3.75)), (900, (42, 3.6904761904761907)), (1040, (25, 2.6))]


####  -> 1 E. Attach the name of the movie in the movieIDsWithAvgRatings RDD using moviesRDD which contains the movie name

In [15]:
# Attach the name from the moviesRDD to moviesIDsWithAvgRatingsRDD first
movieNameWithAvgRatingsRDD = moviesRDD.join(movieIDsWithAvgRatingsRDD)
print(movieNameWithAvgRatingsRDD.take(1))  # (movie_id, (movie name, (total_ratings, avg_rating)))

# Transform the RDD into this form -> (average rating, movie name, number of ratings)
movieNameWithAvgRatingsRDD = movieNameWithAvgRatingsRDD.map(lambda x: (x[1][1][1], x[1][0], x[1][1][0]))
print(movieNameWithAvgRatingsRDD.take(1))

[(320, ('Paradise Lost: The Child Murders at Robin Hood Hills (1996)', (20, 4.05)))]
[(4.05, 'Paradise Lost: The Child Murders at Robin Hood Hills (1996)', 20)]


#### -> 1 F. Selecting only those movies who have more than 200 reviews to appeal to a broader audience

In [16]:
# First select only those records where the total number of reviews are greater than the threshold which is 200
# And then sort the final results by avg ratings in descending orders so that the highest avg rating is on the top
# Then show first 20 records
movieLimitedAndSortedByRatingRDD = movieNameWithAvgRatingsRDD.filter(lambda x:  x[2] > 200)
movieLimitedAndSortedByRatingRDD = movieLimitedAndSortedByRatingRDD.sortBy(lambda x: -x[0])
print(movieLimitedAndSortedByRatingRDD.take(20))  # Top 20 Movies for general public

[(4.466442953020135, "Schindler's List (1993)", 298), (4.45679012345679, 'Casablanca (1942)', 243), (4.445229681978798, 'Shawshank Redemption, The (1994)', 283), (4.3875598086124405, 'Rear Window (1954)', 209), (4.385767790262173, 'Usual Suspects, The (1995)', 267), (4.3584905660377355, 'Star Wars (1977)', 583), (4.292237442922374, 'To Kill a Mockingbird (1962)', 219), (4.291666666666667, "One Flew Over the Cuckoo's Nest (1975)", 264), (4.28974358974359, 'Silence of the Lambs, The (1991)', 390), (4.283292978208232, 'Godfather, The (1972)', 413), (4.252380952380952, 'Raiders of the Lost Ark (1981)', 420), (4.2457142857142856, 'Titanic (1997)', 350), (4.204359673024523, 'Empire Strikes Back, The (1980)', 367), (4.203980099502488, 'Boot, Das (1981)', 201), (4.186602870813397, 'Godfather: Part II, The (1974)', 209), (4.172839506172839, 'Princess Bride, The (1987)', 324), (4.163043478260869, 'Amadeus (1984)', 276), (4.161616161616162, 'L.A. Confidential (1997)', 297), (4.155511811023622, 'F

### Option 2: Now with a more advanced approach we can do tackle the same problem in a more efficient way with one of the Machine learning techniques known as Collaborative filtering. Benefits?

In [6]:
# Use the ALS algorithm for performing Collaborative filtering
from pyspark.mllib.recommendation import ALS, Rating
# Divide the dataset into three parts as exlained in the slides
# ratingsRDD -> (movie_id, user_id, rating)
trainingRDD, validationRDD, testRDD = ratingsRDD.randomSplit([6, 2, 2], seed=0)
print(validationRDD.take(1))

[(22, 377, 1.0)]


In [7]:
# Build the recommendation model using Alternating Least Squares
ranks = [2, 3, 4, 8, 12]  # Rank is a factor which can be tuned to get the best model for our dataset
numIterations = 5
regularizationParameter = 0.1
for rank in ranks:
    model = ALS.train(trainingRDD, rank, numIterations, lambda_=regularizationParameter)

    # Evaluate the model on training data
    validation_data = validationRDD.map(lambda p: (p[0], p[1]))
    predictions = model.predictAll(validation_data).map(lambda r: ((r[0], r[1]), r[2]))
    ratesAndPreds = validationRDD.map(lambda r: ((r[0], r[1]), r[2])).join(predictions)
    MSE = ratesAndPreds.map(lambda r: (r[1][0] - r[1][1])**2).mean()
    print("Mean Squared Error = " + str(MSE))
    
print('Predictions RDD: ', predictions.take(1))
print('Ratings and Predictions combined RDD: ', ratesAndPreds.take(1))

Mean Squared Error = 0.8949482092140121
Mean Squared Error = 0.8905240743563481
Mean Squared Error = 0.889356063525915
Mean Squared Error = 0.9161278096110412
Mean Squared Error = 0.914008649357351
Predictions RDD:  [((650, 670), 2.565526227777272)]
Ratings and Predictions combined RDD:  [((872, 930), (3.0, 3.1881548922747243))]


In [8]:
bestRank = 3 # Fill the best rank by observing the errors from several runs
myModel = ALS.train(trainingRDD, bestRank, seed=0, iterations=numIterations,
                      lambda_=regularizationParameter)
testForPredictingRDD = testRDD.map(lambda item: (item[0], item[1]) )  # (user, movie, rating) -> (user, movie)
predictedTestRDD = myModel.predictAll(testForPredictingRDD)
# Check how does it look like after predicting from the model
predictedTestRDD.take(2)

[Rating(user=331, product=1100, rating=4.638645224307679),
 Rating(user=655, product=1100, rating=3.1824183115058235)]

In [9]:
# Calculate average rating value of all the ratings for the whole predicted test RDD
predictedTestRDD.map(lambda item: item[2]).reduce(lambda a,b: a+b) / predictedTestRDD.count() # item[2] = ratings

3.4198448238160895

In [10]:
testRDD.take(2) # verify again how it looks like

[(298, 474, 4.0), (115, 265, 2.0)]

In [11]:
# Calculate average ratings for the test RDD
testRDD.map(lambda item: item[2]).reduce(lambda a,b: a+b) / testRDD.count() # item[2] = ratings

3.532626021119745

#### 3. In this section you will use the above demonstration to calculate recommendations for yourself. You need to provide manual ratings to at least 10 of the movies from the list and then do the same step of training and testing steps again.

In [17]:
# Execute this section to get the list of movies which need to select from
print('List of movies with maximum number of ratings')
print('(average rating, movie name, number of reviews)')
for ratingsTuple in movieLimitedAndSortedByRatingRDD.take(50):
    print(ratingsTuple)

List of movies with maximum number of ratings
(average rating, movie name, number of reviews)
(4.466442953020135, "Schindler's List (1993)", 298)
(4.45679012345679, 'Casablanca (1942)', 243)
(4.445229681978798, 'Shawshank Redemption, The (1994)', 283)
(4.3875598086124405, 'Rear Window (1954)', 209)
(4.385767790262173, 'Usual Suspects, The (1995)', 267)
(4.3584905660377355, 'Star Wars (1977)', 583)
(4.292237442922374, 'To Kill a Mockingbird (1962)', 219)
(4.291666666666667, "One Flew Over the Cuckoo's Nest (1975)", 264)
(4.28974358974359, 'Silence of the Lambs, The (1991)', 390)
(4.283292978208232, 'Godfather, The (1972)', 413)
(4.252380952380952, 'Raiders of the Lost Ark (1981)', 420)
(4.2457142857142856, 'Titanic (1997)', 350)
(4.204359673024523, 'Empire Strikes Back, The (1980)', 367)
(4.203980099502488, 'Boot, Das (1981)', 201)
(4.186602870813397, 'Godfather: Part II, The (1974)', 209)
(4.172839506172839, 'Princess Bride, The (1987)', 324)
(4.163043478260869, 'Amadeus (1984)', 276)


In [18]:
moviesRDD.take(2)  # Verify again what moviesRDD looks like (movie_id, movie_name)

[(1, 'Toy Story (1995)'), (2, 'GoldenEye (1995)')]

In [22]:
ratingsRDD.take(1)

[(196, 242, 3.0)]

In [33]:
my_user_id = 0  # This is your user id , do not change it

# Note that the movie IDs are the *last* number on each line. A common error was to use the number of ratings as the movie ID.
myRatedMoviesName = [
    (my_user_id, u'Dead Man Walking (1995)' , 2),
    (my_user_id, u'Godfather: Part II, The (1974)' , 1),
    (my_user_id, u'Raiders of the Lost Ark (1981)' , 5),
    (my_user_id, u'Indiana Jones and the Last Crusade (1989)', 4),
    (my_user_id, u'Terminator 2: Judgment Day (1991)', 4),
    (my_user_id, u'Godfather, The (1972)', 2),
    (my_user_id, u'Apollo 13 (1995)', 3),
    (my_user_id, u'Usual Suspects, The (1995)', 1),
    (my_user_id, u'Amadeus (1984)', 1),
    (my_user_id, u'Alien (1979)', 5),
    ]
myRatedMovies = []
# We need to pull movie_ids for the movies that you have entered from the moviesRDD
for (uid, name, rating) in myRatedMoviesName:
    movie_id = moviesRDD.filter(lambda item: item[1] == name).take(1)[0][0]
    myRatedMovies.append( (uid, movie_id, float(rating)) )
# Convert the python list into RDD     
myRatingsRDD = sc.parallelize(myRatedMovies)
print(myRatingsRDD.take(10))

[(0, 9, 2.0), (0, 187, 1.0), (0, 174, 5.0), (0, 210, 4.0), (0, 96, 4.0), (0, 127, 2.0), (0, 28, 3.0), (0, 12, 1.0), (0, 191, 1.0), (0, 183, 5.0)]


In [34]:
# Use the union() function of spark to append the contents of myRatingsRDD to trainingRDD
trainingWithMyRatingsRDD = trainingRDD.union(myRatingsRDD)
# Then train the model with the new RDD using same old parameters as before
myRatingsModel = ALS.train(trainingWithMyRatingsRDD, bestRank, seed=0, iterations=numIterations,
                      lambda_=regularizationParameter)

In [36]:
# Now select all movies except the ones you rated in myRatedMovies array. 
# Hint: you can run a for loop with lambda to filter and select only the movies which were not in the myRatedMovies array 
_myUnratedMoviesRDD = moviesRDD.filter(lambda item: item[0] not in [x[1] for x in myRatedMovies]) # item -> (movie_id, movie_name)
print('_myUnratedMoviesRDD', _myUnratedMoviesRDD.take(1))  # (movie_id, movie_name)

_myUnratedMoviesRDD [(1, 'Toy Story (1995)')]


In [44]:
# Transform the above result into an RDD which looks like (my_user_id, movie_id)
myUnratedMoviesRDD = _myUnratedMoviesRDD.map(lambda item: (my_user_id, item[0]))
print('myUnratedMoviesRDD ', myUnratedMoviesRDD.take(1))

myUnratedMoviesRDD  [(0, 1)]


In [45]:
# Remember how we converted testRDD to testForPredictingRDD by removing the ratings field from the testRDD 
# myUnratedMovies now has an user_id (my_user_id) and movie_id.
# Hence now you can use myUnratedMoviesRDD with myRatingsModel to predict your ratings for the movies

predictedRatingsRDD = myRatingsModel.predictAll(myUnratedMoviesRDD)
predictedRatingsRDD.take(2) # IMPORTANT, This RDD is not made up of tuples now, it is an RDD of 'Rating' objects!

[Rating(user=0, product=39, rating=1.7181107901675894),
 Rating(user=0, product=767, rating=1.2998711094634228)]

In [51]:
# Transform movieIDsWithAvgRatingsRDD from section(1 D)
# Remember it has the form (MovieID, (number of ratings, average rating)), transform into an RDD of the form (MovieID, number of ratings)
movieCountsRDD = movieIDsWithAvgRatingsRDD.map(lambda item: (item[0], item[1][0]))
print('movieCountsRDD (movie_id, number of ratings)', movieCountsRDD.take(1))

# Transform predictedRatingsRDD into an RDD with entries that are pairs of the form (Movie ID, Predicted Rating)
predictedMoviesWithRatingsRDD = predictedRatingsRDD.map(lambda item: (item.product, item.rating))
print('predictedMoviesWithRatingsRDD (movie_id, rating)', predictedMoviesWithRatingsRDD.take(1))

movieCountsRDD (movie_id, number of ratings) [(640, 82)]
predictedMoviesWithRatingsRDD (movie_id, rating) [(39, 1.7181107901675894)]


In [52]:
# Use predictedMoviesWithRatingsRDD and movieCountsRDD (created above) to yield a new RDD of the form (Movie ID, (Predicted Rating, number of ratings))
predictedMoviesWithRatingsAndCountsRDD  = predictedMoviesWithRatingsRDD.join(movieCountsRDD)
predictedMoviesWithRatingsAndCountsRDD.take(2)

[(1155, (3.370059549747708, 3)), (132, (2.6694794408147793, 246))]

In [69]:
# Select movies from predictedMoviesWithRatingsAndCountsRDD with number of ratings more than say, 150
# Then, Using PredictedMoviesWithRatingsAndCountsRDD and moviesRDD (which has the movie name) we need to yield an RDD of the form
# (Predicted Rating, Movie Name, number of ratings)

predictedMoviesWithRatingsAndCountsRDDOver150 = predictedMoviesWithRatingsAndCountsRDD.filter(lambda x: x[1][1] > 150)

predictedMoviesWithRatingsCountsAndNamesRDD = predictedMoviesWithRatingsAndCountsRDDOver150.join(moviesRDD)
print('predictedMoviesWithRatingsCountsAndNamesRDD ', predictedMoviesWithRatingsCountsAndNamesRDD.take(1)) 

ratingsWithNamesRDD = predictedMoviesWithRatingsCountsAndNamesRDD.map(lambda x: (x[1][0][0], x[1][1], x[1][0][1]))
print('ratingsWithNamesRDD ', ratingsWithNamesRDD.take(1))

predictedMoviesWithRatingsCountsAndNamesRDD  [(265, ((3.2741733749611015, 227), 'Hunt for Red October, The (1990)'))]
ratingsWithNamesRDD  [(3.2741733749611015, 'Hunt for Red October, The (1990)', 227)]


In [70]:
# use takeOrdered instead of take and pass the lambda function in key to sort it in descending order (select 20 movies)
ratingsWithNamesRDD.takeOrdered(20, key=lambda x: -x[0])

# These are Highest rated 20 movies (Predicted Recommendations) with reviews > 150

[(4.02084016526709, 'Independence Day (ID4) (1996)', 429),
 (3.894651260229967, 'Titanic (1997)', 350),
 (3.8348272488208406, 'Time to Kill, A (1996)', 232),
 (3.8301274572475466, 'Air Force One (1997)', 431),
 (3.8291934034118724, 'Top Gun (1986)', 220),
 (3.826040571012653, 'True Lies (1994)', 208),
 (3.762219294861951, 'Rock, The (1996)', 378),
 (3.7426954865776985, 'American President, The (1995)', 164),
 (3.7023181843835777, 'Speed (1994)', 230),
 (3.699236157667446, 'Conspiracy Theory (1997)', 295),
 (3.699073772874656, 'Ghost (1990)', 170),
 (3.6919508207344593, 'Ransom (1996)', 267),
 (3.6862950699112043, 'Twister (1996)', 293),
 (3.6802226059231122, 'Phenomenon (1996)', 244),
 (3.655166308654925, 'Braveheart (1995)', 297),
 (3.61163440141907, 'Pretty Woman (1990)', 164),
 (3.590322699372818, 'Saint, The (1997)', 316),
 (3.588461536881516, 'While You Were Sleeping (1995)', 162),
 (3.582116508512235, 'Forrest Gump (1994)', 321),
 (3.572981038065703, 'Lion King, The (1994)', 220)