In [1]:
import pyspark 

In [2]:
# Attach to the running Spark session, or start one if none exists yet.
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)
# Keep a handle on the session's SparkContext under the short name `sc`.
sc = spark.sparkContext

In [3]:
# Load the ratings JSON file into a Spark DataFrame; no schema is passed, so
# the column types are whatever the reader infers from the data.
ratings = spark.read.json('data/ratings.json')

In [4]:
# Mark `ratings` for caching so later actions can reuse it. persist() hands
# back the DataFrame itself, which is what this cell echoes.
ratings.persist()

DataFrame[movie_id: bigint, rating: bigint, timestamp: double, user_id: bigint]

In [5]:
# Print the first five rows as a quick sanity check of the loaded columns.
ratings.show(5)

+--------+------+------------+-------+
|movie_id|rating|   timestamp|user_id|
+--------+------+------------+-------+
|     858|     4|9.56678732E8|   6040|
|    2384|     4|9.56678754E8|   6040|
|     593|     5|9.56678754E8|   6040|
|    1961|     4|9.56678777E8|   6040|
|    1419|     3|9.56678856E8|   6040|
+--------+------+------------+-------+
only showing top 5 rows



In [6]:
import pandas as pd

# movies.dat has no header row and separates fields with '::'. A separator
# longer than one character needs pandas' python parsing engine.
movies = pd.read_csv(
    'data/movies.dat',
    sep='::',
    engine='python',
    header=None,
)
movies.head()

Unnamed: 0,0,1,2
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# users.dat has no header row and separates fields with '::', so the columns
# come back labelled 0..4 until they are renamed.
users = pd.read_csv(
    'data/users.dat',
    sep='::',
    engine='python',
    header=None,
)
users.head()

Unnamed: 0,0,1,2,3,4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [8]:
# Replace the positional column labels of users.dat with readable names.
users = users.rename(
    columns={
        0: 'user_id',
        1: 'gender',
        2: 'min_age',
        3: 'occupation',
        4: 'zipcode',
    }
)
# Count how many users carry each distinct min_age value.
users['min_age'].value_counts()

25    2096
35    1193
18    1103
45     550
50     496
56     380
1      222
Name: min_age, dtype: int64

In [9]:
# Load the prediction requests into a Spark DataFrame. It has the same four
# column names as `ratings`; its `rating` column is what needs predicting.
# NOTE(review): `rating` looks empty in the previews further down - confirm.
requests = spark.read.json('data/requests.json')

In [10]:
# Mark `requests` for caching; it is counted and scored in later cells.
requests.persist()

DataFrame[movie_id: bigint, rating: double, timestamp: double, user_id: bigint]

In [11]:
# Total number of request rows, the baseline for judging how many come back
# without a prediction later on.
requests.count()

280260

In [13]:
# Collect the whole Spark DataFrame into a local pandas DataFrame.
# toPandas() pulls every row into this process's memory.
ratings_df = ratings.toPandas()

In [16]:
# Preview the pandas copy of the ratings.
ratings_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id
0,858,4,956678732.0,6040
1,2384,4,956678754.0,6040
2,593,5,956678754.0,6040
3,1961,4,956678777.0,6040
4,1419,3,956678856.0,6040


In [17]:
from pyspark.ml.recommendation import ALSModel, ALS

In [18]:
# Configure an ALS matrix-factorisation recommender over the ratings columns.
#
# seed: fixed so the fitted factors, and therefore every prediction below, can
# be reproduced after a kernel restart instead of drifting from run to run.
#
# coldStartStrategy is deliberately left at its default ('nan'): the cells
# further down isolate exactly those request rows whose prediction is NaN.
als = ALS(
    rank=11,
    userCol='user_id',
    itemCol='movie_id',
    ratingCol='rating',
    seed=42,
)

In [19]:
# Fit the recommender on the full ratings DataFrame. No train/validation
# split is made in the visible cells.
als_model = als.fit(ratings)

In [20]:
# Score the same ratings the model was trained on. `preds` is not referenced
# again in the visible cells.
preds = als_model.transform(ratings)

In [21]:
# Score the request rows; transform() adds a `prediction` column.
# Some rows come back with a null prediction (visible in the preview below).
# NOTE(review): presumably users or movies absent from training (ALS cold
# start) - TODO confirm.
request_preds = als_model.transform(requests)

In [38]:
# Collect all scored requests into pandas. At this point nan_df still holds
# every request row; it is narrowed to the null-prediction rows in a later cell.
nan_df = request_preds.toPandas()

In [39]:
# Preview the scored requests (mix of null and non-null predictions).
nan_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959026.0,53,
1,148,,976559602.0,4169,3.267519
2,148,,989024856.0,5333,2.466185
3,148,,977005381.0,4387,2.241686
4,148,,966907208.0,3539,2.798501


In [40]:
import numpy as np

# Narrow nan_df down to the requests for which the model produced no prediction.
nan_df = nan_df.loc[nan_df['prediction'].isna()]

In [41]:
# Preview the rows that remain after keeping only null predictions.
nan_df.head()

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction
0,148,,977959000.0,53,
6,148,,976841600.0,216,
7,148,,976191200.0,482,
9,148,,1026978000.0,424,
14,463,,978242800.0,26,


In [42]:
# Number of request rows that received no prediction.
len(nan_df)

95628

In [92]:
# Load the movie metadata CSV with pandas' default parsing.
# The `id` column is not usable as an integer key yet: the following cells
# apply `.str` methods to it, drop malformed rows, and only then cast it to int.
meta_df = pd.read_csv('data/movies_metadata.csv')

In [93]:
# Show the rows whose `id` contains a '-'. In these rows the fields appear
# shifted, leaving a date-like string in `id`; they are dropped in a later
# cell. Missing ids count as "no match" (na=False).
meta_df[meta_df['id'].str.contains('-', na=False)]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [94]:
# `id` values of the malformed metadata rows found by the '-' check.
bad_ids = [
    '1997-08-20',
    '2012-09-29',
    '2014-01-01',
]

In [95]:
# Keep every metadata row except the malformed ones listed in bad_ids.
meta_df = meta_df.loc[~meta_df['id'].isin(bad_ids)]
meta_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [96]:
# Cast the cleaned `id` column to integers so it can be compared with the
# integer movie_id column during the merge.
# meta_df is a boolean-filtered selection at this point, so assigning a column
# onto it in place triggers pandas' SettingWithCopyWarning. assign() builds a
# fresh frame instead and leaves the result otherwise identical.
meta_df = meta_df.assign(id=meta_df['id'].astype(int))

In [97]:
# Attach movie metadata to every unscored request (left join keeps all
# nan_df rows; unmatched movies get null metadata columns).
# NOTE(review): movies.dat lists "Toy Story (1995)" under id 1 while meta_df
# lists "Toy Story" under id 862, so `movie_id` and `meta_df.id` appear to be
# two different identifier schemes. Joining them directly may attach another
# film's metadata to each request - TODO confirm and map the ids first.
all_data_df = nan_df.merge(meta_df, how='left', left_on='movie_id', right_on='id')

In [98]:
# Transpose the preview so the many metadata columns read as rows.
all_data_df.head().T

Unnamed: 0,0,1,2,3,4
movie_id,148,148,148,148,463.0
rating,,,,,
timestamp,9.77959e+08,9.76842e+08,9.76191e+08,1.02698e+09,978243000.0
user_id,53,216,482,424,26.0
prediction,,,,,
adult,False,False,False,False,
belongs_to_collection,,,,,
budget,5000000,5000000,5000000,5000000,
genres,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...","[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",
homepage,,,,,


In [99]:
# Same left merge as in cell In [97], repeated verbatim: it resets all_data_df
# to the request-plus-metadata frame before the users merge in the next cell.
# NOTE(review): `movie_id` and `meta_df.id` may not share an id scheme - verify
# the join key.
all_data_df = nan_df.merge(meta_df, how='left', left_on='movie_id', right_on='id')

In [100]:
# Build the modelling frame in one re-runnable step: start from nan_df, attach
# movie metadata, then attach the user demographics on user_id.
#
# Merging `users` onto the existing all_data_df in place is not idempotent: a
# second run finds gender/min_age/occupation/zipcode on both sides and emits
# `_x`/`_y` suffixed columns, which the later column selection would silently
# miss. Rebuilding from nan_df gives the same result however often the cell runs.
# NOTE(review): `movie_id` and `meta_df.id` may not share an id scheme - verify
# the movie join key.
all_data_df = (
    nan_df
    .merge(meta_df, how='left', left_on='movie_id', right_on='id')
    .merge(users, how='left', on='user_id')
)

In [103]:
# Show one fully merged row to confirm the metadata and user columns are present.
all_data_df.head(1)

Unnamed: 0,movie_id,rating,timestamp,user_id,prediction,adult,belongs_to_collection,budget,genres,homepage,...,status,tagline,title,video,vote_average,vote_count,gender,min_age,occupation,zipcode
0,148,,977959026.0,53,,False,,5000000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,Released,,The Secret Life of Words,False,6.8,52.0,M,25,0,96931


In [109]:
# Feature matrix: user demographics, movie metadata and the two id columns.
# DataFrame.filter keeps only those listed labels that actually exist, so a
# misspelled or absent column is dropped silently rather than raising.
X = all_data_df.filter(
    items=[
        'zipcode', 'occupation', 'min_age', 'gender',
        'vote_count', 'vote_average', 'runtime', 'revenue',
        'release_date', 'popularity', 'budget', 'adult',
        'user_id', 'movie_id',
    ],
    axis='columns',
)

In [110]:
# Target frame: the single `prediction` column, kept as a one-column DataFrame.
# NOTE(review): all_data_df descends from nan_df, which was filtered to rows
# whose prediction is null, so every value in `y` is null at this point.
y = all_data_df.filter(items=['prediction'], axis='columns')