In [1]:
import pyspark as ps
import numpy as np
import pandas as pd
from pyspark.sql.types import *
from pyspark.ml.recommendation import ALS
from pyspark.sql import functions as F
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
spark = (ps.sql.SparkSession
         .builder
         .master('local[4]')
         .appName('recommender')
         .getOrCreate()
        )
sc = spark.sparkContext


from pyspark.sql import SparkSession

# Setup a SparkSession
spark = SparkSession.builder.getOrCreate()

In [3]:
def get_frames1(filename,test_file = False):

    ## Reading in the data
    ratings_data = pd.read_csv(filename)

    movie_data = pd.read_csv("../data/movies.dat",
                            delimiter = "::",
                            names=["movie","title","genre"])

    user_data = pd.read_csv("../data/users.dat",
                            delimiter = "::",
                            names=["user","gender","age","occupation","zipcode"])


    ## Adding Movie Genre Dummy Cols
    dummy_cols = movie_data.genre.str.get_dummies()
    movie_data = pd.concat((movie_data,dummy_cols),axis = 1)
    #movie_data.drop("genre",axis=1, inplace=True)


    ## Creating seperate year column and title column
    movie_data["year"]=movie_data["title"].apply(lambda x: x[-5:-1])
    movie_data["title"] = movie_data["title"].apply(lambda x: x[:-7])

    ## Mapping M and F in user data to 1 and 0
    user_data["gender"] = user_data["gender"].map({"M":1,"F":0})


    ###################################
    ####### MERGES ###################

    ## DF with movie rating and the movie info
    movie_rating = pd.merge(ratings_data,
                            movie_data,
                            how="left",
                            left_on ="movie",
                            right_on="movie")

    ## DF with movie rating and the user info
    user_rating = pd.merge(ratings_data,
                           user_data,
                           how="left",
                           left_on ="user",
                           right_on="user")

    ## Final DF with both movie info and user info
    if test_file == False:
        final_train = pd.merge(movie_rating,
                               user_rating,
                               on=["user","movie","rating","timestamp"])
    else:
        final_train = pd.merge(movie_rating,
                               user_rating,
                               on=["user","movie"])


    ## Returning frames as dictionary
    frames = {"ratings_data": ratings_data,
              "movie_data": movie_data,
              "user_data": user_data,
              "movie_rating": movie_rating,
              "user_rating": user_rating,
              "total_frame": final_train}
    print("Name of Frames for reference")
    print("ratings_data, movie_data, user_data, movie_rating, user_rating, total_frame")
    return frames

In [4]:
#train_kp = pd.read_csv("training.csv")
train_dict = get_frames1("training.csv")
train_kp = train_dict["total_frame"]

  
  if sys.path[0] == '':


Name of Frames for reference
ratings_data, movie_data, user_data, movie_rating, user_rating, total_frame


In [5]:
mov = train_dict["movie_data"]
mov.head()

Unnamed: 0,movie,title,genre,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story,Animation|Children's|Comedy,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,Jumanji,Adventure|Children's|Fantasy,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,Comedy|Romance,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,Waiting to Exhale,Comedy|Drama,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,Comedy,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [6]:
subset_train = train_kp[["user","movie","rating"]]

In [9]:
test_dict = get_frames1("requests.csv",test_file=True)
test_kp = test_dict["total_frame"]
subset_test = test_kp[["user","movie"]]

  
  if sys.path[0] == '':


Name of Frames for reference
ratings_data, movie_data, user_data, movie_rating, user_rating, total_frame


In [10]:
train_kp.head()


Unnamed: 0,user,movie,rating,timestamp,title,genre,Action,Adventure,Animation,Children's,...,Romance,Sci-Fi,Thriller,War,Western,year,gender,age,occupation,zipcode
0,6040,858,4,956703932,"Godfather, The",Action|Crime|Drama,1,0,0,0,...,0,0,0,0,0,1972,1,25,6,11106
1,6040,593,5,956703954,"Silence of the Lambs, The",Drama|Thriller,0,0,0,0,...,0,0,1,0,0,1991,1,25,6,11106
2,6040,2384,4,956703954,Babe: Pig in the City,Children's|Comedy,0,0,0,1,...,0,0,0,0,0,1998,1,25,6,11106
3,6040,1961,4,956703977,Rain Man,Drama,0,0,0,0,...,0,0,0,0,0,1988,1,25,6,11106
4,6040,2019,5,956703977,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama,1,0,0,0,...,0,0,0,0,0,1954,1,25,6,11106


In [15]:
movie_data=train_dict["movie_data"]
movie_data.columns[3:-1]

Index(['Action', 'Adventure', 'Animation', 'Children's', 'Comedy', 'Crime',
       'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
       'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western'],
      dtype='object')

In [13]:
r1 = train_kp.groupby(["genre","user"]).mean()['rating']

In [None]:
#genre_avg = defaultdict(float)
#for i in movie_data.columns[2:-1]:
#    genre_avg[i] = dict(train_kp.groupby([i,"user"]).mean()['rating'][1])

In [17]:
r2 = train_kp.groupby('movie').mean()['rating']

In [18]:
r3 = train_kp.groupby("user").mean()['rating']

In [19]:
r4 = train_kp.rating.mean()

In [20]:
r4

3.59047875

In [21]:
r1.Action[637]

3.8

In [22]:
r2[640]

2.685185185185185

In [23]:
r3[640]

4.333333333333333

In [24]:
train_kp.head()

Unnamed: 0,user,movie,rating,timestamp,title,genre,Action,Adventure,Animation,Children's,...,Romance,Sci-Fi,Thriller,War,Western,year,gender,age,occupation,zipcode
0,6040,858,4,956703932,"Godfather, The",Action|Crime|Drama,1,0,0,0,...,0,0,0,0,0,1972,1,25,6,11106
1,6040,593,5,956703954,"Silence of the Lambs, The",Drama|Thriller,0,0,0,0,...,0,0,1,0,0,1991,1,25,6,11106
2,6040,2384,4,956703954,Babe: Pig in the City,Children's|Comedy,0,0,0,1,...,0,0,0,0,0,1998,1,25,6,11106
3,6040,1961,4,956703977,Rain Man,Drama,0,0,0,0,...,0,0,0,0,0,1988,1,25,6,11106
4,6040,2019,5,956703977,Seven Samurai (The Magnificent Seven) (Shichin...,Action|Drama,1,0,0,0,...,0,0,0,0,0,1954,1,25,6,11106


In [25]:
train_spark = spark.createDataFrame(subset_train)
test_spark = spark.createDataFrame(subset_test)

In [51]:
#train, test = train_spark.randomSplit([0.8,0.2],seed=0)

In [26]:
als_model = ALS(
                itemCol = 'movie',
                userCol = 'user',
                ratingCol = 'rating',
                nonnegative = True,
                regParam = 0.1,
                rank = 10)

In [27]:
recommender = als_model.fit(train_spark)

In [28]:
y_pred = recommender.transform(test_spark)

In [29]:
pd_pred = y_pred.toPandas()

In [30]:
pd_pred[pd_pred['prediction'].isnull()==True].head()

Unnamed: 0,user,movie,prediction
0,53,148,
5,216,148,
6,482,148,
8,424,148,
11,26,463,


In [35]:
pd_pred.shape

(200209, 3)

In [34]:
def nulls(n):
    if n.user not in train_kp.user.unique() or n.movie not in mov.movie:
        return r4
    elif n.user in r1[mov.genre[n.movie]]:
        return r1[mov.genre[n.movie]][n.user]
    elif n.movie in r2:
        return r2[n.movie]
    elif n.movie in r3:
        return r3[n.movie]
    else:
        return r4

In [32]:
n.movie in mov.movie

NameError: name 'n' is not defined

In [212]:
n.user in train_kp.user.unique()

True

In [41]:
test = pd_pred.apply(lambda x: nulls(x) if pd.isnull(x.prediction) else x['prediction'], axis = 1)

In [43]:
sum(test.isnull())

0

In [44]:
pd_pred['rating'] = test

In [45]:
pd_pred.head()

Unnamed: 0,user,movie,prediction,rating
0,53,148,,3.590479
1,4169,148,3.12119,3.12119
2,5333,148,2.410599,2.410599
3,4387,148,2.463508,2.463508
4,840,148,2.52256,2.52256


In [46]:
fin = pd_pred[['user','movie', 'rating']]

In [47]:
fin.describe()

Unnamed: 0,user,movie,rating
count,200209.0,200209.0,200209.0
mean,1511.751225,1930.586682,3.473943
std,1582.930564,1129.67035,0.491622
min,1.0,1.0,0.511817
25%,331.0,1046.0,3.391875
50%,752.0,1946.0,3.590479
75%,2131.0,2890.0,3.590479
max,6040.0,3952.0,5.460301


In [61]:
fin.to_csv('final_res.csv')