In [3]:
import numpy as np
import pandas as pd
from pyspark.sql.types import IntegerType
from pyspark.ml.recommendation import ALS
import matplotlib.pyplot as plt
import pyspark as ps

In [2]:
# Start (or reuse) a Spark session on a local master with 4 worker threads,
# then keep a handle on its underlying SparkContext.
spark = (
    ps.sql.SparkSession.builder
    .master("local[4]")
    .appName("sparkSQL exercise")
    .getOrCreate()
)
sc = spark.sparkContext

In [11]:
# Load the training ratings from a comma-separated file in the working directory
# and preview the first five rows.
ratings_data = pd.read_csv("training.csv", sep=",")
ratings_data.head(n=5)

Unnamed: 0,user,movie,rating,timestamp
0,6040,858,4,956703932
1,6040,593,5,956703954
2,6040,2384,4,956703954
3,6040,1961,4,956703977
4,6040,2019,5,956703977


In [50]:
# Read the "::"-delimited movie file. Passing `names` without `header` tells
# pandas the file has no header row, so the column names are supplied here.
# A separator longer than one character is only supported by pandas' Python
# parser; naming the engine explicitly avoids the ParserWarning that is raised
# when pandas silently falls back from the C engine.
movie_data = pd.read_csv(
    "movies.dat",
    delimiter="::",
    names=["movie", "title", "genre"],
    engine="python",
)

  """Entry point for launching an IPython kernel.


In [49]:
# Preview the first five movie rows.
movie_data.head(n=5)

Unnamed: 0,movie,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [51]:
# Build one 0/1 indicator column per genre label; labels within a single
# "genre" value are separated by "|".
dummy_cols = movie_data["genre"].str.get_dummies(sep="|")

In [54]:
# Append the genre indicator columns to the right of the movie table
# (rows are aligned on the index).
movie_data = pd.concat([movie_data, dummy_cols], axis="columns")

In [56]:
# The raw pipe-separated "genre" string is no longer needed once the
# indicator columns have been added.
movie_data = movie_data.drop(columns=["genre"])

In [33]:
# Read the "::"-delimited user file. Passing `names` without `header` tells
# pandas the file has no header row, so the column names are supplied here.
# A separator longer than one character is only supported by pandas' Python
# parser; naming the engine explicitly avoids the ParserWarning that is raised
# when pandas silently falls back from the C engine.
# NOTE(review): the previews in this notebook show 4-digit zipcodes (e.g. 2460),
# which may mean leading zeros are lost somewhere -- confirm whether "zipcode"
# should be read as a string.
user_data = pd.read_csv(
    "users.dat",
    delimiter="::",
    names=["user", "gender", "age", "occupation", "zipcode"],
    engine="python",
)

  """Entry point for launching an IPython kernel.


In [34]:
# Preview the first five user rows.
user_data.head(n=5)

Unnamed: 0,user,gender,age,occupation,zipcode
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [57]:
# Derive a "year" column from the title. The slice [-5:-1] takes the four
# characters that sit just before the final character, i.e. the digits inside
# a trailing "(YYYY)" as seen in the previewed titles. The value stays a string.
movie_data["year"] = movie_data["title"].map(lambda title: title[-5:-1])
movie_data.head(n=5)

Unnamed: 0,movie,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story (1995),0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [58]:
# Strip the trailing " (YYYY)" suffix (7 characters) from each title.
# A blind x[:-7] slice is not idempotent: running the cell a second time would
# cut seven more characters off titles that were already cleaned. Restricting
# the slice to titles that still end with the suffix makes a re-run a no-op.
has_year_suffix = movie_data["title"].str.contains(r" \(\d{4}\)$", na=False)
movie_data["title"] = movie_data["title"].where(
    ~has_year_suffix,                 # keep titles that have no suffix as they are
    movie_data["title"].str[:-7],     # otherwise drop the " (YYYY)" tail
)
movie_data.head()

Unnamed: 0,movie,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,Jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,Waiting to Exhale,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [41]:
# Collect the distinct genre labels.
# The raw "genre" column is dropped from movie_data by an earlier cell, so
# reading movie_data['genre'] here raises KeyError when the notebook is run
# top to bottom. The indicator columns produced by str.get_dummies() are named
# after exactly the labels obtained by splitting "genre" on "|", so the same
# set can be taken from dummy_cols instead.
genres = set(dummy_cols.columns)
genres

{'Action',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [None]:
# Turn the pipe-separated genre string into a list of labels.
# "genre" is dropped from movie_data by an earlier cell, so an unconditional
# lookup raises KeyError on a fresh top-to-bottom run; only split when the
# column is still present.
if "genre" in movie_data.columns:
    movie_data["genre"] = movie_data["genre"].apply(lambda x: x.split("|"))

In [62]:
# Preview the first five rows of the transformed movie table.
movie_data.head(n=5)

Unnamed: 0,movie,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,1,Toy Story,0,0,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
1,2,Jumanji,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,3,Grumpier Old Men,0,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1995
3,4,Waiting to Exhale,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1995
4,5,Father of the Bride Part II,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [60]:
# Encode gender numerically: "M" -> 1, "F" -> 0. Series.map turns any value
# that is not a key of the mapping into NaN.
user_data["gender"] = user_data["gender"].map(dict(M=1, F=0))

In [61]:
# Preview the first five user rows after gender encoding.
user_data.head(n=5)

Unnamed: 0,user,gender,age,occupation,zipcode
0,1,0,1,10,48067
1,2,1,56,16,70072
2,3,1,25,15,55117
3,4,1,45,7,2460
4,5,1,25,20,55455


In [63]:
# Attach movie attributes to every rating. A left join keeps all ratings,
# including any whose movie id has no match in movie_data.
movie_rating = ratings_data.merge(movie_data, how="left", on="movie")



In [70]:
# Preview the first ten rating rows with movie attributes attached.
movie_rating.head(n=10)

Unnamed: 0,user,movie,rating,timestamp,title,Action,Adventure,Animation,Children's,Comedy,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,year
0,6040,858,4,956703932,"Godfather, The",1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1972
1,6040,593,5,956703954,"Silence of the Lambs, The",0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1991
2,6040,2384,4,956703954,Babe: Pig in the City,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,1998
3,6040,1961,4,956703977,Rain Man,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1988
4,6040,2019,5,956703977,Seven Samurai (The Magnificent Seven) (Shichin...,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1954
5,6040,1419,3,956704056,Walkabout,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1971
6,6040,573,4,956704056,"Ciao, Professore! (Io speriamo che me la cavo )",0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1993
7,6040,3111,5,956704056,Places in the Heart,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1984
8,6040,213,5,956704056,Burnt By the Sun (Utomlyonnye solntsem),0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1994
9,6040,3505,4,956704056,No Way Out,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1987


In [67]:
# Attach user attributes to every rating. A left join keeps all ratings,
# including any whose user id has no match in user_data.
user_rating = ratings_data.merge(user_data, how="left", on="user")


In [69]:
# Show ten randomly chosen rows. A fixed random_state makes the displayed
# sample identical on every run, so the saved output stays reproducible.
user_rating.sample(n=10, random_state=0)

Unnamed: 0,user,movie,rating,timestamp,gender,age,occupation,zipcode
114516,5239,2410,2,961443420,1,35,7,7043
479471,2901,3359,4,971884464,1,25,17,78749
199298,4619,3176,5,964124663,0,25,1,97225
35699,5780,2540,4,958154897,1,18,17,92886
522083,2777,2806,3,973729444,1,18,4,95326
59459,5616,1388,1,959134683,1,45,1,8840
70078,5555,1605,1,959550643,1,1,10,37830
633227,1632,1238,3,974717779,1,25,16,94120
345390,3687,509,2,966316948,0,50,1,62221
418772,3272,3471,5,968204107,1,35,0,8330


In [71]:
# Combine the movie-enriched and user-enriched rating tables with an inner
# join on the (user, movie) pair.
# Both inputs carry "rating" and "timestamp" from ratings_data, so pandas
# suffixes them as rating_x/rating_y and timestamp_x/timestamp_y in the result.
# NOTE(review): merging user_data straight onto movie_rating would avoid the
# duplicated columns -- confirm no later cell relies on the suffixed names.
final_train = movie_rating.merge(user_rating, how="inner", on=["user", "movie"])

In [72]:
# Preview the first five rows of the combined training table.
final_train.head(n=5)

Unnamed: 0,user,movie,rating_x,timestamp_x,title,Action,Adventure,Animation,Children's,Comedy,...,Thriller,War,Western,year,rating_y,timestamp_y,gender,age,occupation,zipcode
0,6040,858,4,956703932,"Godfather, The",1,0,0,0,0,...,0,0,0,1972,4,956703932,1,25,6,11106
1,6040,593,5,956703954,"Silence of the Lambs, The",0,0,0,0,0,...,1,0,0,1991,5,956703954,1,25,6,11106
2,6040,2384,4,956703954,Babe: Pig in the City,0,0,0,1,1,...,0,0,0,1998,4,956703954,1,25,6,11106
3,6040,1961,4,956703977,Rain Man,0,0,0,0,0,...,0,0,0,1988,4,956703977,1,25,6,11106
4,6040,2019,5,956703977,Seven Samurai (The Magnificent Seven) (Shichin...,1,0,0,0,0,...,0,0,0,1954,5,956703977,1,25,6,11106
