Let's set up Spark in your Colab environment. Run the cell below!


In [39]:
!pip install pyspark
!pip install -U -q PyDrive
!pip install keras
!pip install scikit-surprise
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
!pip install findspark
!pip install wget



In [40]:
# Core data-handling and Spark imports used by the cells below.
import pandas as pd
from pyspark import SparkConf, SparkContext, sql
from pyspark.sql import SparkSession
import pyspark
# NOTE(review): the wildcard imports below put many names into the notebook
# namespace; pyspark.sql.functions exports names such as `sum`, `min` and `max`
# that shadow the Python built-ins - confirm this is intended.
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
import findspark
# Registers the MySQL connector package with findspark.
# NOTE(review): no visible cell reads from MySQL - confirm this is still needed.
findspark.add_packages('mysql:mysql-connector-java:8.0.11')
import keras
# Filesystem, archive and download helpers used to fetch the dataset.
import os.path
from os import path
from zipfile import ZipFile
import wget


# **1. Download and extract the MovieLens dataset** 

In [41]:
# Download the MovieLens 25M archive from files.grouplens.org.
# The extraction step expects "dataset/ml-25m.zip", so that is the archive
# fetched here.
dataset_path = "dataset/"

data_url = "https://files.grouplens.org/datasets/movielens/ml-25m.zip"
# Smaller variant of the dataset, kept for quick experiments (not downloaded).
data_url1 = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"

# Check for the archive (or the already-extracted folder) rather than only the
# "dataset/" directory: an empty directory left behind by a failed download
# must not prevent a retry.
archive_present = path.exists(dataset_path + "ml-25m.zip")
already_extracted = path.exists(dataset_path + "ml-25m")

if not (archive_present or already_extracted):
    # Pure-Python replacement for `!mkdir dataset`; no error if it exists.
    os.makedirs(dataset_path, exist_ok=True)
    wget.download(data_url, dataset_path)
else:
    print("dataset already exist. No need to download ")


dataset already exist. No need to download 


In [22]:
# Only extract the data the first time the script is run.
# path.join avoids the doubled separator ("dataset//ml-25m") that plain string
# concatenation produced, because dataset_path already ends with "/".
movielens_dir = path.join(dataset_path, "ml-25m")
movielens_zipped_file = path.join(dataset_path, "ml-25m.zip")

if not path.exists(movielens_dir):
    # Named `archive`, not `zip`: binding `zip` here would replace the built-in
    # zip() for every later cell of the notebook.
    with ZipFile(movielens_zipped_file, "r") as archive:
        # Extract files
        print("Extracting all the files now...")
        archive.extractall(path=dataset_path)
        print("Done!")
else:
    print("dataset already exist. No need to extract ")


Extracting all the files now...
Done!


# **2. Setup the big data environment with pyspark**

In [28]:
# Spark configuration: serve the Spark UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Build the session from the configuration with getOrCreate(), so re-running
# this cell reuses the running session instead of failing because a
# SparkContext already exists.
spark = SparkSession \
    .builder \
    .appName("Movie recommendation") \
    .config(conf=conf) \
    .getOrCreate()

# The context is owned by the session; expose it under the usual name.
sc = spark.sparkContext


KeyboardInterrupt: 

In [None]:
# Display the SparkSession object as the cell output.
spark

# **3. Data exploration and cleaning**


**a. Reading the downloaded movie dataset into PySpark DataFrames**



In [23]:
# Locations of the two CSV files read below, inside the extracted dataset folder.
ratings_file = f"{movielens_dir}/ratings.csv"
movies_file = f"{movielens_dir}/movies.csv"


In [None]:
# Explicit schemas for the two CSV files, so Spark does not infer column types.

from pyspark.sql.types import *

ratings_df_schema = StructType([
    StructField('userId', IntegerType()),
    StructField('movieId', IntegerType()),
    StructField('rating', DoubleType()),
])
movies_df_schema = StructType([
    StructField('movieId', IntegerType()),
    StructField('title', StringType()),
    StructField('genres', StringType()),
])

# Read each file with its fixed schema: the first row is a header and schema
# inference stays off.
ratings_df = spark.read.csv(
    ratings_file, schema=ratings_df_schema, header=True, inferSchema=False
)
movies_df = spark.read.csv(
    movies_file, schema=movies_df_schema, header=True, inferSchema=False
)

# Cache both DataFrames in memory for the cells that follow.
ratings_df.cache()
movies_df.cache()

In [None]:
# Print the first 10 rows of each DataFrame without truncating long values.
ratings_df.show(n=10, truncate=False)
movies_df.show(n=10, truncate=False)

In [None]:
# Display the ratings Spark DataFrame object as the cell output.
ratings_df


In [None]:
# Display the movies Spark DataFrame object as the cell output.
movies_df

# **4. Building the user-based collaborative filtering**



> For this project, we will use the Surprise (Simple Python RecommendatIon System Engine) library.


> This is because it is faster and has built-in matrix factorization algorithms such as SVD.





In [37]:
# Import libraries from Surprise package

from surprise import Reader, Dataset, SVD, SVDpp
from surprise import accuracy
from surprise.model_selection import cross_validate

In [19]:
# Surprise loads its data from pandas DataFrames, so the ratings are read with pandas in the next cell; the Spark-to-pandas conversion below is kept only for reference.
#ratings_df_pd =ratings_df.toPandas()

In [27]:
# Read only the three columns the recommender needs; skipping the unused
# remaining column(s) of ratings.csv keeps the in-memory frame smaller.
ratings_pd = pd.read_csv(ratings_file, usecols=['userId', 'movieId', 'rating'])


In [28]:
# Keep exactly the user, item and rating columns, in that order.
ratings_pd = ratings_pd.loc[:, ['userId', 'movieId', 'rating']]

In [29]:
# Display the pandas ratings DataFrame as the cell output.
ratings_pd

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
25000090,162541,50872,4.5
25000091,162541,55768,2.5
25000092,162541,56176,2.0
25000093,162541,58559,4.0


In [30]:
# Show the dtype of each column of the pandas ratings DataFrame.
ratings_pd.dtypes

userId       int64
movieId      int64
rating     float64
dtype: object

In [38]:
# Describe the rating scale to Surprise. MovieLens ratings go from 0.5 to 5.0
# stars in half-star steps, so the lower bound must be 0.5: with (1, 5) the
# predictions would be clipped to the wrong range.
reader = Reader(rating_scale=(0.5, 5))

# Build the Surprise dataset from the (userId, movieId, rating) columns.
data = Dataset.load_from_df(ratings_pd, reader)


# SVD matrix factorization model with 50 latent factors.
svd = SVD(n_factors=50)
#svd_plusplus = SVDpp(n_factors=50)


# Run 5-fold cross-validation and print results.

cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

KeyboardInterrupt: 

In [None]:
# Build the training set from all available ratings and fit the model.
# The Surprise dataset created earlier is named `data`; the previous
# `dataset` name was never defined and raised a NameError.

trainset = data.build_full_trainset()

svd.fit(trainset)  # old version use svd.train