### Setting SPARK_HOME and the PySpark library path

In [1]:
import os
import sys
import glob

# Point the kernel at the Spark 2 client install and expose its bundled
# Python libraries through the PYLIB environment variable.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# The py4j archive name changes with the Spark release, so look it up instead
# of pinning a single version. Fall back to the originally pinned name when
# nothing matches so the resulting sys.path entry is unchanged in that case.
py4j_archives = sorted(glob.glob(os.environ["PYLIB"] + "/py4j-*-src.zip"))
py4j_archive = (
    py4j_archives[-1]
    if py4j_archives
    else os.environ["PYLIB"] + "/py4j-0.10.4-src.zip"
)

# Insert order matters: pyspark.zip ends up first on sys.path, py4j second.
sys.path.insert(0, py4j_archive)
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

### Initializing Spark

In [2]:
import findspark

In [3]:
# Initialise findspark with an explicit Spark installation directory.
# NOTE(review): this path ("/usr/local/spark/") differs from the SPARK_HOME
# value assigned in the first cell ("/usr/hdp/current/spark2-client") —
# confirm which installation is intended and keep only one.
findspark.init("/usr/local/spark/")

In [4]:
from pyspark.sql import SparkSession

In [5]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Configure a local-mode Spark application for this notebook.
conf = SparkConf().setAppName("Movie Recommendation Application").setMaster('local')

# getOrCreate() keeps this cell re-runnable: constructing SparkContext directly
# raises when a context is already active in the kernel, whereas getOrCreate()
# returns the existing one.
sc = SparkContext.getOrCreate(conf=conf)

# Obtain the session through the builder so an already-running session is reused.
spark = SparkSession.builder.config(conf=conf).getOrCreate()



### Loading the dependent libraries

In [6]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.sql.functions import isnan, when, count, col, countDistinct


#### Problem Statement
 Predict the rating a user would give to a movie, based on previously observed user–movie ratings.




### Reading the movies and ratings data and creating a dataframe

In [7]:
## Read data and create a dataframe
# Directory that holds the CSV files. Set the MOVIELENS_DATA_DIR environment
# variable to run the notebook against another location; the default is the
# original hard-coded directory, so behaviour is unchanged when it is unset.
DATA_DIR = os.environ.get(
    "MOVIELENS_DATA_DIR",
    "file:///Users/jaisachdev/Desktop/SCIT/sem2/big data/20180701_Batch39_CSE7322c_Recommendation/ml-latest-small",
)


def read_csv_with_header(path):
    """Load the CSV at `path` into a DataFrame.

    The first row is treated as the header and column types are inferred
    by Spark, exactly as both original reader chains did.
    """
    return spark.read.format("csv")\
        .option("header", "true")\
        .option("inferSchema", "true")\
        .load(path)


ratingsData = read_csv_with_header(DATA_DIR + "/rating_edx.csv")
moviesData = read_csv_with_header(DATA_DIR + "/movies.csv")
    

### Understanding Data

#### Print Schema

In [8]:
# Print the inferred schema of each DataFrame, ratings first and then movies.
for _frame in (ratingsData, moviesData):
    _frame.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



#### Total number of Columns and Records

In [9]:
# Number of rows in the ratings DataFrame; as the cell's last expression,
# the value is displayed as the cell output.
ratingsData.count()

1048575

In [10]:
# Report the column and record counts of both DataFrames, one line per fact.
print(
    "No. of Columns in Ratings data= {}".format(len(ratingsData.columns)),
    'No. of Records in rating data= {}'.format(ratingsData.count()),
    "No. of Columns in movies data = {}".format(len(moviesData.columns)),
    'No. of Records in movies data= {}'.format(moviesData.count()),
    sep="\n",
)

No. of Columns in Ratings data= 4
No. of Records in rating data= 1048575
No. of Columns in movies data = 3
No. of Records in movies data= 27278


#### Look at the first 3 rows of each dataframe

In [11]:
# Preview the first 3 rows of the ratings DataFrame.
ratingsData.show(3)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
+------+-------+------+----------+
only showing top 3 rows



In [12]:
# Preview the first 3 rows of the movies DataFrame (long strings are shown truncated).
moviesData.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



#### Getting the count of distinct userIds and movieIds

In [13]:
def _distinct_count(frame, column):
    """Return how many distinct values `column` holds in `frame`."""
    return frame.select(column).distinct().count()


# Users and movies that appear in the ratings, then movies listed in the catalogue.
print("Number of different users: " + str(_distinct_count(ratingsData, 'userId')))
print("Number of different movies rated: " + str(_distinct_count(ratingsData, 'movieId')))
print("Number of different movies: " + str(_distinct_count(moviesData, 'movieId')))

Number of different users: 7120
Number of different movies rated: 14026
Number of different movies: 27278


#### Split the data into training and test sets (20% held out for testing)

In [14]:
# 80% of the ratings go to training and 20% are held out for testing.
# A fixed seed makes the split reproducible instead of changing on every run.
(trainingData, testData) = ratingsData.randomSplit([0.8, 0.2], seed=42)

### Model Building and Evaluation

In [15]:
from pyspark.ml.recommendation import ALS

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics
# An explicit seed fixes the estimator's random seed so repeated runs are comparable.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy='drop',
    seed=42,
)

In [16]:
# Fit the ALS estimator on the training split; the fitted model is kept in `model`.
model = als.fit(trainingData)

### Prediction on the test data


In [17]:
# Apply the fitted model to the held-out test split; the output displayed
# further down shows the result carrying an added `prediction` column.
predictions=model.transform(testData)

In [18]:
# Display up to 50 rows of the predictions DataFrame.
predictions.show(50)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|  5186|    148|   2.0| 962906606|  3.183427|
|   903|    148|   3.0| 903702432| 2.1616917|
|  3433|    148|   2.0| 940482169| 3.0820343|
|  5814|    148|   3.0| 859547410| 3.3013158|
|  6225|    148|   2.0| 842275770| 2.4905005|
|   741|    148|   4.5|1194731117|  2.435807|
|  1931|    148|   2.0| 848773886| 3.1583629|
|  5110|    463|   3.0| 862695893| 2.3977346|
|  4548|    463|   2.0| 846889607| 2.6793292|
|  3179|    471|   4.0|1076914207| 3.2690513|
|  3986|    471|   5.0|1080277641| 4.1049805|
|  1808|    471|   3.0| 846426225| 2.7743587|
|  4531|    471|   4.0| 938899399|  3.486186|
|  5375|    471|   4.0| 858636489|  3.576294|
|   101|    471|   3.0| 836325948| 3.7018466|
|  3008|    471|   2.5|1112482555| 2.4421215|
|  2313|    471|   4.0| 844329059| 3.7920914|
|  5895|    471|   5.0|1264332812| 3.0111866|
|  1372|    471|   4.0| 958144579|

### Defining the evaluator

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator
# Evaluator that compares the `rating` label with the `prediction` column using RMSE.
evaluator = RegressionEvaluator(
    labelCol="rating",
    predictionCol="prediction",
    metricName='rmse',
)

#### Evaluation on the test data

In [21]:
# Score the test-set predictions with the evaluator (configured with metricName='rmse').
rmse=evaluator.evaluate(predictions)

In [33]:
# Print the RMSE with no separator between the label and the value.
print("RMSE Error =", rmse, sep="")

RMSE Error =0.8232699598238117


In [22]:
# Display up to 100 rows of the predictions DataFrame.
# NOTE(review): this repeats the earlier predictions.show(50) with a larger
# row limit — consider keeping only one of the two cells.
predictions.show(100)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|  5186|    148|   2.0| 962906606|  3.183427|
|   903|    148|   3.0| 903702432| 2.1616917|
|  3433|    148|   2.0| 940482169| 3.0820343|
|  5814|    148|   3.0| 859547410| 3.3013158|
|  6225|    148|   2.0| 842275770| 2.4905005|
|   741|    148|   4.5|1194731117|  2.435807|
|  1931|    148|   2.0| 848773886| 3.1583629|
|  5110|    463|   3.0| 862695893| 2.3977346|
|  4548|    463|   2.0| 846889607| 2.6793292|
|  3179|    471|   4.0|1076914207| 3.2690513|
|  3986|    471|   5.0|1080277641| 4.1049805|
|  1808|    471|   3.0| 846426225| 2.7743587|
|  4531|    471|   4.0| 938899399|  3.486186|
|  5375|    471|   4.0| 858636489|  3.576294|
|   101|    471|   3.0| 836325948| 3.7018466|
|  3008|    471|   2.5|1112482555| 2.4421215|
|  2313|    471|   4.0| 844329059| 3.7920914|
|  5895|    471|   5.0|1264332812| 3.0111866|
|  1372|    471|   4.0| 958144579|

In [23]:
# Stop the Spark session created earlier in the notebook now that the analysis is done.
spark.stop()