### Create SPARK_HOME and PYLIB env vars and update the Python module search path (sys.path)

In [23]:
import glob
import os
import sys

# Location of the Spark client installation and of the Python libraries it bundles.
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"

# Put Spark's bundled Python archives at the front of the import path.
# The py4j archive name embeds its version number, which changes between Spark
# releases, so it is discovered with a glob instead of being hard-coded; this
# avoids inserting a path that does not exist.
for py4j_archive in sorted(glob.glob(os.environ["PYLIB"] + "/py4j-*-src.zip")):
    sys.path.insert(0, py4j_archive)
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

### Initializing Spark

Build __SparkConf__ object 

    Contains information about your application.  


Create __SparkContext__ object 
    
    Tells Spark how to access a cluster. 
    

Create __SparkSession__ object

    The entry point to programming Spark with the Dataset and DataFrame API.

    Used to create DataFrame, register DataFrame as tables and execute SQL over tables etc.

In [24]:
import findspark

In [25]:
# Initialise findspark with an explicit Spark installation directory.
# NOTE(review): the first cell sets SPARK_HOME to "/usr/hdp/current/spark2-client",
# which is a different location from the one passed here - confirm which
# installation is intended and keep only one of the two setups.
findspark.init("/usr/local/spark/")

In [26]:
from pyspark.sql import SparkSession

In [27]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Application settings: a readable application name and a local master.
conf = SparkConf().setAppName("Movie Recommendation Application").setMaster('local')

# getOrCreate() reuses a SparkContext that is already running in this kernel,
# so re-executing the cell does not fail because a second context is created.
sc = SparkContext.getOrCreate(conf=conf)

# Session wrapper used for the DataFrame / SQL API in the rest of the notebook.
spark = SparkSession(sc)



### Loading the dependent libraries

In [28]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

from pyspark.sql.functions import isnan, when, count, col, countDistinct


#### Problem Statement
 Building a recommender system for movies with a data set from MovieLens.


#### Data Dictionary

Ratings Data File Structure (ratings.csv)
-----------------------------------------

All ratings are contained in the file `ratings.csv`. Each line of this file after the header row represents one rating of one movie by one user, and has the following format:

    userId, movieId, rating, timestamp

The lines within this file are ordered first by userId, then, within user, by movieId.

Ratings are made on a 5-star scale, with half-star increments (0.5 stars - 5.0 stars).

Timestamps represent seconds since midnight Coordinated Universal Time (UTC) of January 1, 1970.


Movies Data File Structure (movies.csv)
---------------------------------------

Movie information is contained in the file `movies.csv`. Each line of this file after the header row represents one movie, and has the following format:

    movieId, title, genres

Genres are a pipe-separated list, and are selected from the following:

* Action
* Adventure
* Animation
* Children's
* Comedy
* Crime
* Documentary
* Drama
* Fantasy
* Film-Noir
* Horror
* Musical
* Mystery
* Romance
* Sci-Fi
* Thriller
* War
* Western
* (no genres listed)


### Reading the movies and ratings data and creating a dataframe

In [29]:
## Read data and create a dataframe
# Base location of the CSV files. Export MOVIELENS_DATA_DIR to point the
# notebook at another copy of the data instead of editing the path below;
# the default is the folder the notebook was originally run against.
DATA_DIR = os.environ.get(
    "MOVIELENS_DATA_DIR",
    "file:///Users/pavantej/Desktop/SCIT/sem2/big data/20180701_Batch39_CSE7322c_Recommendation/ml-latest-small",
)


def _read_csv(file_name):
    """Load `file_name` from DATA_DIR as a DataFrame.

    The first row is treated as the header and column types are inferred.
    """
    return (
        spark.read.format("csv")
        .option("header", "true")
        .option("inferSchema", "true")
        .load(DATA_DIR + "/" + file_name)
    )


ratingsData = _read_csv("rating_edx.csv")
moviesData = _read_csv("movies.csv")
    

### Understanding Data

#### Print Schema

In [8]:
# Show the column names and types of both dataframes: ratings first, then movies.
for frame in (ratingsData, moviesData):
    frame.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



#### Total number of Columns and Records

In [13]:
# Number of rows in the ratings dataframe.
# NOTE(review): the recorded run of this cell failed with
# IllegalArgumentException: 'Unsupported class file major version 55'.
# That presumably means the JVM in use is newer than this Spark build
# supports - confirm the Java version before re-running.
ratingsData.count()

IllegalArgumentException: 'Unsupported class file major version 55'

In [9]:
# Report the number of columns and rows of both dataframes.
# Each value is computed right before its line is printed, so the work
# happens in the same order as the printed lines.
shape_reports = (
    ("No. of Columns in Ratings data= {}", lambda: len(ratingsData.columns)),
    ('No. of Records in rating data= {}', lambda: ratingsData.count()),
    ("No. of Columns in movies data = {}", lambda: len(moviesData.columns)),
    ('No. of Records in movies data= {}', lambda: moviesData.count()),
)
for template, compute in shape_reports:
    print(template.format(compute()))

No. of Columns in Ratings data= 4


IllegalArgumentException: 'Unsupported class file major version 55'

#### Look at the first 3 rows of each dataframe

In [10]:
# Preview the first 3 rows of the ratings dataframe.
ratingsData.show(3)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
+------+-------+------+----------+
only showing top 3 rows



In [11]:
# Preview the first 3 rows of the movies dataframe (long values are truncated in the display).
moviesData.show(3)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
+-------+--------------------+--------------------+
only showing top 3 rows



#### Summary statistics

In [12]:
# Summary statistics for the columns of the ratings dataframe.
ratingsData.describe().show()

IllegalArgumentException: 'Unsupported class file major version 55'

In [15]:
# Summary statistics (count, mean, stddev, min, max) for the movies dataframe;
# mean and stddev are null for the string columns.
moviesData.describe().show()

+-------+-----------------+--------------------+------------------+
|summary|          movieId|               title|            genres|
+-------+-----------------+--------------------+------------------+
|  count|            27278|               27278|             27278|
|   mean|59855.48057042305|                null|              null|
| stddev|44429.31469707313|                null|              null|
|    min|                1|"""Great Performa...|(no genres listed)|
|    max|           131262|  ��vegtigris (2001)|           Western|
+-------+-----------------+--------------------+------------------+



Getting the count of distinct userIds and movieIds

In [16]:
# Distinct-ID counts. The movie count is taken from both dataframes and the two
# numbers can differ (not every movie in the catalogue has a rating), so each
# line names the dataframe it was computed from.
print("Number of different users: " + str(ratingsData.select('userId').distinct().count()))
print("Number of different movies (ratings data): " + str(ratingsData.select('movieId').distinct().count()))
print("Number of different movies (movies data): " + str(moviesData.select('movieId').distinct().count()))

Number of different users: 7120
Number of different movies: 14026
Number of different movies: 27278


#### Split the data into training and test sets (20% held out for testing)

In [17]:
# 80% of the ratings are used for training and 20% are held out for testing.
# A fixed seed keeps the split repeatable between runs.
(trainingData, testData) = ratingsData.randomSplit([0.8, 0.2], seed=42)

### Model Building and Evaluation

#### ALS model params


1. numBlocks is the number of blocks the users and items will be partitioned into in order to parallelize computation (defaults to 10).
2. rank is the number of latent factors in the model (defaults to 10).
3. maxIter is the maximum number of iterations to run (defaults to 10).
4. regParam specifies the regularization parameter in ALS (defaults to 0.1 in `pyspark.ml.recommendation.ALS`).
5. implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data (defaults to false which means using explicit feedback).
6. alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations (defaults to 1.0).
7. nonnegative specifies whether or not to use nonnegative constraints for least squares (defaults to false).


In [18]:
from pyspark.ml.recommendation import ALS

# Build the recommendation model using ALS on the training data.
# coldStartStrategy='drop' ensures we don't get NaN evaluation metrics.
# A fixed seed keeps the fitted model repeatable between runs.
als = ALS(
    userCol="userId",
    itemCol="movieId",
    ratingCol="rating",
    coldStartStrategy='drop',
    seed=42,
)

In [19]:
# Fit the ALS estimator on the training split.
model = als.fit(trainingData)

In [20]:
# Predicting on the test data


In [21]:
# Score the held-out test split; the result carries a "prediction" column next to each rating.
predictions=model.transform(testData)

In [22]:
# Inspect the first 50 predictions alongside the actual ratings.
predictions.show(50)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|  4948|    148|   3.0| 832312374| 2.3302832|
|  6225|    148|   2.0| 842275770| 2.3093112|
|  1931|    148|   2.0| 848773886| 2.5986013|
|   156|    463|   4.0|1038076799|  3.384677|
|  3179|    471|   4.0|1076914207| 3.3080032|
|  5173|    471|   4.0| 856799387| 3.5659912|
|  6361|    471|   3.0| 837413612|  2.989394|
|  5157|    471|   4.0|1291260775| 3.7524853|
|  4531|    471|   4.0| 938899399|  3.481783|
|  5030|    471|   3.0| 879420344| 3.1060998|
|  1296|    471|   5.0| 945829918| 3.5904336|
|   406|    471|   1.0| 834597354| 2.1695347|
|  1030|    471|   4.0|1007159126| 3.7809534|
|  5895|    471|   5.0|1264332812|   2.76644|
|   587|    471|   5.0|1077326007| 3.5009513|
|  4866|    471|   5.0| 940340020|  3.968884|
|  1210|    471|   3.0| 967063772| 3.1346464|
|  2090|    471|   5.0| 859420630| 3.9920585|
|  3912|    471|   1.0| 943223892|

### Defining the evaluator

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

# RMSE evaluator comparing the "prediction" column against the "rating" column.
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName='rmse',
)

In [42]:
# IPython help lookup for RegressionEvaluator.
# NOTE(review): interactive aid only - nothing below depends on this cell; consider removing it.
?RegressionEvaluator

#### Evaluation on the test data

In [32]:
# Evaluate the test-set predictions with the evaluator's configured metric.
rmse=evaluator.evaluate(predictions)

In [33]:
# Report the test-set error.
print("RMSE Error ={}".format(rmse))

RMSE Error =0.8232699598238117


In [34]:
# Inspect the first 100 predictions alongside the actual ratings.
predictions.show(100)

+------+-------+------+----------+----------+
|userId|movieId|rating| timestamp|prediction|
+------+-------+------+----------+----------+
|  4948|    148|   3.0| 832312374| 2.3302832|
|  6225|    148|   2.0| 842275770| 2.3093112|
|  1931|    148|   2.0| 848773886| 2.5986013|
|   156|    463|   4.0|1038076799|  3.384677|
|  3179|    471|   4.0|1076914207| 3.3080032|
|  5173|    471|   4.0| 856799387| 3.5659912|
|  6361|    471|   3.0| 837413612|  2.989394|
|  5157|    471|   4.0|1291260775| 3.7524853|
|  4531|    471|   4.0| 938899399|  3.481783|
|  5030|    471|   3.0| 879420344| 3.1060998|
|  1296|    471|   5.0| 945829918| 3.5904336|
|   406|    471|   1.0| 834597354| 2.1695347|
|  1030|    471|   4.0|1007159126| 3.7809534|
|  5895|    471|   5.0|1264332812|   2.76644|
|   587|    471|   5.0|1077326007| 3.5009513|
|  4866|    471|   5.0| 940340020|  3.968884|
|  1210|    471|   3.0| 967063772| 3.1346464|
|  2090|    471|   5.0| 859420630| 3.9920585|
|  3912|    471|   1.0| 943223892|

In [22]:
# Stop the Spark session now that the analysis is finished.
spark.stop()