# Question1

## data clean and vectorization 

#### create DataFrame

In [1]:
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.recommendation import ALS
from pyspark.sql import Row

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
sc = SparkContext('local')
spark = SparkSession(sc)

#data cleaning via RDD
lines = spark.read.text("/Users/littleostrichsnewmacbook/Desktop/bigdata-hw2/Q1/jester_ratings.dat").rdd
parts = lines.map(lambda row: row.value.split("		"))
ratingsRDD = parts.map(lambda p: Row(userId=int(p[0]), jokeId=int(p[1]),
                                     rating=float(p[2])))

In [2]:
ratings = spark.createDataFrame(ratingsRDD)
ratings.show()
#ratings.select("userId","movieId","rating").filter(ratings.rating>2).show()

+------+------+------+
|jokeId|rating|userId|
+------+------+------+
|     5| 0.219|     1|
|     7|-9.281|     1|
|     8|-9.281|     1|
|    13|-6.781|     1|
|    15| 0.875|     1|
|    16|-9.656|     1|
|    17|-9.031|     1|
|    18|-7.469|     1|
|    19|-8.719|     1|
|    20|-9.156|     1|
|    21|-7.188|     1|
|    22|-8.781|     1|
|    23|-8.531|     1|
|    24|-7.906|     1|
|    25|-7.469|     1|
|    89| 9.812|     1|
|    50| 9.906|     1|
|   102|  0.75|     1|
|   103|  -5.0|     1|
|   104| 2.938|     1|
+------+------+------+
only showing top 20 rows



#### splitting train/test datasets 

In [3]:
(training, test) = ratings.randomSplit([0.8, 0.2])

# Build the recommendation model using ALS on the training data
# Note we set cold start strategy to 'drop' to ensure we don't get NaN evaluation metrics

## Build pipeline and model 

In [4]:
#start pipeline
#Build als model and set parameters
als = ALS(maxIter=5, regParam=0.01, implicitPrefs=True,userCol="userId", itemCol="jokeId", ratingCol="rating",
          coldStartStrategy="drop")

#fit training data into model
model = als.fit(training)



## Results  

In [5]:
# Evaluate the model by computing the RMSE(root mean square error) on the test data
#transform dataframe->model->dataframe
predictions = model.transform(test)
evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating",
                                predictionCol="prediction")
rmse = evaluator.evaluate(predictions)
print("Root-mean-square error = " + str(rmse))
# Generate top 10 movie recommendations for each user
userRecs = model.recommendForAllUsers(10)
# Generate top 10 user recommendations for each movie
movieRecs = model.recommendForAllItems(10)

Root-mean-square error = 5.28366494368


In [6]:
userRecs.show(5,False)

+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userId|recommendations                                                                                                                                                              |
+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|148   |[[18,1.0576177], [53,0.94112104], [35,0.93663365], [105,0.89039695], [16,0.8419746], [19,0.8298727], [17,0.821825], [117,0.81641746], [72,0.8144705], [76,0.8115986]]        |
|463   |[[17,1.0685358], [18,0.973804], [19,0.87529767], [7,0.84893125], [16,0.79163754], [8,0.65068066], [58,0.33258885], [89,0.32694378], [99,0.291487], [75,0.28846085]]          |
|471   |[[63,0.914786], [35,0.89350754], [15,0.87503934], [105,0.87342507], [97,0.863