In [1]:
import findspark

In [2]:
# Run findspark's initialisation before the `import pyspark` in the next cell.
# NOTE(review): presumably this locates the local Spark installation and makes
# `pyspark` importable — confirm against the findspark documentation.
findspark.init()

In [3]:
import pyspark
import random

## Calculate PI

In [4]:
# Number of random trials fed to the Pi estimate below.
num_samples = 100_000_000
# Dedicated SparkContext for this example; it is stopped at the end of the cell.
sc = pyspark.SparkContext(appName="Pi")

def inside(p):
    """Run one random trial for the Pi estimate.

    Draws a point uniformly from the unit square [0, 1) x [0, 1) and reports
    whether it lies strictly inside the unit circle centred at the origin.

    Args:
        p: Ignored. Present only because the function is used as a filter
            predicate, which is called with one element per trial.

    Returns:
        True when x^2 + y^2 < 1 for the sampled point, otherwise False.
    """
    x = random.random()
    y = random.random()
    squared_distance = x * x + y * y
    return squared_distance < 1

try:
    # The range elements are only trial counters: `inside` ignores its argument
    # and draws a fresh random point each time, so `count` is the number of
    # trials whose point landed inside the unit circle.
    count = sc.parallelize(range(num_samples)).filter(inside).count()

    # The quarter circle covers pi/4 of the unit square, so the hit ratio
    # scaled by 4 approximates pi.
    pi = 4 * count / num_samples
    print(pi)
finally:
    # Always release the SparkContext, even when the job above fails.
    # Otherwise the context stays alive and re-running this cell (which
    # constructs a new SparkContext) would fail.
    sc.stop()

3.14155836


## Recommendation System (ALS model) on PySpark

dataset: MovieLens 20M (the cells below load the smaller `movielens-lite` copy)

https://www.kaggle.com/grouplens/movielens-20m-dataset
https://www.kaggle.com/jneupane12/movielens

In [5]:
# List the files available in the `movielens` dataset directory.
! ls /data/datasets/movielens/

genome_scores.csv  genome_tags.csv  link.csv  movie.csv  rating.csv  tag.csv


In [6]:
# List the files in the `movielens-lite` directory; ratings.csv and movies.csv
# from here are the files read in the "load data" cell.
! ls /data/datasets/movielens-lite/

links.csv  movies.csv  ratings.csv  README.txt	tags.csv


In [7]:
from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS

In [8]:
# Get (or reuse) the Spark session for the recommender section, named
# "movielens-reco" and configured with spark.executor.memory = 8g.
spark = SparkSession.builder.appName("movielens-reco").config(
    "spark.executor.memory", "8g"
).getOrCreate()

#### load data

In [9]:
# Both files are read the same way: first row is the header and column types
# are inferred from the data.
csv_options = {"header": True, "inferSchema": True}
ratings = spark.read.csv('/data/datasets/movielens-lite/ratings.csv', **csv_options)
movies = spark.read.csv('/data/datasets/movielens-lite/movies.csv', **csv_options)

# Quick look at ratings enriched with movie titles/genres via the shared key.
rated_movies = ratings.join(movies, on="movieId")
rated_movies.show(3)

+-------+------+------+----------+--------------------+--------------------+
|movieId|userId|rating| timestamp|               title|              genres|
+-------+------+------+----------+--------------------+--------------------+
|      2|     1|   3.5|1112486027|      Jumanji (1995)|Adventure|Childre...|
|     29|     1|   3.5|1112484676|City of Lost Chil...|Adventure|Drama|F...|
|     32|     1|   3.5|1112484819|Twelve Monkeys (a...|Mystery|Sci-Fi|Th...|
+-------+------+------+----------+--------------------+--------------------+
only showing top 3 rows



#### prepare data

In [10]:
# Keep only the three columns the model needs; timestamp is dropped.
data = ratings.select("userId", "movieId", "rating")

# 70/30 train/test split. The seed is fixed so that, for the same input data
# and partitioning, re-running the notebook yields the same split instead of
# a different one on every execution.
SPLIT_SEED = 42
traindf, testdf = data.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

# Give the target column a distinct name in each frame: "label" is what the
# model trains on, "trueLabel" is what predictions are compared against.
traindf = traindf.withColumnRenamed("rating", "label")
testdf = testdf.withColumnRenamed("rating", "trueLabel")

# calculate number of rows
train_rows = traindf.count()
test_rows = testdf.count()
print(f"training data rows: {train_rows}, testing data row: {test_rows}")

training data rows: 734237, testing data row: 314338


In [11]:
# Peek at the first three training rows to confirm the renamed "label" column.
traindf.show(3)

+------+-------+-----+
|userId|movieId|label|
+------+-------+-----+
|     1|     29|  3.5|
|     1|     47|  3.5|
|     1|    112|  3.5|
+------+-------+-----+
only showing top 3 rows



In [12]:
# Print the column names, inferred types and nullability of the training frame.
traindf.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- label: double (nullable = true)



#### training model

In [13]:
# ALS settings: which columns hold user ids, item ids and ratings, plus the
# iteration cap and regularisation strength.
als_params = {
    "userCol": "userId",
    "itemCol": "movieId",
    "ratingCol": "label",
    "maxIter": 20,
    "regParam": 0.01,
}
als = ALS(**als_params)

# Fit the recommender on the training split.
model = als.fit(traindf)

In [14]:
# Print the fitted model's parameter documentation, including default and
# current values (e.g. coldStartStrategy defaults to "nan").
print(model.explainParams())

coldStartStrategy: strategy for dealing with unknown or new users/items at prediction time. This may be useful in cross-validation or production scenarios, for handling user/item ids the model has not seen in the training data. Supported values: nan,drop. (default: nan)
itemCol: column name for item ids. Ids must be within the integer value range. (default: item, current: movieId)
predictionCol: prediction column name (default: prediction)
userCol: column name for user ids. Ids must be within the integer value range. (default: user, current: userId)


In [15]:
# Display the rank of the fitted model; the ALS call above did not set it.
model.rank

10

#### Predict

In [16]:
# Score the held-out split: the result keeps userId/movieId/trueLabel and adds
# a "prediction" column.
prediction = model.transform(testdf)

In [17]:
# Show three predictions next to their true ratings.
prediction.show(3)

+------+-------+---------+----------+
|userId|movieId|trueLabel|prediction|
+------+-------+---------+----------+
|  5585|    148|      3.0|  3.874677|
|  3673|    148|      2.0| 2.9365218|
|  3439|    148|      1.0| 2.4671319|
+------+-------+---------+----------+
only showing top 3 rows



In [18]:
# Predicted vs. true ratings, with movie titles, for one sample user (id 1259).
user_predictions = (
    prediction
    .join(movies, "movieId")
    .select("userId", "title", "prediction", "trueLabel")
    .filter("userId==1259")
)
user_predictions.show(n=10, truncate=False)

+------+---------------------------------------------------+----------+---------+
|userId|title                                              |prediction|trueLabel|
+------+---------------------------------------------------+----------+---------+
|1259  |Jerky Boys, The (1995)                             |1.8968399 |1.0      |
|1259  |Shallow Grave (1994)                               |4.554488  |5.0      |
|1259  |Lord of Illusions (1995)                           |3.3361583 |2.0      |
|1259  |Jade (1995)                                        |4.228554  |1.0      |
|1259  |Bullets Over Broadway (1994)                       |4.8040743 |4.0      |
|1259  |City Slickers II: The Legend of Curly's Gold (1994)|2.852591  |1.0      |
|1259  |Outbreak (1995)                                    |3.6788158 |2.0      |
|1259  |Desperado (1995)                                   |3.933239  |2.0      |
|1259  |Little Odessa (1994)                               |3.4827013 |4.0      |
|1259  |Tombston

#### Evaluate

In [19]:
from pyspark.ml.evaluation import RegressionEvaluator

# With the default coldStartStrategy ("nan"), users or movies the model never
# saw in training get a NaN prediction; drop those rows before scoring.
cleanPrediction = prediction.dropna(how="any", subset=["prediction"])

# Root-mean-square error between the true ratings and the model's predictions.
evaluator = RegressionEvaluator(
    metricName="rmse",
    labelCol="trueLabel",
    predictionCol="prediction",
)
rmse = evaluator.evaluate(cleanPrediction)
rmse

0.9041674530778843

In [20]:
# Shut down the Spark session now that the analysis is finished.
spark.stop()