# Machine learning

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.2.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 43 kB/s 
[?25hCollecting py4j==0.10.9.3
  Downloading py4j-0.10.9.3-py2.py3-none-any.whl (198 kB)
[K     |████████████████████████████████| 198 kB 47.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.2.1-py2.py3-none-any.whl size=281853642 sha256=70577c2e17dbbb35749d551f7bb5eee2f2de58039da1e23267d7fe24f26aa2dc
  Stored in directory: /root/.cache/pip/wheels/9f/f5/07/7cd8017084dce4e93e84e92efd1e1d5334db05f2e83bcef74f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.3 pyspark-3.2.1


In [None]:
# Mount Google Drive inside the Colab VM at /content/drive.
# NOTE(review): the ratings file below is read from /content/ratings.csv, which is
# not under this mount point — confirm whether the mount is still required.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import json
from pyspark import SparkConf, SparkContext

# Build the same configuration as before (local master, app name "Finalproject"),
# but obtain the context through getOrCreate: constructing a second SparkContext
# in the same process raises an error, so the plain constructor made this cell
# fail whenever it was executed again without restarting the kernel.
spark_conf = SparkConf().setMaster("local").setAppName("Finalproject")
sc = SparkContext.getOrCreate(spark_conf)


In [None]:
from pyspark.sql import SparkSession

# Obtain the session through the builder, the documented entry point: it attaches
# to the SparkContext that is already running and hands back the existing session
# when this cell is executed again, instead of constructing a new one each time.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the ratings file with the default CSV options: no header and no schema
# are supplied, so the columns come back as strings named _c0, _c1, _c2.
ratings_path = "/content/ratings.csv"
df = spark.read.format("csv").load(ratings_path)
df.printSchema()


root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)
 |-- _c2: string (nullable = true)



In [None]:
from pyspark.sql.functions import col

# Positional CSV column -> meaningful name, in the original column order.
column_aliases = {"_c0": "book_id", "_c1": "user_id", "_c2": "rating"}
df = df.select([col(source).alias(target) for source, target in column_aliases.items()])

In [None]:
# Preview the renamed columns (show() prints only the first 20 rows).
df.show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    314|     5|
|      1|    439|     3|
|      1|    588|     5|
|      1|   1169|     4|
|      1|   1185|     4|
|      1|   2077|     4|
|      1|   2487|     4|
|      1|   2900|     5|
|      1|   3662|     4|
|      1|   3922|     5|
|      1|   5379|     5|
|      1|   5461|     3|
|      1|   5885|     5|
|      1|   6630|     5|
|      1|   7563|     3|
|      1|   9246|     1|
|      1|  10140|     4|
|      1|  10146|     5|
|      1|  10246|     4|
|      1|  10335|     4|
+-------+-------+------+
only showing top 20 rows



In [None]:
# Work on a ~10% sample of the ratings. The seed is fixed so that re-running the
# notebook on the same input draws the same sample and downstream metrics are comparable.
r = df.sample(fraction=0.1, seed=42)

In [None]:
from pyspark.sql.types import IntegerType

# The CSV was read as strings; convert every column of the sample to integers.
for column_name in ("book_id", "user_id", "rating"):
    r = r.withColumn(column_name, r[column_name].cast(IntegerType()))

In [None]:
# Preview the sampled ratings after the integer casts (first 20 rows).
r.show()

+-------+-------+------+
|book_id|user_id|rating|
+-------+-------+------+
|      1|    588|     5|
|      1|   6630|     5|
|      1|  20076|     3|
|      1|  24326|     5|
|      1|  25164|     4|
|      1|  33697|     4|
|      2|   6630|     5|
|      2|  10751|     3|
|      2|  11692|     3|
|      2|  11868|     5|
|      2|  16913|     2|
|      2|  17643|     1|
|      2|  19526|     4|
|      2|  32305|     5|
|      2|  46421|     5|
|      2|  49298|     5|
|      2|  50104|     5|
|      2|  53292|     5|
|      3|   5885|     4|
|      3|   9246|     1|
+-------+-------+------+
only showing top 20 rows



In [None]:
# 80/20 train/test split. The seed is fixed so the split (and therefore the
# reported RMSE) does not change from one run to the next on the same input.
training, test = r.randomSplit([0.8, 0.2], seed=42)

In [None]:
import sys
import itertools
from math import sqrt
from operator import add
from os.path import join, isfile, dirname

from pyspark import SparkConf, SparkContext
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Fit an ALS collaborative-filtering model on the training split.
# user_id is the user column and book_id is the item column, so that
# recommendForAllUsers yields items (books) per user and recommendForAllItems
# yields users per book. Passing them the other way round makes the model treat
# books as users and keys the recommendation outputs by the wrong id.
als = ALS(
    maxIter=5,
    regParam=0.01,
    userCol="user_id",
    itemCol="book_id",
    ratingCol="rating",
    # Drop rows whose prediction is NaN (ids not seen during training) so the
    # evaluation metric stays defined.
    coldStartStrategy="drop",
    # Fixed seed so the fitted factors are repeatable across runs.
    seed=42,
)
model = als.fit(training)

In [None]:
# Score the held-out split and report the RMSE between the "rating" label
# and the model's "prediction" column.
predictions = model.transform(test)
evaluator = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="rating",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"Root-mean-square error = {rmse}")

# Top 10 recommendations for every id in the model's user column
userRecs = model.recommendForAllUsers(10)
# Top 10 recommendations for every id in the model's item column
movieRecs = model.recommendForAllItems(10)

Root-mean-square error = 4.634064010046933




In [None]:
# Preview the output of recommendForAllItems: one row per id in the model's
# item column, each with its list of 10 recommendations (first 20 rows).
movieRecs.show()

+-------+--------------------+
|user_id|     recommendations|
+-------+--------------------+
|      1|[{1488, 6.975687}...|
|      6|[{75, 10.989511},...|
|      7|[{75, 15.115718},...|
|      8|[{1185, 8.577438}...|
|     10|[{3135, 7.9691377...|
|     11|[{822, 11.474008}...|
|     18|[{940, 9.425072},...|
|     20|[{621, 9.671779},...|
|     22|[{3135, 10.06607}...|
|     23|[{172, 7.9582667}...|
|     24|[{360, 19.030195}...|
|     25|[{1255, 10.105552...|
|     27|[{80, 13.194258},...|
|     29|[{3135, 9.248354}...|
|     35|[{664, 14.661599}...|
|     36|[{430, 8.168689},...|
|     41|[{621, 16.583927}...|
|     42|[{787, 13.150075}...|
|     43|[{430, 9.160945},...|
|     45|[{1471, 12.446703...|
+-------+--------------------+
only showing top 20 rows

