# Getting Started with PySpark in Google Colab

PySpark is the Python interface for Apache Spark. Its primary use cases are working with huge amounts of data and building data pipelines.

You don't need to work with big data to benefit from PySpark. I find that Spark SQL is a great tool for performing routine data analysis. Pandas can get slow, and you may find yourself writing a lot of code for data cleaning, whereas the same actions take much less code in SQL. Let's get started!

See more here! http://spark.apache.org/docs/latest/api/python/

# 1. Installing PySpark in Google Colab

In [None]:
# Refresh the apt package index, then install a headless Java 8 JDK quietly
# (the install's stdout is discarded).
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link: https://www.apache.org/dyn/closer.lua/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
# NOTE(review): SPARK_HOME is commented out below and the version cell reports
# pyspark 3.5.4, so this 3.2.1 tarball appears to be unused -- confirm before
# removing the download/extract steps.
!wget -q https://dlcdn.apache.org/spark/spark-3.2.1/spark-3.2.1-bin-hadoop3.2.tgz
!tar xf spark-3.2.1-bin-hadoop3.2.tgz
# Python-side dependencies: findspark, PySpark with its py4j bridge, and the
# `recommenders` package whose utilities are imported below.
!pip install -q findspark
!pip install pyspark
!pip install py4j
!pip install recommenders

import os
import sys
# Environment overrides kept for reference; they are currently disabled.
# os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# os.environ["SPARK_HOME"] = "/content/spark-3.2.1-bin-hadoop3.2"


# Let findspark locate the Spark installation. The value returned by find()
# is not used here.
import findspark
findspark.init()
findspark.find()

import sys
import pyspark
from pyspark.ml.recommendation import ALS
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import StringType, FloatType, IntegerType, LongType
from pyspark.ml.feature import StringIndexer

from recommenders.utils.timer import Timer
from recommenders.datasets import movielens
from recommenders.utils.notebook_utils import is_jupyter
from recommenders.datasets.spark_splitters import spark_random_split
from recommenders.evaluation.spark_evaluation import SparkRatingEvaluation, SparkRankingEvaluation
from recommenders.utils.spark_utils import start_or_get_spark

[33m0% [Working][0m            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
22 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sources' as repository 

In [None]:
!pip install -q handyspark

In [None]:
# Report the interpreter and PySpark versions this notebook ran with.
python_version = sys.version
spark_version = pyspark.__version__
print(f"System version: {python_version}")
print(f"Spark version: {spark_version}")

System version: 3.11.11 (main, Dec  4 2024, 08:55:07) [GCC 11.4.0]
Spark version: 3.5.4


In [None]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
# NOTE(review): nothing else in the visible notebook reads from this mount --
# confirm whether this cell is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. Set up parameters

In [None]:
# Number of items to recommend per user; also passed as `k` to the ranking
# evaluation.
TOP_K = 10

# Dataset-size key; decides how many rating rows are kept when the CSV is
# loaded into Spark.
DATA_SIZE = "100k"

# Column names of the raw ratings data.
COL_USER, COL_ITEM = "user", "business"
COL_RATING, COL_TIMESTAMP = "rating", "timestamp"

## 3. Get datasets

In [None]:
import os
import requests
import gzip
import shutil

# Archives to fetch, keyed by the local file name they are saved under.
urls = {
    "rating-New_York.csv.gz": "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/rating-New_York.csv.gz",
    "meta-New_York.json.gz": "https://datarepo.eng.ucsd.edu/mcauley_group/gdrive/googlelocal/meta-New_York.json.gz",
}

for filename, url in urls.items():
    if not os.path.isfile(filename):
        print(f"Downloading {filename}...")

        # Stream into a ".part" file and rename only once the transfer has
        # finished, so an interrupted download never leaves a truncated
        # archive that a later run would mistake for a complete one.
        partial_path = filename + ".part"
        # (connect timeout, read timeout) in seconds, so a stalled server
        # cannot hang the notebook indefinitely.
        with requests.get(url, stream=True, timeout=(10, 60)) as r:
            r.raise_for_status()
            with open(partial_path, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filename)
        print(f"{filename} downloaded successfully.")
    else:
        print(f"{filename} already exists.")

    # Decompress whenever the extracted file is missing, not only right after
    # a fresh download: the archive may already exist from an earlier run
    # whose extraction never happened or was interrupted.
    extracted_path = filename[:-3]  # strip the ".gz" suffix
    if not os.path.isfile(extracted_path):
        partial_path = extracted_path + ".part"
        with gzip.open(filename, 'rb') as f_in, open(partial_path, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)
        os.replace(partial_path, extracted_path)


Downloading rating-New_York.csv.gz...
rating-New_York.csv.gz downloaded successfully.
Downloading meta-New_York.json.gz...
meta-New_York.json.gz downloaded successfully.


## 3. Check with pandas

In [None]:
import pandas as pd

# Read both ID columns explicitly as text instead of relying on type
# inference: user IDs are 21-digit numbers that do not fit in a 64-bit
# integer, and business IDs are "0x...:0x..." strings.
df = pd.read_csv("rating-New_York.csv", dtype={"business": str, "user": str})
df.head()

Unnamed: 0,business,user,rating,timestamp
0,0x89c24469c758686b:0x641f5b84cb9bedfa,101855823232666695168,1,1629141186463
1,0x89c24469c758686b:0x641f5b84cb9bedfa,105821946869087882225,1,1528477593994
2,0x89c24469c758686b:0x641f5b84cb9bedfa,108990883320903443748,1,1424830512308
3,0x89c24469c758686b:0x641f5b84cb9bedfa,117021514778630212205,5,1512641660497
4,0x89c25fc9494dce47:0x6d63c807b59a55,113722104692308235141,5,1603494795361


In [None]:
df.isnull().sum()

Unnamed: 0,0
business,0
user,0
rating,0
timestamp,0


# 3. Build PySpark DataFrames

In [None]:
# Create (or reuse) the Spark session for this notebook: Kryo serialization
# with a 2047m maximum buffer, and the ambiguous-self-join analyzer check
# turned off.
spark = (
    SparkSession.builder
    .appName("ALS PySpark: New York Reco")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryoserializer.buffer.max", "2047m")
    .config("spark.sql.analyzer.failAmbiguousSelfJoin", "false")
    .getOrCreate()
)


In [None]:
# Schema for the ratings CSV, with user and business IDs kept as strings.
# An explicit schema is applied to the file's columns by position (the header
# row is skipped, not used to match names), so the fields must be listed in
# the file's own column order: business, user, rating, timestamp. Listing
# "user" first would load business IDs into the "user" column and vice versa.
schema = StructType([
    StructField("business", StringType(), nullable=False),
    StructField("user", StringType(), nullable=False),
    StructField("rating", IntegerType(), nullable=False),
    StructField("timestamp", LongType(), nullable=False),
])

# Read the CSV file with the specified schema
data = spark.read.csv("rating-New_York.csv", schema=schema, header=True)

# Row cap for each supported DATA_SIZE key; any other value keeps every row.
row_limits = {
    "100k": 100_000,
    "1m": 1_000_000,
    "10m": 10_000_000,
    "20m": 20_000_000,
}
row_limit = row_limits.get(DATA_SIZE)
if row_limit is not None:
    data = data.limit(row_limit)

# ALS needs numeric user/item IDs, so give every distinct string ID a unique
# integer. zipWithUniqueId guarantees uniqueness but not consecutive values.
unique_users = data.select("user").distinct().rdd.map(lambda row: row[0])
unique_businesses = data.select("business").distinct().rdd.map(lambda row: row[0])

user_mapping = unique_users.zipWithUniqueId().toDF(["user", "user_id"])
business_mapping = unique_businesses.zipWithUniqueId().toDF(["business", "business_id"])

# Attach the integer IDs to every rating row.
data = data.join(user_mapping, on="user").join(business_mapping, on="business")

In [None]:

# Keep just the integer user ID, integer business ID and rating -- the
# columns the ALS model is configured to read.
als_columns = ["user_id", "business_id", "rating"]
als_data = data.select(*als_columns)

# Preview the first five rows of the reduced frame.
als_data.show(5)

+-------+-----------+------+
|user_id|business_id|rating|
+-------+-----------+------+
|      1|         28|     4|
|      1|         28|     4|
|     20|        214|     5|
|     20|        214|     5|
|     49|        576|     4|
+-------+-----------+------+
only showing top 5 rows



## 4. Exploratory Data Analysis

In [None]:
# Amount of rows (given by DATA_SIZE)

als_data.count()

100000

In [None]:
# General characteristics of the dataset

als_data.describe().show()

+-------+------------------+------------------+------------------+
|summary|           user_id|       business_id|            rating|
+-------+------------------+------------------+------------------+
|  count|            100000|            100000|            100000|
|   mean|        1546.92383|       27699.35974|           4.39017|
| stddev|1039.4558938471982|19223.577995673993|1.2897030683223474|
|    min|                 0|                 0|                 1|
|    max|              3632|             66770|                 5|
+-------+------------------+------------------+------------------+



In [None]:
# Visualize dataset

# handy_data = als_data.toHandy()

## 5. Split data into train and test sets using Spark's built-in features


In [None]:
# Random 75/25 split of the rating rows with a fixed seed.
train, test = spark_random_split(data, ratio=0.75, seed=42)

# Cache each split and count it; the count is what gets printed.
for split_label, split_frame in (("N train", train), ("N test", test)):
    print(split_label, split_frame.cache().count())

N train 75067
N test 24933


## 6. Train ALS model and get top k recommendations

In [None]:
# Column bindings handed to ALS: integer user ID, integer item ID, rating.
header = dict(
    userCol="user_id",
    itemCol="business_id",
    ratingCol=COL_RATING,
)

# Explicit-feedback ALS estimator. coldStartStrategy='drop' is set so that
# rows the model cannot score are dropped from its predictions.
als = ALS(
    **header,
    rank=10,
    maxIter=15,
    regParam=0.05,
    implicitPrefs=False,
    nonnegative=False,
    coldStartStrategy='drop',
    seed=43,
)

In [None]:
# Fit the ALS estimator on the training split inside a Timer context.
with Timer() as train_time:
    model = als.fit(train)

# The elapsed time is read once the timed block has finished.
train_seconds = train_time.interval
print(f"Took {train_seconds} seconds to train the model...")

Took 42.35550659600017 seconds to train the model...


## 5. Run predictions on the test set

In [None]:
from pyspark.sql.functions import col, explode

# Users that appear in the test split; recommendations are generated for them.
test_users = test.select("user_id").distinct()
# Recommend exactly as many items as the ranking evaluation inspects (k=TOP_K),
# so changing TOP_K keeps generation and evaluation in sync.
top_n = TOP_K

with Timer() as test_time:
    # recommendForUserSubset yields one row per user holding an array of
    # (business_id, rating) structs; explode it to one row per recommendation
    # and lift the struct fields into top-level columns.
    recommendations = (
        model.recommendForUserSubset(test_users, top_n)
        .select("user_id", explode("recommendations").alias("rec"))
        .select("user_id", col("rec.business_id"), col("rec.rating"))
    )

    # Cache and count inside the timed block so the recommendations are
    # actually computed while the timer is running.
    recommendations.cache().count()

print(f"Took {test_time.interval} seconds to generate recommendations...")

Took 41.7400340669999 seconds to generate recommendations...


In [None]:
recommendations.show(5)

+-------+-----------+---------+
|user_id|business_id|   rating|
+-------+-----------+---------+
|   1580|      27780|4.9765615|
|   1580|      27779|4.9765615|
|   1580|      50818|4.2616773|
|   1580|      50816|4.2616773|
|   1580|      46652| 4.124104|
+-------+-----------+---------+
only showing top 5 rows



## 6. Evaluate the model

In [None]:
# NOTE: from here on the name `round` refers to the Spark column function,
# not Python's builtin; the rating-evaluation cell below calls it by this name.
from pyspark.sql.functions import round

# Overwrite each predicted score with its nearest whole number.
rounded_score = round(recommendations["rating"])
recommendations = recommendations.withColumn("rating", rounded_score)

# Display the rounded recommendations (show() prints the first 20 rows).
recommendations.show()

+-------+-----------+------+
|user_id|business_id|rating|
+-------+-----------+------+
|   1580|      27780|   5.0|
|   1580|      27779|   5.0|
|   1580|      50818|   4.0|
|   1580|      50816|   4.0|
|   1580|      46652|   4.0|
|   1580|      37703|   4.0|
|   1580|      39401|   4.0|
|   1580|      39399|   4.0|
|   1580|      39398|   4.0|
|   1580|      39397|   4.0|
|    471|      60526|   5.0|
|    471|       8100|   5.0|
|    471|       8099|   5.0|
|    471|       8098|   5.0|
|    471|       8097|   5.0|
|    471|       8096|   5.0|
|    471|       8095|   5.0|
|    471|       8094|   5.0|
|    471|       8093|   5.0|
|    471|       8091|   5.0|
+-------+-----------+------+
only showing top 20 rows



In [None]:
# Ranking evaluation of the recommendations against the test split. Both
# frames store their score in a column called "rating", so the same name is
# passed for the true rating and for the prediction.
rank_eval = SparkRankingEvaluation(
    test,
    recommendations,
    k=TOP_K,
    col_user="user_id",
    col_item="business_id",
    col_rating="rating",
    col_prediction="rating",
    relevancy_method="top_k",
)

# One metric per line, tab-separated label and value.
ranking_report = [
    "Model:\tALS",
    f"Top K:\t{rank_eval.k:d}",
    f"MAP:\t{rank_eval.map_at_k():f}",
    f"NDCG:\t{rank_eval.ndcg_at_k():f}",
    f"Precision@K:\t{rank_eval.precision_at_k():f}",
    f"Recall@K:\t{rank_eval.recall_at_k():f}",
]
print("\n".join(ranking_report))



Model:	ALS
Top K:	10
MAP:	0.136637
NDCG:	0.194354
Precision@K:	0.111949
Recall@K:	0.211849


In [None]:
test.show(5)

+--------------------+--------------------+------+-------------+-------+-----------+
|            business|                user|rating|    timestamp|user_id|business_id|
+--------------------+--------------------+------+-------------+-------+-----------+
|10012214140755166...|0x89c2f4603275f4a...|     4|1630773051592|    109|       1529|
|10015700120420716...|0x89c2e7f5b0153be...|     3|1548754275402|   2356|      62379|
|10022578938945580...|0x89de74f1bc12ba4...|     1|1389805967254|   3302|      55551|
|10024985179954856...|0x89c28a937f8e1b7...|     5|1630964842884|   1504|      61046|
|10068590924214935...|0x89d1313cef0322d...|     4|1623701630840|    318|       4820|
+--------------------+--------------------+------+-------------+-------+-----------+
only showing top 5 rows



## 7. Evaluate the predictions

In [None]:
# Score every (user, business) pair of the test split with the trained model.
prediction = model.transform(test).cache()
# Round the predictions to the nearest whole number. F.round is referenced
# explicitly so this cell does not depend on an earlier cell having replaced
# Python's builtin round with the Spark function; the builtin would raise a
# TypeError when given a column name.
prediction = prediction.withColumn("prediction", F.round("prediction"))

prediction.show()

# Rating metrics: compare the true "rating" with the rounded "prediction".
rating_eval = SparkRatingEvaluation(
    test,
    prediction,
    col_user="user_id",
    col_item="business_id",
    col_rating="rating",
    col_prediction="prediction",
)

print("Model:\tALS rating prediction",
      f"RMSE:\t{rating_eval.rmse():f}",
      f"MAE:\t{rating_eval.mae():f}",
      f"Explained variance:\t{rating_eval.exp_var():f}",
      f"R squared:\t{rating_eval.rsquared():f}", sep='\n')

+-------+-----------+------+
|user_id|business_id|rating|
+-------+-----------+------+
|   1580|      27780|   5.0|
|   1580|      27779|   5.0|
|   1580|      50818|   4.0|
|   1580|      50816|   4.0|
|   1580|      46652|   4.0|
|   1580|      37703|   4.0|
|   1580|      39401|   4.0|
|   1580|      39399|   4.0|
|   1580|      39398|   4.0|
|   1580|      39397|   4.0|
|    471|      60526|   5.0|
|    471|       8100|   5.0|
|    471|       8099|   5.0|
|    471|       8098|   5.0|
|    471|       8097|   5.0|
|    471|       8096|   5.0|
|    471|       8095|   5.0|
|    471|       8094|   5.0|
|    471|       8093|   5.0|
|    471|       8091|   5.0|
+-------+-----------+------+
only showing top 20 rows

+--------------------+--------------------+------+-------------+-------+-----------+----------+
|            business|                user|rating|    timestamp|user_id|business_id|prediction|
+--------------------+--------------------+------+-------------+-------+-----------+--



Model:	ALS rating prediction
RMSE:	1.295144
MAE:	0.328726
Explained variance:	0.034185
R squared:	-0.030969
