In [4]:
# Using Spark ML to Produce Movie Recommendations

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, LongType
from pyspark.ml.recommendation import ALS
import codecs

def load_movie_names(path="C:/SparkCourse/ml-100k/u.ITEM"):
    """Build a movie-ID -> title lookup from a pipe-delimited item file.

    Args:
        path: Location of the item file. Defaults to the path this script
            has always used, so zero-argument callers keep working.

    Returns:
        dict mapping each integer movie ID (first field of a line) to the
        movie title (second field).

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a non-blank line's first field is not an integer.
    """
    movie_names = {}
    # Built-in open() breaks lines only on \n, \r and \r\n. codecs.open()
    # iterates with str.splitlines() semantics, which also treats characters
    # such as \x85 (a legal ISO-8859-1 byte) as line endings and would cut a
    # record in half. ISO-8859-1 maps every byte, so no error handler is needed.
    with open(path, "r", encoding="ISO-8859-1") as f:
        for line in f:
            fields = line.rstrip("\r\n").split("|")
            if len(fields) < 2:
                # Blank or truncated line: there is no ID/title pair to store.
                continue
            movie_names[int(fields[0])] = fields[1]
    return movie_names

def main():
    """Train an ALS recommender on the ml-100k ratings and print one user's top 10.

    Reads the tab-separated ratings file into a DataFrame, fits an ALS model
    on (userID, movieID, rating), asks the model for 10 recommendations for a
    single hard-coded user, and prints each recommended title with its
    predicted rating. The Spark session is always stopped before returning,
    including when any step raises.
    """
    # Initialize Spark session
    spark = SparkSession.builder.appName("ALSMovie").getOrCreate()

    try:
        # Define schema for the ratings data
        movies_schema = StructType([
            StructField("userID", IntegerType(), True),
            StructField("movieID", IntegerType(), True),
            StructField("rating", IntegerType(), True),
            StructField("timestamp", LongType(), True)
        ])

        # Load movie names and ratings data
        movie_names = load_movie_names()
        ratings = spark.read \
            .option("sep", "\t") \
            .schema(movies_schema) \
            .csv("file:///SparkCourse/ml-100k/u.data")

        print("Training recommendation model...")

        # Configure and train the ALS model
        als = ALS(maxIter=5, regParam=0.01, userCol="userID", itemCol="movieID", ratingCol="rating")
        model = als.fit(ratings)

        # Build a one-row DataFrame holding the user to recommend for
        user_id = 6
        user_schema = StructType([StructField("userID", IntegerType(), True)])
        users = spark.createDataFrame([[user_id]], user_schema)

        # Generate recommendations for the specified user
        recommendations = model.recommendForUserSubset(users, 10).collect()

        if not recommendations:
            # The model returns no row for a user it has no factors for.
            print(f"No recommendations available for user ID {user_id}.")
            return

        print(f"Top 10 recommendations for user ID {user_id}:")

        # Each result row is (userID, recommendations); every recommendation
        # is a struct whose fields are named after itemCol ("movieID") and
        # "rating", so read them by name rather than by position.
        for user_recs in recommendations:
            for rec in user_recs["recommendations"]:
                movie_id = rec["movieID"]
                rating = rec["rating"]
                movie_name = movie_names.get(movie_id, "Unknown Movie")
                print(f"{movie_name}: {rating}")
    finally:
        # Release the session's resources even if loading or training failed.
        spark.stop()

# Run the recommendation job only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Training recommendation model...
Top 10 recommendations for user ID 6:
Angel Baby (1995): 6.486196517944336
Mina Tannenbaum (1994): 5.871776103973389
Old Man and the Sea, The (1958): 5.560771465301514
Underneath, The (1995): 5.533425807952881
Love and Death on Long Island (1997): 5.529603958129883
Boys, Les (1997): 5.509486198425293
In the Bleak Midwinter (1995): 5.403690338134766
Shooting Fish (1997): 5.364198684692383
Umbrellas of Cherbourg, The (Parapluies de Cherbourg, Les) (1964): 5.194911479949951
Paradise Lost: The Child Murders at Robin Hood Hills (1996): 5.178156852722168
