**All in one: it should take more time than showing recommendations directly + UI: dynamic implementation + poster API**

In [1]:
import html
import os

import numpy as np
import requests
from IPython.display import display, HTML
from pyspark.ml.functions import vector_to_array
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType





# Create (or reuse) the Spark session for this notebook, running on the
# local[*] master.
spark = (
    SparkSession.builder
    .appName("MovieGenreBinaryVectorization")
    .master("local[*]")
    .getOrCreate()
)



# Explicit schema applied when reading ratings.csv; every column is nullable.
ratingsSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("userID", IntegerType()),
        ("movieID", IntegerType()),
        ("rating", FloatType()),
    )
])

# Explicit schema applied to both movies.csv and the derived
# movies_with_genre_vectors.csv; every column is nullable.
moviesSchema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("movieID", IntegerType()),
        ("movieTitle", StringType()),
        ("genres", StringType()),
        ("preproceesedStr", StringType()),
        ("genreVector", StringType()),
    )
])


# Ordered genre vocabulary: position i in this list is dimension i of every
# genre vector built by genre_to_vector.
validGenres = [
    "Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary",
    "Drama", "Fantasy", "Film-Noir", "Horror", "IMAX", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Thriller", "War", "Western"
]

# Read the raw movie catalogue with the explicit schema.
movies = (
    spark.read
    .option("charset", "ISO-8859-1")
    .schema(moviesSchema)
    .csv("movies.csv")
)

# The "genres" column is pipe-delimited; the same split feeds both the
# comma-joined display string and the list consumed by the vectoriser.
genre_tokens = F.split(F.col("genres"), "\\|")
movies = (
    movies
    .withColumn("preproceesed", genre_tokens)
    .withColumn("preproceesedStr", F.concat_ws(",", F.col("preproceesed")))
    .withColumn("genreList", genre_tokens)
)

# generate a binary vector for each movie
def genre_to_vector(genres):
    """Encode a movie's genres as a binary dense vector over validGenres.

    Args:
        genres: Iterable of genre names for one movie; None or empty is
            treated as "no genres".

    Returns:
        A dense vector with one slot per entry of validGenres, holding 1.0
        where that genre is present in ``genres`` and 0.0 elsewhere.
    """
    present = set(genres or ())
    flags = [float(name in present) for name in validGenres]
    return Vectors.dense(flags)


# Wrap genre_to_vector as a UDF producing Spark ML vectors.
genre_vector_udf = F.udf(genre_to_vector, VectorUDT())
movies = movies.withColumn("genreVector", genre_vector_udf(F.col("genreList")))
movies = movies.withColumn("genreArray", vector_to_array("genreVector"))
# Flatten the array to comma-separated text so it can be stored in one CSV column.
movies = movies.withColumn("genreArrayStr", F.concat_ws(",", "genreArray"))


# mode("overwrite") makes the cell re-runnable: with the default save mode the
# write raises once movies_with_genre_vectors.csv exists from a previous run.
movies.select("movieID", "movieTitle", "genres","preproceesedStr", "genreArrayStr") \
    .write.mode("overwrite").option("header", "true") \
    .csv("movies_with_genre_vectors.csv")




# Get user input
specific_user_id = int(input("Enter the user ID for recommendations: "))

# The first line of each file is treated as a header; column names and types
# come from the explicit schemas.
ratings = spark.read.schema(ratingsSchema).option("header", True).csv("ratings.csv")

# Reload the vectorised catalogue and drop rows that have no genre vector text.
movies = (
    spark.read
    .schema(moviesSchema)
    .option("header", True)
    .csv("movies_with_genre_vectors.csv")
    .where(F.col("genreVector").isNotNull())
)


# Convert genreVector string to Spark Vector
def parse_vector(vector_str):
    """Turn a comma-separated string of numbers into a dense Spark vector.

    Args:
        vector_str: Text such as "1.0,0.0,1.0"; may be None.

    Returns:
        A dense vector of the parsed floats. When the input is None or
        contains a token that is not a valid float, an all-zero vector with
        one slot per entry of validGenres is returned instead.
    """
    try:
        return Vectors.dense([float(x) for x in vector_str.split(',')])
    except (AttributeError, ValueError):
        # AttributeError: vector_str is None (or otherwise has no .split).
        # ValueError: a token is not a valid float (this includes "").
        # Sized from validGenres so the fallback always matches the
        # dimensionality produced by genre_to_vector.
        return Vectors.dense([0.0] * len(validGenres))

# Replace the text form of genreVector with a real Spark vector column.
parse_vector_udf = F.udf(parse_vector, VectorUDT())
movies = movies.withColumn("genreVector", parse_vector_udf(F.col("genreVector")))

# =============================================
# 1. Show User's Ratings with Adjusted Ratings
# =============================================
user_ratings = ratings.where(F.col("userID") == specific_user_id)

# Nothing to recommend from if the user has no ratings: report and shut down.
if not user_ratings.count():
    print(f"\nError: User ID {specific_user_id} not found in the dataset.")
    spark.stop()
    exit()

# Mean of this user's ratings, pulled to the driver as a plain Python number.
user_avg_rating = (
    user_ratings
    .agg(F.avg("rating").alias("avg_rating"))
    .first()["avg_rating"]
)

# adjusted_rating = rating minus the user's own mean rating.
user_ratings_with_adjusted = (
    user_ratings
    .withColumn("avg_rating", F.lit(user_avg_rating))
    .withColumn("adjusted_rating", F.col("rating") - F.col("avg_rating"))
)

# =============================================
# 2. Calculate User's Centred Weighted Genre Vector
# =============================================
user_ratings_with_genres = user_ratings_with_adjusted.join(movies, on="movieID")

# Calculate weighted genre vectors
def weighted_genre_vector(genre_vector, adjusted_rating):
    """Scale every component of a genre vector by one adjusted rating.

    Args:
        genre_vector: Vector object exposing ``toArray()``.
        adjusted_rating: Numeric multiplier applied to each component.

    Returns:
        A new dense vector holding ``component * adjusted_rating`` for each
        component of ``genre_vector``.
    """
    scaled = genre_vector.toArray() * adjusted_rating
    return Vectors.dense(scaled)

# Add a column holding each rated movie's genre vector scaled by that
# rating's adjusted value.
weighted_genre_udf = F.udf(weighted_genre_vector, VectorUDT())
weighted_column = weighted_genre_udf(F.col("genreVector"), F.col("adjusted_rating"))
user_ratings_with_genres = user_ratings_with_genres.withColumn(
    "weighted_genre_vector", weighted_column
)

# Function to sum vectors
def sum_vectors(vectors):
    """Add up a sequence of vectors element-wise.

    Args:
        vectors: Sequence of addable values (vectors); may be empty or None.

    Returns:
        The running total of all items, accumulated with ``+=`` starting
        from the first item, or a 19-element zero dense vector when
        ``vectors`` is empty/None.
    """
    if not vectors:
        return Vectors.dense([0.0] * 19)

    remaining = iter(vectors)
    total = next(remaining)
    for current in remaining:
        total += current
    return total

# Calculate count vector
# One UDF object serves both aggregations below.
sum_vectors_udf = F.udf(sum_vectors, VectorUDT())

# Per user: element-wise sum of the genre vectors of the movies they rated.
count_vector_df = (
    user_ratings_with_genres
    .groupBy("userID")
    .agg(F.collect_list("genreVector").alias("genre_vectors"))
    .withColumn("count_vector", sum_vectors_udf(F.col("genre_vectors")))
)

# Calculate weighted sum vector
# Per user: element-wise sum of the rating-weighted genre vectors.
weighted_sum_df = (
    user_ratings_with_genres
    .groupBy("userID")
    .agg(F.collect_list("weighted_genre_vector").alias("weighted_vectors"))
    .withColumn("weighted_sum", sum_vectors_udf(F.col("weighted_vectors")))
)

# Join and divide to get normalized weighted vector
def divide_vectors(weighted_vec, count_vec):
    """Divide one vector by another element-wise, mapping x/0 to 0.0.

    Args:
        weighted_vec: Iterable of numerators.
        count_vec: Iterable of denominators, paired positionally with
            ``weighted_vec`` (pairing stops at the shorter input).

    Returns:
        A dense vector of the quotients, with 0.0 wherever the denominator
        is zero.
    """
    quotients = [
        numerator / denominator if denominator != 0 else 0.0
        for numerator, denominator in zip(weighted_vec, count_vec)
    ]
    return Vectors.dense(quotients)

# Combine the two per-user aggregates and divide the weighted sum by the
# count vector, component by component, to get the user's genre profile.
user_profile_df = count_vector_df.join(weighted_sum_df, "userID") \
    .withColumn("normalized_vector",
               F.udf(divide_vectors, VectorUDT())(F.col("weighted_sum"), F.col("count_vector"))) \
    .withColumn("normalized_array", vector_to_array("normalized_vector"))


# first() returns None for an empty frame, which happens when none of the
# user's rated movies joined against the movie catalogue. Fail with a clear
# message instead of an opaque "'NoneType' object is not subscriptable".
profile_row = user_profile_df.select("normalized_array").first()
if profile_row is None:
    raise ValueError(
        f"No genre profile could be built for user {specific_user_id}: "
        "none of their rated movies were found in the movie catalogue."
    )
user_vector = np.array(profile_row[0])

# =============================================
# 3. Calculate Recommendations
# =============================================
# IDs of movies the user already rated; used to exclude them from candidates.
rated_movies = user_ratings.select("movieID")

def cosine_similarity(movie_vector):
    """Cosine similarity between the module-level user_vector and one movie.

    Args:
        movie_vector: Sequence of numbers describing one movie, or None.

    Returns:
        The similarity as a Python float. 0.0 is returned when
        ``movie_vector`` is None, when either vector has zero norm, or when
        any exception is raised during the computation (the exception is
        printed first).
    """
    if movie_vector is None:
        return 0.0

    try:
        user_arr = np.array(user_vector)
        movie_arr = np.array(movie_vector)

        # Compare only the leading dimensions that both vectors have.
        shared = min(len(user_arr), len(movie_arr))
        user_arr, movie_arr = user_arr[:shared], movie_arr[:shared]

        numerator = np.dot(user_arr, movie_arr)
        user_norm = np.linalg.norm(user_arr)
        movie_norm = np.linalg.norm(movie_arr)

        # A zero-length vector has no direction; treat it as "not similar".
        if user_norm == 0 or movie_norm == 0:
            return 0.0
        return float(numerator / (user_norm * movie_norm))
    except Exception as e:
        print(f"Error in cosine similarity: {e}")
        return 0.0

cosine_similarity_udf = F.udf(cosine_similarity, FloatType())

# Candidates are the movies with no rating from this user (anti-join on
# movieID), with the genre vector exposed as a plain array for the UDF.
movies_for_recommendation = (
    movies
    .join(rated_movies, on="movieID", how="left_anti")
    .withColumn("genre_array", vector_to_array("genreVector"))
)

movies_with_similarity = movies_for_recommendation.withColumn(
    "similarity_score", cosine_similarity_udf(F.col("genre_array"))
)

# Keep the N candidates with the highest similarity score.
N = 10
recommended_movies = (
    movies_with_similarity
    .select("movieID", "movieTitle", "genres", "similarity_score")
    .orderBy(F.desc("similarity_score"))
    .limit(N)
)

# Pull the recommendations to the driver for the poster gallery.
recommended_movies_list = recommended_movies.collect()

# =============================================
# 4. Display All Results
# =============================================
# Section headings are rendered as HTML; the tables themselves are printed
# by DataFrame.show().
display(HTML(f"<h1>Movie Recommendation Analysis for User {specific_user_id}</h1>"))
display(HTML(f"<h2> Ratings for User {specific_user_id} with Adjusted Ratings</h2>"))
user_ratings_with_adjusted.select("userID", "movieID", "rating", "avg_rating", "adjusted_rating").show()
display(HTML(f"<h2> User {specific_user_id}'s Normalized Weighted Genre Vector</h2>"))
user_profile_df.select("userID", "normalized_array").show(truncate=False)
# Use N rather than a literal 10 so the heading cannot drift from the limit
# applied when the recommendations were selected.
display(HTML(f"<h2> Top {N} Movie Recommendations for User {specific_user_id}</h2>"))
recommended_movies.show(truncate=False)
display(HTML(f"<h2> Recommended Movies for User {specific_user_id}</h2>"))

# API Keys
# SECURITY NOTE(review): credentials are committed in this file. Supply them
# through the TMDB_API_KEY / OMDB_API_KEY environment variables instead; the
# embedded values are kept only as a backward-compatible fallback and should
# be rotated and then removed from source control.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "d348679826a515c083a06353ba605405")
OMDB_API_KEY = os.environ.get("OMDB_API_KEY", "a71ce210-fa7d-4f01-b138-d4e2effa3693")

def _fetch_json(url, params, timeout=10):
    """Best-effort GET returning the decoded JSON object, or None on failure."""
    try:
        # Passing params lets requests URL-encode titles containing spaces,
        # '&', '#', etc.; the timeout stops a stalled server from hanging
        # the whole notebook cell.
        payload = requests.get(url, params=params, timeout=timeout).json()
    except (requests.RequestException, ValueError):
        # Network/HTTP-layer failure, or a body that is not valid JSON.
        return None
    return payload if isinstance(payload, dict) else None


def get_movie_poster(title, year=None):
    """Get a movie poster URL from TMDb, falling back to OMDb.

    Args:
        title: Movie title; anything from the first " (" onwards (such as a
            trailing "(1995)") is dropped before searching.
        year: Optional release year used to narrow both searches.

    Returns:
        The poster URL as a string, or None when neither service yields one
        (including when a request fails or returns an unusable response).
    """
    clean_title = title.split(" (")[0]

    tmdb_params = {"api_key": TMDB_API_KEY, "query": clean_title}
    if year:
        tmdb_params["year"] = year
    tmdb_response = _fetch_json("https://api.themoviedb.org/3/search/movie", tmdb_params)
    results = tmdb_response.get("results") if tmdb_response else None
    if isinstance(results, list) and results and isinstance(results[0], dict):
        poster_path = results[0].get("poster_path")
        if poster_path:
            return f"https://image.tmdb.org/t/p/w500{poster_path}"

    omdb_params = {"apikey": OMDB_API_KEY, "t": clean_title}
    if year:
        omdb_params["y"] = year
    # HTTPS so the API key is not sent in clear text.
    omdb_response = _fetch_json("https://www.omdbapi.com/", omdb_params)
    if omdb_response:
        poster = omdb_response.get("Poster")
        if poster and poster != "N/A":
            return poster

    return None

# Build the poster gallery as a list of fragments and join once at the end.
html_parts = ["""
<div style="display: flex; flex-wrap: wrap; justify-content: center; gap: 20px; padding: 20px;">
"""]

for movie in recommended_movies_list:
    movie_name = movie["movieTitle"]
    movie_genres = movie["genres"]
    similarity_score = movie["similarity_score"]

    # A purely numeric last parenthesised group in the title is used as the year.
    year = None
    if "(" in movie_name:
        year_str = movie_name.split("(")[-1].split(")")[0]
        if year_str.isdigit():
            year = year_str

    poster_url = get_movie_poster(movie_name, year)
    if not poster_url:
        poster_url = 'https://via.placeholder.com/200x300?text=No+Poster'

    # Undo quote escaping/doubling present in the raw title text.
    display_title = movie_name.replace('\\"', '"').replace('""', '"')

    # Titles, genres and poster URLs come from data files / remote services,
    # so escape them before embedding them in markup.
    html_parts.append(f"""
    <div style="flex: 0 0 200px; text-align: center; margin-bottom: 20px;">
        <img src="{html.escape(poster_url)}"
             width="200" style="border-radius: 5px; border: 1px solid #ddd; margin-bottom: 5px;">
        <p style="margin: 0; font-weight: bold; font-size: 14px;">{html.escape(display_title)}</p>
        <p style="margin: 0; font-size: 12px;">{html.escape(str(movie_genres))}</p>
        <p style="margin: 0; font-size: 12px;">Similarity: {similarity_score:.2f}</p>
    </div>
    """)

html_parts.append("</div>")
html_output = "".join(html_parts)
display(HTML(html_output))

spark.stop()

Enter the user ID for recommendations: 1


+------+-------+------+------------------+-------------------+
|userID|movieID|rating|        avg_rating|    adjusted_rating|
+------+-------+------+------------------+-------------------+
|     1|     17|   4.0|3.5319148936170213|0.46808510638297873|
|     1|     25|   1.0|3.5319148936170213|-2.5319148936170213|
|     1|     29|   2.0|3.5319148936170213|-1.5319148936170213|
|     1|     30|   5.0|3.5319148936170213| 1.4680851063829787|
|     1|     32|   5.0|3.5319148936170213| 1.4680851063829787|
|     1|     34|   2.0|3.5319148936170213|-1.5319148936170213|
|     1|     36|   1.0|3.5319148936170213|-2.5319148936170213|
|     1|     80|   5.0|3.5319148936170213| 1.4680851063829787|
|     1|    110|   3.0|3.5319148936170213|-0.5319148936170213|
|     1|    111|   5.0|3.5319148936170213| 1.4680851063829787|
|     1|    161|   1.0|3.5319148936170213|-2.5319148936170213|
|     1|    166|   5.0|3.5319148936170213| 1.4680851063829787|
|     1|    176|   4.0|3.5319148936170213|0.46808510638

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|userID|normalized_array                                                                                                                                                                                                                                                                                                                                                  |
+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

+-------+-----------------------+---------+----------------+
|movieID|movieTitle             |genres   |similarity_score|
+-------+-----------------------+---------+----------------+
|91697  |Pitfall (1948)         |Film-Noir|0.70510167      |
|2066   |Out of the Past (1947) |Film-Noir|0.70510167      |
|746    |Force of Evil (1948)   |Film-Noir|0.70510167      |
|4426   |Kiss Me Deadly (1955)  |Film-Noir|0.70510167      |
|1154   |T-Men (1947)           |Film-Noir|0.70510167      |
|72090  |13th Letter, The (1951)|Film-Noir|0.70510167      |
|5169   |Scarlet Street (1945)  |Film-Noir|0.70510167      |
|1153   |Raw Deal (1948)        |Film-Noir|0.70510167      |
|99444  |Blonde Ice (1948)      |Film-Noir|0.70510167      |
|3380   |Railroaded! (1947)     |Film-Noir|0.70510167      |
+-------+-----------------------+---------+----------------+

