In [4]:
# !unzip -q /content/millions-of-movies.zip

In [None]:
# !pip install pyspark
# !pip install sentence-transformers

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql.functions import col, lower, sum as _sum, when
from pyspark.sql.types import IntegerType
from sklearn.preprocessing import MinMaxScaler
from sentence_transformers import SentenceTransformer
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
import pickle

In [3]:
spark = SparkSession.builder.appName("movies_pyspark").getOrCreate()

In [5]:
df = spark.read.csv('/content/movies.csv', header=True, inferSchema=True)

In [6]:
null_counts = df.select([_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df.columns])
null_counts.show()

+---+-----+------+-----------------+--------+----------+--------------------+------------+------+-------+-------+------+-------+------------+----------+-------+--------+-----------+-------------+---------------+
| id|title|genres|original_language|overview|popularity|production_companies|release_date|budget|revenue|runtime|status|tagline|vote_average|vote_count|credits|keywords|poster_path|backdrop_path|recommendations|
+---+-----+------+-----------------+--------+----------+--------------------+------------+------+-------+-------+------+-------+------------+----------+-------+--------+-----------+-------------+---------------+
|  0|    4|210307|                9|  118240|         7|              384902|       51569|     5|      2|  34334|     2| 613810|          27|         5| 224638|  511524|     184582|       498988|         686050|
+---+-----+------+-----------------+--------+----------+--------------------+------------+------+-------+-------+------+-------+------------+----------+

In [7]:
df_filled = df.fillna({"genres": '[NO_GENRE]', "credits": '[NO_CREDITS]', 'overview': 'No overview available'})

In [8]:
df_without_status = df_filled.filter(df_filled.status == 'Released')

In [9]:
df_year = df_without_status.withColumn("release_year", F.split(F.col("release_date"), "-")[0]).alias('release_year')

In [10]:
df_released = df_year.filter(df_filled.status == 'Released')

In [11]:
df_vote = df_released.withColumn("vote_count", col("vote_count").cast(IntegerType())).withColumn("vote_average", col("vote_average").cast(IntegerType()))

m = df_vote.approxQuantile("vote_count", [0.7], 0.01)[0] # Calculate the 70th percentile
C = df_vote.select(F.avg("vote_average")).collect()[0][0] # Calculate the mean vote average

df_weighted = df_vote.withColumn(
    "weighted_rating",
    ((col("vote_average") * col("vote_count")) + (C * m)) / (col("vote_count") + m)
)

df_weighted.show()

+------+--------------------+--------------------+-----------------+--------------------+----------+--------------------+------------+-----------+------------+-------+--------+--------------------+------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+------------------+
|    id|               title|              genres|original_language|            overview|popularity|production_companies|release_date|     budget|     revenue|runtime|  status|             tagline|vote_average|vote_count|             credits|            keywords|         poster_path|       backdrop_path|     recommendations|release_year|   weighted_rating|
+------+--------------------+--------------------+-----------------+--------------------+----------+--------------------+------------+-----------+------------+-------+--------+--------------------+------------+----------+--------------------+--------------------+-------------------

In [12]:
df_weighted.dropDuplicates()
df_weighted.count()

717058

In [13]:
df_cols_dropped = df_weighted.drop("id", 'production_companies', 'tagline','keywords', 'poster_path', 'backdrop_path', 'recommendations', 'status', "budget", "revenue", 'release_date', 'popularity', 'vote_count', 'vote_average')

In [14]:
df_cols_dropped.show()

+--------------------+--------------------+-----------------+--------------------+-------+--------------------+------------+------------------+
|               title|              genres|original_language|            overview|runtime|             credits|release_year|   weighted_rating|
+--------------------+--------------------+-----------------+--------------------+-------+--------------------+------------+------------------+
|   Meg 2: The Trench|Action-Science Fi...|               en|An exploratory di...|  116.0|Jason Statham-Wu ...|        2023| 6.996598351734084|
| The Pope's Exorcist|Horror-Mystery-Th...|               en|Father Gabriele A...|  103.0|Russell Crowe-Dan...|        2023| 6.991489649210181|
|Deadpool & Wolverine|Action-Comedy-Sci...|               en|A listless Wade W...|  128.0|Ryan Reynolds-Hug...|        2024| 6.998760892925002|
|Transformers: Ris...|Action-Adventure-...|               en|When a new threat...|  127.0|Anthony Ramos-Dom...|        2023| 6.995390226

In [15]:
null_counts = df_cols_dropped.select([_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_cols_dropped.columns])
null_counts.show()

+-----+------+-----------------+--------+-------+-------+------------+---------------+
|title|genres|original_language|overview|runtime|credits|release_year|weighted_rating|
+-----+------+-----------------+--------+-------+-------+------------+---------------+
|    4|     0|                0|       0|  33678|      0|       46522|            419|
+-----+------+-----------------+--------+-------+-------+------------+---------------+



In [16]:
df_no_na = df_cols_dropped.dropna()
df_no_na.count()

642315

In [17]:
from pyspark.sql.functions import concat_ws

df_combined = df_no_na.withColumn(
    "combined_text",
    concat_ws(
        " ",
        "title",
        "genres",
        "original_language",
        "overview",
        "credits"
    )
)

df_combined.show()


+--------------------+--------------------+-----------------+--------------------+-------+--------------------+------------+------------------+--------------------+
|               title|              genres|original_language|            overview|runtime|             credits|release_year|   weighted_rating|       combined_text|
+--------------------+--------------------+-----------------+--------------------+-------+--------------------+------------+------------------+--------------------+
|   Meg 2: The Trench|Action-Science Fi...|               en|An exploratory di...|  116.0|Jason Statham-Wu ...|        2023| 6.996598351734084|Meg 2: The Trench...|
| The Pope's Exorcist|Horror-Mystery-Th...|               en|Father Gabriele A...|  103.0|Russell Crowe-Dan...|        2023| 6.991489649210181|The Pope's Exorci...|
|Deadpool & Wolverine|Action-Comedy-Sci...|               en|A listless Wade W...|  128.0|Ryan Reynolds-Hug...|        2024| 6.998760892925002|Deadpool & Wolver...|
|Transform

In [18]:
null_counts = df_combined.select([_sum(when(col(c).isNull(), 1).otherwise(0)).alias(c) for c in df_combined.columns])
null_counts.show()

+-----+------+-----------------+--------+-------+-------+------------+---------------+-------------+
|title|genres|original_language|overview|runtime|credits|release_year|weighted_rating|combined_text|
+-----+------+-----------------+--------+-------+-------+------------+---------------+-------------+
|    0|     0|                0|       0|      0|      0|           0|              0|            0|
+-----+------+-----------------+--------+-------+-------+------------+---------------+-------------+



In [19]:
df_combined.write.mode("overwrite").parquet("/content/movies.parquet")

In [20]:
movies = pd.read_parquet('/content/movies.parquet')

movies.columns

Index(['title', 'genres', 'original_language', 'overview', 'runtime',
       'credits', 'release_year', 'weighted_rating', 'combined_text'],
      dtype='object')

In [21]:
movies

Unnamed: 0,title,genres,original_language,overview,runtime,credits,release_year,weighted_rating,combined_text
0,Meg 2: The Trench,Action-Science Fiction-Horror,en,An exploratory dive into the deepest depths of...,116.0,Jason Statham-Wu Jing-Shuya Sophia Cai-Sergio ...,2023,6.996598,Meg 2: The Trench Action-Science Fiction-Horro...
1,The Pope's Exorcist,Horror-Mystery-Thriller,en,Father Gabriele Amorth Chief Exorcist of the V...,103.0,Russell Crowe-Daniel Zovatto-Alex Essoe-Franco...,2023,6.991490,The Pope's Exorcist Horror-Mystery-Thriller en...
2,Deadpool & Wolverine,Action-Comedy-Science Fiction,en,A listless Wade Wilson toils away in civilian ...,128.0,Ryan Reynolds-Hugh Jackman-Emma Corrin-Matthew...,2024,6.998761,Deadpool & Wolverine Action-Comedy-Science Fic...
3,Transformers: Rise of the Beasts,Action-Adventure-Science Fiction,en,When a new threat capable of destroying the en...,127.0,Anthony Ramos-Dominique Fishback-Luna Lauren V...,2023,6.995390,Transformers: Rise of the Beasts Action-Advent...
4,Dune: Part Two,Science Fiction-Adventure,en,Follow the mythic journey of Paul Atreides as ...,167.0,Timothée Chalamet-Zendaya-Rebecca Ferguson-Jav...,2024,7.997962,Dune: Part Two Science Fiction-Adventure en Fo...
...,...,...,...,...,...,...,...,...,...
642310,Tomcat Tales,Documentary-History-War,en,Learn from real-life Top Guns who have flown a...,120.0,[NO_CREDITS],2020,2.353348,Tomcat Tales Documentary-History-War en Learn ...
642311,Strictly Modern,Comedy-Romance,en,Strictly Modern is a 1930 American pre-Code co...,63.0,Dorothy Mackaill-Sidney Blackmer-Julanne Johns...,1930,2.353348,Strictly Modern Comedy-Romance en Strictly Mod...
642312,Your Letter,Animation-Drama-Mystery-Adventure,ko,Sori Lee is hoping for a fresh start at her ne...,97.0,Lee Soo-hyun-Kim Min-ju-Min Seungwoo-Nam Doh-h...,2024,2.353348,Your Letter Animation-Drama-Mystery-Adventure ...
642313,Café Tacvba - Un Viaje,[NO_GENRE],en,Un Viaje is the first live album by the rock b...,120.0,Rubén Albarrán-Emmanuel del Real-Joselo Rangel...,2005,2.353348,Café Tacvba - Un Viaje [NO_GENRE] en Un Viaje ...


In [22]:
movies.isna().sum()

Unnamed: 0,0
title,0
genres,0
original_language,0
overview,0
runtime,0
credits,0
release_year,0
weighted_rating,0
combined_text,0


In [23]:
scaler = MinMaxScaler()
numerical_features = ["runtime", 'weighted_rating', "release_year"]

movies[numerical_features] = scaler.fit_transform(movies[numerical_features])

In [24]:
movies = movies.reset_index(drop=True)

In [25]:
model1 = SentenceTransformer("all-MiniLM-L6-v2")

text_embeddings1 = model1.encode(movies["combined_text"], show_progress_bar=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/20073 [00:00<?, ?it/s]

In [28]:
text_embeddings1 = np.array(text_embeddings1)

movie_embeddings1 = np.hstack((text_embeddings1, movies[numerical_features].values))

In [31]:
knn = NearestNeighbors(n_neighbors=10, metric="cosine", algorithm="brute")

model = knn.fit(movie_embeddings1)

In [32]:
def get_recommendations(movie_title):
    idx = movies[movies["title"] == movie_title].index[0]
    distances, indices = knn.kneighbors([movie_embeddings1[idx]], n_neighbors=9)

    recommended_movies = [movies.iloc[i]["title"] for i in indices.flatten()[1:]]
    return recommended_movies

In [None]:
# Save the trained model using pickle
with open('recommendation_model.pkl', 'wb') as f:
    pickle.dump(model, f)

# Save the movie embeddings
with open('movie_embeddings.pkl', 'wb') as f:
  pickle.dump(movie_embeddings1, f)

# Save the movies DataFrame
with open('movies_df.pkl', 'wb') as f:
    pickle.dump(movies, f)