In [1]:
import findspark
import os
import pandas as pd
from ast import literal_eval

from stack_data.utils import truncate

findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[2] pyspark-shell --jars C:\\Users\\09398\\PycharmProjects\\elasticsearch-spark-recommender\\elasticsearch-hadoop-8.17.0\\dist\\elasticsearch-spark-30_2.12-8.17.0.jar"
es_hadoop_jar = "C:\\Users\\09398\\PycharmProjects\\elasticsearch-spark-recommender\\elasticsearch-hadoop-8.17.0\\dist\\elasticsearch-spark-30_2.12-8.17.0.jar"
spark = SparkSession.builder \
    .appName("ElasticsearchIntegration") \
    .config("spark.speculation", "false") \
    .config("spark.jars", es_hadoop_jar) \
    .config("es.nodes", "http://localhost:9200") \
    .config("es.nodes.wan.only", "true") \
    .config("spark.driver.memory", "4g") \
    .config("spark.executor.memory", "4g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.memory.fraction", "0.8") \
    .config("spark.memory.storageFraction", "0.5") \
    .getOrCreate()

In [2]:
spark.catalog.clearCache()


In [3]:
PATH_TO_DATA = "../data/archive"

## Top charts of movies

Top 250 Movies of All Time, based on

Weighted Rating (WR) = $(\frac{v}{v + m} . R) + (\frac{m}{v + m} . C) + P$

where,
* *v* is the number of votes for the movie
* *m* is the minimum votes required to be listed in the chart
* *R* is the average rating of the movie
* *C* is the mean vote across the whole report
* *P* is Popularity of the movie

In [5]:
md = spark.read.csv(
    f"{PATH_TO_DATA}/movies_metadata_transformed.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape="\"",
    quote='"',
    sep=',',
    ignoreTrailingWhiteSpace=True,
    ignoreLeadingWhiteSpace=True
)
md.show(5)

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|     revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--

In [6]:
vote_counts = md.filter(col('vote_count').isNotNull()).select(col('vote_count').cast('int'))
vote_averages = md.filter(col('vote_average').isNotNull()).select(col('vote_average').cast('int'))

In [7]:

C = vote_averages.agg(avg('vote_average')).first()[0]
C

5.244896612406511

In [8]:

m = vote_counts.approxQuantile('vote_count', [0.95], 0)[0]
m

434.0

In [9]:
md = md.withColumn('release_date', to_date(col('release_date'), 'yyyy-MM-dd'))
md = md.withColumn('year', year(col('release_date')))
md.show(5)

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+----+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|     revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|year|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+

In [10]:
qualified = md.filter((col('vote_count') >= m) & col('vote_count').isNotNull() & col('vote_average').isNotNull()) \
              .select('title', 'year', 'vote_count', 'vote_average', 'popularity', 'genres')
qualified = qualified.withColumn('vote_count', col('vote_count').cast('int')) \
                     .withColumn('vote_average', col('vote_average').cast('int'))
shape = (qualified.count(), len(qualified.columns))
shape

(2274, 6)

In [11]:
from pyspark.sql.types import DoubleType, StructField


@udf(DoubleType())
def weighted_rating_udf(vote_count, vote_average):
    return (vote_count / (vote_count + m) * vote_average) + (m / (vote_count + m) * C)

In [12]:
qualified = qualified.withColumn('wr', weighted_rating_udf(col('vote_count'), col('vote_average')))

In [14]:
qualified = qualified.orderBy(col('wr').desc())
qualified.show(10)

+--------------------+----+----------+------------+----------+--------------------+------------------+
|               title|year|vote_count|vote_average|popularity|              genres|                wr|
+--------------------+----+----------+------------+----------+--------------------+------------------+
|           Inception|2010|     14075|           8| 29.108149|['Action', 'Thril...| 7.917588057742396|
|     The Dark Knight|2008|     12269|           8|123.167259|['Drama', 'Action...| 7.905871457906355|
|        Interstellar|2014|     11187|           8| 32.213481|['Adventure', 'Dr...| 7.897107402958818|
|          Fight Club|1999|      9678|           8| 63.869599|           ['Drama']| 7.881752880714441|
|The Lord of the R...|2001|      8892|           8| 32.070725|['Adventure', 'Fa...| 7.871786953654775|
|        Pulp Fiction|1994|      8670|           8|140.950236|['Thriller', 'Cri...| 7.868660493166128|
|The Shawshank Red...|1994|      8358|           8| 51.645403|  ['Drama',

In [15]:
from pyspark.sql.types import StringType, ArrayType

genres_schema = ArrayType(StringType())
md = md.withColumn("genres", from_json(col("genres"), genres_schema))
gen_md = md.withColumn("genre", explode(col("genres")))
gen_md = gen_md.drop("genres")
gen_md.show(truncate=False)


+-----+---------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+------------------------------------+-----+---------+-----------------+---------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+--------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [16]:
from pyspark.sql.functions import col, mean, lit, when, expr
from pyspark.sql.types import DoubleType

def build_chart(genre, percentile=0.85):
    genre_df = gen_md.filter(col("genre") == genre)
    genre_df = genre_df.filter(col("vote_count").isNotNull() & col("vote_average").isNotNull())
    genre_df = genre_df.withColumn("vote_count", col("vote_count").cast("int"))
    genre_df = genre_df.withColumn("vote_average", col("vote_average").cast("double"))
    C = genre_df.select(mean(col("vote_average"))).first()[0]
    m = genre_df.approxQuantile("vote_count", [percentile], 0.01)[0]
    qualified = genre_df.filter(col("vote_count") >= m).select(
        "title", "year", "vote_count", "vote_average", "popularity"
    )
    qualified = qualified.withColumn(
        "wr",
        (col("vote_count") / (col("vote_count") + lit(m)) * col("vote_average")) +
        (lit(m) / (col("vote_count") + lit(m)) * lit(C))
    )

    qualified = qualified.orderBy(col("wr").desc()).limit(250)

    return qualified


In [17]:
build_chart('Romance').show(5, truncate=False)

+---------------------------+----+----------+------------+----------+-----------------+
|title                      |year|vote_count|vote_average|popularity|wr               |
+---------------------------+----+----------+------------+----------+-----------------+
|Dilwale Dulhania Le Jayenge|1995|661       |9.1         |34.457024 |8.728219129758896|
|Your Name.                 |2016|1030      |8.5         |34.461252 |8.296609741176281|
|Forrest Gump               |1994|8147      |8.2         |48.307194 |8.17553435204354 |
|Cinema Paradiso            |1988|834       |8.2         |14.177005 |7.980635350403461|
|La La Land                 |2016|4745      |7.9         |19.681686 |7.863516929316707|
+---------------------------+----+----------+------------+----------+-----------------+
only showing top 5 rows



**Content-based recommendation system

In [18]:
links_small = spark.read.csv(
    f"{PATH_TO_DATA}/links_small.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    sep=',',
    ignoreTrailingWhiteSpace=True,  #
    ignoreLeadingWhiteSpace=True
)
links_small.show(5)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



In [19]:
links_small = links_small.filter(col('tmdbId').isNotNull()) \
                         .select(col('tmdbId').cast('int'))
links_small.show()

+------+
|tmdbId|
+------+
|   862|
|  8844|
| 15602|
| 31357|
| 11862|
|   949|
| 11860|
| 45325|
|  9091|
|   710|
|  9087|
| 12110|
| 21032|
| 10858|
|  1408|
|   524|
|  4584|
|     5|
|  9273|
| 11517|
+------+
only showing top 20 rows



In [20]:
md = md.withColumn('id', col('id').cast('int'))

In [21]:
md = md.filter(col('id').isNotNull())

In [22]:
md.show(5)

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+----+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|     revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|year|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+

Movie description Based Recommender

In [23]:
import torch

In [1]:
from elasticsearch import Elasticsearch
es = Elasticsearch(hosts="http://localhost:9200")
es.info(pretty=True)

ObjectApiResponse({'name': '4bbae3288639', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'dsCurIfHRImh4-bX_WtMsA', 'version': {'number': '8.10.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '6d20dd8ce62365be9b1aca96427de4622e970e9e', 'build_date': '2023-09-19T08:16:24.564900370Z', 'build_snapshot': False, 'lucene_version': '9.7.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [25]:
VECTOR_DIM = 20
VECTOR_META_DIM=500
create_ratings = {
    "mappings": {
        "properties": {
            "timestamp": {
                "type": "date"
            },
            "userId": {
                "type": "integer"
            },
            "movieId": {
                "type": "integer"
            },
            "rating": {
                "type": "double"
            }
        }
    }
}
create_users = {
    "mappings": {
        "properties": {
            "userId": {
                "type": "integer"
            },
            "model_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_DIM
            },
            "model_version": {
                "type": "keyword"
            },
            "model_timestamp": {
                "type": "date"
            }
        }
    }
}

create_movies = {
    "mappings": {
        "properties": {
            "movieId": {
                "type": "integer"
            },
           "year": {
                "type": "date",
                "format": "year"
            },
            "title": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    }
                }
            },
            "tmdbId": {
                "type": "keyword"
            },
            "vote_count": {
                "type": "integer"
            },
            "vote_average": {
                "type": "double"
            },
            "popularity": {
                "type": "double"
            },
            "description": {
                "type": "text",
                "analyzer": "english",
                "fields": {
                    "keyword": {
                        "type": "keyword"
                    }
                }
            },
            "meta_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_META_DIM
            },
            "model_factor": {
                "type": "dense_vector",
                "dims" : VECTOR_DIM
            },
            "model_version": {
                "type": "keyword"
            },
            "model_timestamp": {
                "type": "date"
            }
        }
    }
}
def delete_index(index_name):
    if es.indices.exists(index=index_name):
        es.indices.delete(index=index_name)
        print(f"Deleted index: {index_name}")
delete_index("ratings")
delete_index("users")
delete_index("movies")

# Create indices with the provided mappings
res_ratings = es.indices.create(index="ratings", body=create_ratings)
res_users = es.indices.create(index="users", body=create_users)
res_movies = es.indices.create(index="movies", body=create_movies)

print("Created indices:")
print(res_ratings)
print(res_users)
print(res_movies)

Deleted index: ratings
Deleted index: users
Deleted index: movies
Created indices:
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'ratings'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'users'}
{'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'}


*Handle Meta data of movies

In [26]:
credits = spark.read.csv(
    f"{PATH_TO_DATA}/credits_transformed.csv",
    header=True,
    inferSchema=False,
    multiLine=True,
    escape="\"",
    quote='"',
    sep=',',
    ignoreTrailingWhiteSpace=True,
    ignoreLeadingWhiteSpace=True
)
keywords = spark.read.csv(
    f"{PATH_TO_DATA}/keywords_transformed.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    escape="\"",
    quote='"',
    sep=',',
    ignoreTrailingWhiteSpace=True,
    ignoreLeadingWhiteSpace=True
)

In [27]:
credits = credits.withColumn('id', col('id').cast('int'))
keywords = keywords.withColumn('id', col('id').cast('int'))

In [28]:
credits.printSchema()

root
 |-- cast: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- cast_size: string (nullable = true)
 |-- crew_size: string (nullable = true)
 |-- director: string (nullable = true)



In [29]:
credits.show(5)

+--------------------+-----+---------+---------+--------------------+
|                cast|   id|cast_size|crew_size|            director|
+--------------------+-----+---------+---------+--------------------+
|['tomhanks', 'tim...|  862|       13|      106|['johnlasseter', ...|
|['robinwilliams',...| 8844|       26|       16|['joejohnston', '...|
|['waltermatthau',...|15602|        7|        4|['howarddeutch', ...|
|['whitneyhouston'...|31357|       10|       10|['forestwhitaker'...|
|['stevemartin', '...|11862|       12|        7|['charlesshyer', ...|
+--------------------+-----+---------+---------+--------------------+
only showing top 5 rows



In [28]:
credits = credits.withColumn('cast', from_json(col('cast'), ArrayType(StringType())))

In [29]:
credits = credits.withColumn('director', from_json(col('director'), ArrayType(StringType())))

In [32]:
keywords.printSchema()

root
 |-- id: integer (nullable = true)
 |-- keywords: string (nullable = true)



In [33]:
keywords.show(5)

+-----+--------------------+
|   id|            keywords|
+-----+--------------------+
|  862|['jealousi', 'toy...|
| 8844|['boardgam', 'dis...|
|15602|['fish', 'bestfri...|
|31357|['basedonnovel', ...|
|11862|['babi', 'midlife...|
+-----+--------------------+
only showing top 5 rows



In [30]:
keywords = keywords.withColumn('keywords', from_json(col('keywords'), ArrayType(StringType())))

In [56]:
md.show(5)

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+----+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|     revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|year|
+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+

In [31]:
md = md.join(credits, md['id'] == credits['id'], 'left').drop(credits['id'])
md = md.join(keywords, md['id'] == keywords['id'], 'left').drop(keywords['id'])
md = md.join(links_small, md['id'] == links_small['tmdbId'])
md = md.withColumn('tagline', coalesce(col('tagline').cast('string'), lit('')))
md = md.withColumn('overview', coalesce(col('overview').cast('string'), lit('')))
md = md.withColumn('description', concat(col('tagline'), lit(' '), col('overview')))


In [32]:
base_poster_url = 'https://image.tmdb.org/t/p/w500'


md = md.withColumn(
   'poster_path',
    concat(
        lit("<img src='"),
        lit(base_poster_url),
        col('poster_path'),
        lit("' style='height:100px;'>")
    )
)


In [59]:
import pandas as pd

In [60]:
credits = pd.read_csv(f"{PATH_TO_DATA}/credits.csv")
keywords = pd.read_csv(f"{PATH_TO_DATA}/keywords.csv")

In [42]:
from ast import literal_eval

In [43]:
credits['cast'] = credits['cast'].apply(literal_eval)
credits['crew'] = credits['crew'].apply(literal_eval)

In [44]:
keywords['keywords'] = keywords['keywords'].apply(literal_eval)

In [45]:
credits['cast_size'] = credits['cast'].apply(lambda x: len(x))
credits['crew_size'] = credits['crew'].apply(lambda x: len(x))

In [46]:
import numpy as np


def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [47]:
credits['director'] = credits['crew'].apply(get_director)

In [48]:
credits['cast'] = credits['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])


In [16]:
credits['cast'] = credits['cast'].apply(lambda x: x[:3] if len(x) > 3 else x)

In [33]:
keywords['keywords'] = keywords['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [18]:
credits['cast'] = credits['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [19]:
credits['director'] = credits['director'].astype('str').apply(lambda x: str.lower(x.replace(" ", "")) if x != np.nan else np.nan)

In [23]:
credits['director'] = credits['director'].astype('str').apply(lambda x: [x, x, x])

In [31]:
credits.drop(columns=['keywords'], inplace=True)

In [41]:
credits.drop(columns=['crew'], inplace=True)

In [32]:
credits.head(5)

Unnamed: 0,cast,crew,id,cast_size,crew_size,director
0,"[tomhanks, timallen, donrickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,13,106,"[johnlasseter, johnlasseter, johnlasseter]"
1,"[robinwilliams, jonathanhyde, kirstendunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,26,16,"[joejohnston, joejohnston, joejohnston]"
2,"[waltermatthau, jacklemmon, ann-margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,7,4,"[howarddeutch, howarddeutch, howarddeutch]"
3,"[whitneyhouston, angelabassett, lorettadevine]","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,10,10,"[forestwhitaker, forestwhitaker, forestwhitaker]"
4,"[stevemartin, dianekeaton, martinshort]","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,12,7,"[charlesshyer, charlesshyer, charlesshyer]"


In [35]:
s = keywords.apply(lambda x: pd.Series(x['keywords']), axis=1).stack().reset_index(level=1, drop=True)

In [36]:
s = s.value_counts()

s = s[s > 1]

In [37]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [38]:
from nltk.stem.snowball import SnowballStemmer

In [39]:
stemmer = SnowballStemmer('english')

In [40]:
keywords['keywords'] = keywords['keywords'].apply(filter_keywords)
keywords['keywords'] = keywords['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
keywords['keywords'] = keywords['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [44]:
keywords.to_csv(f"{PATH_TO_DATA}/keywords_transformed.csv", index=False)
credits.to_csv(f"{PATH_TO_DATA}/credits_transformed.csv", index=False)


In [197]:
md.show(5)

+-----+---------------------+--------+--------------------+--------------------+-----+---------+-----------------+--------------------+--------------------+----------+--------------------+--------------------+--------------------+------------+------------+-------+--------------------+--------+--------------------+--------------------+-----+------------+----------+----+--------------------+---------+---------+--------------------+--------------------+--------------------+
|adult|belongs_to_collection|  budget|              genres|            homepage|   id|  imdb_id|original_language|      original_title|            overview|popularity|         poster_path|production_companies|production_countries|release_date|     revenue|runtime|    spoken_languages|  status|             tagline|               title|video|vote_average|vote_count|year|                cast|cast_size|crew_size|            director|            keywords|         description|
+-----+---------------------+--------+----------

In [33]:
ratings = spark.read.csv(
    f"{PATH_TO_DATA}/ratings_small.csv",
    header=True,
    inferSchema=True,
    multiLine=True,
    quote='"',
    sep=',',
    ignoreTrailingWhiteSpace=True,
    ignoreLeadingWhiteSpace=True
)

Dataframe count: 100004
ES index count:  100004


In [200]:
md.printSchema()


root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- homepage: string (nullable = true)
 |-- id: integer (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = false)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: date (nullable = true)
 |-- revenue: double (nullable = true)
 |-- runtime: double (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = false)
 |-- title: string (nullable = true)
 |-- video: boolean (nullable = true)
 |-- vote_average: dou

In [34]:

insert = md.withColumn("movieId", col("id")).withColumn("tmdbId", col("imdb_id").cast("string"))



In [35]:
insert = insert.withColumn("meta_data", concat_ws(
        " ",  # Separator
        concat_ws(" ", col("keywords")),
        concat_ws(" ", col("cast")),
        concat_ws("", col("director")),
        concat_ws(" ", col("genres"))
    ))

In [93]:
insert.select("poster_path").show(5, truncate=False)

+-------------------------------------------------------------------------------------------------+
|poster_path                                                                                      |
+-------------------------------------------------------------------------------------------------+
|<img src='http://image.tmdb.org/t/p/w185//rhIRbceoE9lR4veEXuwCC2wARtG.jpg' style='height:100px;'>|
|<img src='http://image.tmdb.org/t/p/w185//vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg' style='height:100px;'>|
|<img src='http://image.tmdb.org/t/p/w185//6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg' style='height:100px;'>|
|<img src='http://image.tmdb.org/t/p/w185//16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg' style='height:100px;'>|
|<img src='http://image.tmdb.org/t/p/w185//e64sOI48hQXyru7naBFyssKFxVd.jpg' style='height:100px;'>|
+-------------------------------------------------------------------------------------------------+
only showing top 5 rows



In [36]:
movies_data = insert.select("movieId","year",  "title", "description", "meta_data", "tmdbId", "vote_count", "vote_average", "popularity")

In [107]:
movies_data.where(col("movieId") == 265189).show()


+-------+----+-------------+--------------------+--------------------+---------+----------+------------+----------+
|movieId|year|        title|         description|           meta_data|   tmdbId|vote_count|vote_average|popularity|
+-------+----+-------------+--------------------+--------------------+---------+----------+------------+----------+
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt2121382|     255.0|         6.8| 12.165685|
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt2121382|     255.0|         6.8| 12.165685|
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt2121382|     255.0|         6.8| 12.165685|
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt2121382|     255.0|         6.8| 12.165685|
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt2121382|     255.0|         6.8| 12.165685|
| 265189|2014|Force Majeure| While holidaying...|femalenud darkcom...|tt

In [37]:
movies_data_deduplicated = movies_data.dropDuplicates(["movieId"])

In [38]:
movies_data_deduplicated.show(10,truncate=False)

+-------+----+------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------+----------+------------+----------+
|movieId|year|title            

In [39]:
movie_data_to_indexed = movies_data_deduplicated.drop("meta_data")

In [40]:
movies_data_deduplicated.write.format("es").option("es.mapping.id", "movieId").save("movies")
num_movies_df = movies_data_deduplicated.count()
num_movies_es = es.count(index="movies")['count']

print("Movie DF count: {}".format(num_movies_df))
print("ES index count: {}".format(num_movies_es))

Movie DF count: 9082
ES index count: 9082


In [41]:
es.indices.refresh(index="movies")

ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [69]:
ratings = ratings.filter(col('movieId').isin([i.movieId for i in movies_data_deduplicated.select("movieId").collect()]))

In [70]:
ratings.write.format("es").save("ratings")
num_ratings_es = es.count(index="ratings")['count']
num_ratings_df = ratings.count()
print("Dataframe count: {}".format(num_ratings_df))
print("ES index count:  {}".format(num_ratings_es))

Dataframe count: 32131
ES index count:  32131


In [45]:
from pyspark.ml.feature import Tokenizer, NGram, StopWordsRemover, CountVectorizer

tokenizer = Tokenizer(inputCol="meta_data", outputCol="words")
wordsData = tokenizer.transform(movies_data_deduplicated)
stop_words_remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
df_filtered = stop_words_remover.transform(wordsData)
ngram1 = NGram(n=1, inputCol="filtered_words", outputCol="ngrams_1")
ngram2 = NGram(n=2, inputCol="filtered_words", outputCol="ngrams_2")
df_ngrams_1 = ngram1.transform(df_filtered)
df_ngrams_2 = ngram2.transform(df_filtered)



In [46]:
df_ngrams_1.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- year: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- description: string (nullable = false)
 |-- meta_data: string (nullable = false)
 |-- tmdbId: string (nullable = true)
 |-- vote_count: double (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- popularity: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- filtered_words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- ngrams_1: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [47]:
df_combined = df_ngrams_1.join(df_ngrams_2, on="movieId")
df_combined = df_combined.withColumn("combined_ngrams",  concat(col("ngrams_1"), col("ngrams_2")))




In [48]:
df_combined.select("ngrams_1", "ngrams_2", "combined_ngrams").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [50]:
cv = CountVectorizer(inputCol="combined_ngrams", outputCol="features", vocabSize=500, minDF=2.0)

model = cv.fit(df_combined)

In [59]:
result = model.transform(df_combined)

result.show(5)

+-------+----+--------------+--------------------+--------------------+---------+----------+------------+----------+--------------------+--------------------+--------------------+----+--------------+--------------------+--------------------+---------+----------+------------+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|movieId|year|         title|         description|           meta_data|   tmdbId|vote_count|vote_average|popularity|               words|      filtered_words|            ngrams_1|year|         title|         description|           meta_data|   tmdbId|vote_count|vote_average|popularity|               words|      filtered_words|            ngrams_2|     combined_ngrams|            features|
+-------+----+--------------+--------------------+--------------------+---------+----------+------------+----------+--------------------+--------------------+--------------------+----+--------------+-----------------

In [116]:
result.select("features").show(5,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [58]:
from pyspark.sql.types import FloatType
from pyspark.ml.linalg import DenseVector

@udf(ArrayType(FloatType()))

def toDense(v):
    v = DenseVector(v)
    new_array = list([float(x) for x in v])
    return new_array


In [60]:
result = result.withColumn("features", toDense(col("features")))

In [61]:
meta_d = result.select("movieId", col("features").alias("meta_factor"))


In [62]:
meta_d.select("meta_factor").show(5, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [63]:
meta_d.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- meta_factor: array (nullable = true)
 |    |-- element: float (containsNull = true)



In [64]:
meta_d.write.format("es") \
    .option("es.mapping.id", "movieId") \
    .option("es.write.operation", "update") \
    .save("movies", mode="append")

In [66]:
es.search(index="movies", size=1)

ObjectApiResponse({'took': 5, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 9082, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '2', '_score': 1.0, '_source': {'movieId': 2, 'year': 1988, 'title': 'Ariel', 'description': " Taisto Kasurinen is a Finnish coal miner whose father has just committed suicide and who is framed for a crime he did not commit. In jail, he starts to dream about leaving the country and starting a new life. He escapes from prison but things don't go as planned...", 'meta_data': 'underdog prison factorywork prison helsinki independentfilm fallinginlov turopajala susannahaavisto mattipellonpää akikaurismäkiakikaurismäkiakikaurismäki Drama Crime', 'tmdbId': 'tt0094675', 'vote_count': 44.0, 'vote_average': 7.1, 'popularity': '3.860491', 'meta_factor': [1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0

In [72]:
ratings.show(5, truncate=False)

+------+-------+------+----------+
|userId|movieId|rating|timestamp |
+------+-------+------+----------+
|1     |1371   |2.5   |1260759135|
|1     |1405   |1.0   |1260759203|
|1     |2105   |4.0   |1260759139|
|1     |2193   |2.0   |1260759198|
|1     |2294   |2.0   |1260759108|
+------+-------+------+----------+
only showing top 5 rows



In [71]:
ratings.count()

32131

In [73]:
from pyspark.ml.recommendation import ALS
from pyspark.sql.functions import col
als = ALS(userCol="userId", itemCol="movieId", ratingCol="rating", regParam=0.02, rank=VECTOR_DIM, seed=54)
model = als.fit(ratings)
model.userFactors.show(5)
model.itemFactors.show(5)

+---+--------------------+
| id|            features|
+---+--------------------+
| 10|[0.44645143, -0.0...|
| 20|[0.66196495, 0.22...|
| 30|[0.43170795, -0.3...|
| 40|[0.6738147, -0.32...|
| 50|[0.8255866, 0.006...|
+---+--------------------+
only showing top 5 rows

+---+--------------------+
| id|            features|
+---+--------------------+
| 20|[0.9512689, -0.04...|
| 70|[1.3781825, 0.325...|
| 80|[1.6277056, -0.90...|
|100|[1.3458409, -0.49...|
|110|[1.3195972, 0.106...|
+---+--------------------+
only showing top 5 rows



In [75]:
from pyspark.sql.functions import lit, current_timestamp, unix_timestamp
ver = model.uid
ts = unix_timestamp(current_timestamp())
movie_vectors = model.itemFactors.select(col("id").alias("movieId"),\
                                         col("features").alias("model_factor"),\
                                         lit(ver).alias("model_version"),\
                                         ts.alias("model_timestamp"))
movie_vectors.show(5)
user_vectors = model.userFactors.select(col("id").alias("userId"),\
                                        col("features").alias("model_factor"),\
                                        lit(ver).alias("model_version"),\
                                        ts.alias("model_timestamp"))
user_vectors.show(5)

+-------+--------------------+----------------+---------------+
|movieId|        model_factor|   model_version|model_timestamp|
+-------+--------------------+----------------+---------------+
|     20|[0.9512689, -0.04...|ALS_bfc13b564e51|     1734363858|
|     70|[1.3781825, 0.325...|ALS_bfc13b564e51|     1734363858|
|     80|[1.6277056, -0.90...|ALS_bfc13b564e51|     1734363858|
|    100|[1.3458409, -0.49...|ALS_bfc13b564e51|     1734363858|
|    110|[1.3195972, 0.106...|ALS_bfc13b564e51|     1734363858|
+-------+--------------------+----------------+---------------+
only showing top 5 rows

+------+--------------------+----------------+---------------+
|userId|        model_factor|   model_version|model_timestamp|
+------+--------------------+----------------+---------------+
|    10|[0.44645143, -0.0...|ALS_bfc13b564e51|     1734363858|
|    20|[0.66196495, 0.22...|ALS_bfc13b564e51|     1734363858|
|    30|[0.43170795, -0.3...|ALS_bfc13b564e51|     1734363858|
|    40|[0.6738147, -

In [76]:
movie_vectors.write.format("es") \
    .option("es.mapping.id", "movieId") \
    .option("es.write.operation", "update") \
    .save("movies", mode="append")

In [77]:
user_vectors.write.format("es") \
    .option("es.mapping.id", "userId") \
    .save("users")


In [78]:
es.indices.refresh(index="movies")
es.indices.refresh(index="users")


ObjectApiResponse({'_shards': {'total': 2, 'successful': 1, 'failed': 0}})

In [49]:
es.search(index="movies", q="title:matrix", size=3)

ObjectApiResponse({'took': 58, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 3, 'relation': 'eq'}, 'max_score': 9.737689, 'hits': [{'_index': 'movies', '_id': '603', '_score': 9.737689, '_ignored': ['meta_data.keyword'], '_source': {'movieId': 603, 'year': 1999, 'title': 'The Matrix', 'description': 'Welcome to the Real World. Set in the 22nd century, The Matrix tells the story of a computer hacker who joins a group of underground insurgents fighting the vast and powerful computers who now rule the earth.', 'meta_data': 'savingtheworld artificialintellig manvsmachin philosophi propheci martialart selfsacrific fight insurg virtualr dystopia truth cyberpunk womandirector messiah gnostic keanureeves laurencefishburne carrie-annemoss lanawachowskilanawachowskilanawachowski Action Science Fiction', 'poster_path': "<img src='http://image.tmdb.org/t/p/w185//hEpWvX6Bp79eLxY1kX5ZZJcme5U.jpg' style='height:100px;'>", 'id': 60

In [31]:
def vector_query(query_vec, vector_field, q="*", cosine=False):
    if cosine:
        score_fn = "doc['{v}'].size() == 0 ? 0 : cosineSimilarity(params.vector, '{v}') + 1.0"
    else:
        score_fn = "doc['{v}'].size() == 0 ? 0 : sigmoid(1, Math.E, -dotProduct(params.vector, '{v}'))"

    score_fn = score_fn.format(v=vector_field, fn=score_fn)

    return {
    "query": {
        "script_score": {
            "query" : {
                "query_string": {
                    "query": q
                }
            },
            "script": {
                "source": score_fn,
                "params": {
                    "vector": query_vec
                }
            }
        }
    }
}



In [35]:
def compute_global_params(index="movies"):
    body = {
        "size": 0,
        "aggs": {
            "avg_vote": {"avg": {"field": "vote_average"}},
            "min_votes": {"percentiles": {"field": "vote_count", "percents": [65]}},
            "max_popularity": {"max": {"field": "popularity"}}
        }
    }
    results = es.search(index=index, body=body)
    C = results['aggregations']['avg_vote']['value']
    m = results['aggregations']['min_votes']['values']['65.0']
    P_max = results['aggregations']['max_popularity']['value']
    return C, m, P_max

C, m, P_max = compute_global_params()


In [36]:
print(C, m, P_max)

6.362177934375689 210.14232195621454 547.488298


In [86]:
data_process_pandas = movies_data_deduplicated.toPandas()


In [94]:
rating_to_processed = ratings.toPandas()

In [95]:
rating_to_processed.to_csv(f"{PATH_TO_DATA}/ratings_processed.csv")

In [89]:
data_process_pandas.to_csv(f"{PATH_TO_DATA}/movie_data_process.csv", index=False)

In [14]:
lambda_popularity = 0.5

In [39]:
def calculate_weighted_rating(v, R, P, C, m, P_max, lambda_popularity=0.5):

    try:
        P = float(P)
    except (TypeError, ValueError):
        print("error p")
        P = 0

    if v > 0:
        wr = ((v / (v + m)) * R) + ((m / (v + m)) * C) + lambda_popularity * (P / P_max if P_max else 0)
    else:
        wr = lambda_popularity * (P / P_max if P_max else 0)
    return wr



In [18]:
def process_recommendations(hits, C, m, P_max, lambda_popularity=0.5):

    recommendations = []
    for hit in hits:
        rec = hit['_source']
        v = rec.get('vote_count', 0)
        R = rec.get('vote_average', 0)
        P = rec.get('popularity', 0)
        print(v, R, P)
        wr = calculate_weighted_rating(v, R, P, C, m, P_max, lambda_popularity)
        rec['weighted_rating'] = wr
        rec['original_score'] = hit['_score']
        recommendations.append(rec)
    recommendations.sort(key=lambda x: x['weighted_rating'], reverse=True)
    return recommendations


In [33]:
def get_similar(the_id, q="*", num=10, index="movies", vector_field='model_factor', cosine=False):
    response = es.get(index=index, id=the_id)
    src = response['_source']
    if vector_field in src:
        query_vec = src[vector_field]
        q = vector_query(query_vec, vector_field, q=q, cosine=cosine)
        results = es.search(index=index, body=q)
        hits = results['hits']['hits']
        recommendations = process_recommendations(hits, C, m, P_max, lambda_popularity)
        return src, recommendations[:num+1]

In [40]:
get_similar(2019)

({'movieId': 2019,
  'year': 1993,
  'title': 'Hard Target',
  'description': "Don't hunt what you can't kill. When a woman's father goes missing, she enlist a local to aid in her search.  The pair soon discover that her father has died at the hands of a wealthy sportsman who hunts homeless men as a form of recreation.",
  'meta_data': 'neworlean mercenari huntinghumanb money drifter union sailor humanprey jean-claudevandamme arnoldvosloo lancehenriksen johnwoojohnwoojohnwoo Action Adventure Crime Thriller',
  'tmdbId': 'tt0107076',
  'vote_count': 237.0,
  'vote_average': 6.1,
  'popularity': '7.834351',
  'meta_factor': [0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0

In [20]:

def get_user_recs(the_id, q="*", num=10, users="users", movies="movies", vector_field='model_factor'):
    response = es.get(index=users, id=the_id)
    src = response['_source']
    if vector_field in src:
        query_vec = src[vector_field]
        q = vector_query(query_vec, vector_field, q=q, cosine=False)
        results = es.search(index=movies, body=q)
        hits = results['hits']['hits']
        recommendations = process_recommendations(hits, C, m, P_max, lambda_popularity)
        return src, recommendations[:num]

In [103]:
import json
from requests import HTTPError
import requests
def get_poster_url(movie_id):
    try:
        IMAGE_URL = 'https://image.tmdb.org/t/p/w500'
        url = "https://api.themoviedb.org/3/movie/"+ str(movie_id) + "?language=en-US"

        headers = {
            "accept": "application/json",
            "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiJkNDM4ZjJmOGVmMjk5ZmI4ZTA5MWVlZDEyZWY0YzQyMiIsIm5iZiI6MTY3MDEyNzA0NC43Niwic3ViIjoiNjM4YzFkYzQwZTY0YWYwMGRlYWIyMTQ5Iiwic2NvcGVzIjpbImFwaV9yZWFkIl0sInZlcnNpb24iOjF9.o2bVzOt4bu9yJCQnB-7TLgyF-_VSS-sLPhQ8YAdz0-U"
        }
        movie_info  = requests.get(url, headers=headers)
        movie_info = json.loads(movie_info.text)
        movie_poster_url = IMAGE_URL + movie_info['poster_path']
        return movie_poster_url
    except HTTPError as e:
        if e.response.status_code == 401:
            j = json.loads(e.response.text)
            print(j)


In [55]:
def get_hybrid_recommendations(user_id, content_id, alpha=0.5, num=10,
                               user_index="users", movie_index="movies",
                               user_vector_field='model_factor', content_vector_field='meta_factor'):

    user_src, user_recs = get_user_recs(user_id, users=user_index, movies=movie_index, vector_field=user_vector_field, num=num * 2)
    user_scores = {rec['movieId']: rec['original_score'] for rec in user_recs}
    content_src, content_recs = get_similar(content_id, index=movie_index, vector_field=content_vector_field, num=num * 2, cosine=False)
    content_scores = {rec['movieId']: rec['original_score'] for rec in content_recs}
    combined_scores = {}
    for movie_id in set(user_scores.keys()).union(content_scores.keys()):
        user_score = user_scores.get(movie_id, 0)
        content_score = content_scores.get(movie_id, 0)
        combined_scores[movie_id] = alpha * user_score + (1 - alpha) * content_score
    ranked_movies = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:num]
    recommendations = []
    for movie_id, score in ranked_movies:
        movie_data = es.get(index=movie_index, id=movie_id)['_source']
        movie_data['score'] = score
        recommendations.append(movie_data)
    return recommendations


In [56]:
get_hybrid_recommendations(1, 2019)

[{'movieId': 2019,
  'year': 1993,
  'title': 'Hard Target',
  'description': "Don't hunt what you can't kill. When a woman's father goes missing, she enlist a local to aid in her search.  The pair soon discover that her father has died at the hands of a wealthy sportsman who hunts homeless men as a form of recreation.",
  'meta_data': 'neworlean mercenari huntinghumanb money drifter union sailor humanprey jean-claudevandamme arnoldvosloo lancehenriksen johnwoojohnwoojohnwoo Action Adventure Crime Thriller',
  'tmdbId': 'tt0107076',
  'vote_count': 237.0,
  'vote_average': 6.1,
  'popularity': '7.834351',
  'meta_factor': [0.0,
   0.0,
   1.0,
   0.0,
   1.0,
   1.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   1.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0

In [21]:
from IPython.core.display import HTML, Image


def get_movies_for_user(the_id, num=10, ratings="ratings", movies="movies"):
    response = es.search(index=ratings, q="userId:{}".format(the_id), size=num, sort=["rating:desc"])
    hits = response['hits']['hits']
    ids = [h['_source']['movieId'] for h in hits]
    movies = es.mget(body={"ids": ids}, index=movies, _source_includes=['tmdbId', 'title'])
    movies_hits = movies['docs']
    tmdbids = [h['_source'] for h in movies_hits]
    return tmdbids


def display_user_recs(the_id, q="*", num=10, num_last=10, users="users", movies="movies", ratings="ratings"):
    user, recs = get_user_recs(the_id, q, num, users, movies)
    user_movies = get_movies_for_user(the_id, num_last, ratings, movies)
    # check that posters can be displayed
    first_movie = user_movies[0]
    first_im_url = get_poster_url(first_movie['tmdbId'])
    if first_im_url == "NA":
        display(HTML("<i>Cannot import tmdbsimple. No movie posters will be displayed!</i>"))
    if first_im_url == "KEY_ERR":
        display(HTML("<i>Key error accessing TMDb API. Check your API key. No movie posters will be displayed!</i>"))

    # display the movies that this user has rated highly
    display(HTML("<h2>Get recommended movies for user id %s</h2>" % the_id))
    display(HTML("<h4>The user has rated the following movies highly:</h4>"))
    user_html = "<table border=0>"
    i = 0
    for movie in user_movies:
        movie_im_url = get_poster_url(movie['tmdbId'])
        movie_title = movie['title']
        user_html += "<td><h5>%s</h5><img src=%s width=150></img></td>" % (movie_title, movie_im_url)
        i += 1
        if i % 5 == 0:
            user_html += "</tr><tr>"
    user_html += "</tr></table>"
    display(HTML(user_html))
    # now display the recommended movies for the user
    display(HTML("<br>"))
    display(HTML("<h2>Recommended movies:</h2>"))
    rec_html = "<table border=0>"
    i = 0
    for rec in recs:
        r_im_url = get_poster_url(rec['_source']['tmdbId'])
        r_score = rec['_score']
        r_title = rec['_source']['title']
        rec_html += "<td><h5>%s</h5><img src=%s width=150></img></td><td><h5>%2.3f</h5></td>" % (r_title, r_im_url, r_score)
        i += 1
        if i % 5 == 0:
            rec_html += "</tr><tr>"
    rec_html += "</tr></table>"
    display(HTML(rec_html))


def display_similar(the_id, q="*", num=10, movies="movies"):
    """
    Display query movie, together with similar movies and similarity scores, in a table
    """
    movie, recs = get_similar(the_id, q, num, movies)
    q_im_url = get_poster_url(movie['tmdbId'])
    if q_im_url == "NA":
        display(HTML("<i>Cannot import tmdbsimple. No movie posters will be displayed!</i>"))
    if q_im_url == "KEY_ERR":
        display(HTML("<i>Key error accessing TMDb API. Check your API key. No movie posters will be displayed!</i>"))

    display(HTML("<h2>Get similar movies for:</h2>"))
    display(HTML("<h4>%s</h4>" % movie['title']))
    display(HTML("<p>%s</p>" % movie['description']))
    if q_im_url != "NA":
        display(Image(q_im_url, width=200))
    display(HTML("<br>"))
    display(HTML("<h2>People who liked this movie also liked these:</h2>"))
    sim_html = "<table border=0>"
    i = 0
    for rec in recs:
        r_im_url = get_poster_url(rec['_source']['tmdbId'])
        r_score = rec['_score']
        r_title = rec['_source']['title']
        sim_html += "<td><h5>%s</h5><img src=%s width=150></img></td><td><h5>%2.3f</h5></td>" % (r_title, r_im_url, r_score)
        i += 1
        if i % 5 == 0:
            sim_html += "</tr><tr>"
    sim_html += "</tr></table>"
    display(HTML(sim_html))


In [22]:
display_similar(2019)

BadRequestError: BadRequestError(400, 'search_phase_execution_exception', 'runtime error')