In [None]:
# Print a greeting; presumably a quick check that the kernel executes cells.
print("Hello")

## Implementing an LLM-Powered Recommendation System

In [None]:
import os
# Switch the process working directory so that the relative file names used by
# later cells ('movies_metadata.csv', 'movies.pkl') resolve against this folder.
# NOTE(review): the path below looks like a placeholder — replace it with the
# real data directory; os.chdir raises FileNotFoundError if it does not exist.
os.chdir('/desired/default/directory/path')

### Data Preprocessing

In [None]:
import pandas as pd

# Read the raw movie metadata CSV (resolved relative to the current working
# directory) into a DataFrame, then preview its first rows.
metadata_csv = 'movies_metadata.csv'
md = pd.read_csv(metadata_csv)
md.head()

In [None]:
import pandas as pd
import ast


def _genre_names(raw_genres):
    """Return the list of genre names for one cell of the 'genres' column.

    Args:
        raw_genres: Either the string representation of a list of dicts that
            each carry a 'name' key, an already-parsed list, or a missing value.

    Returns:
        A list of genre names. Values that are neither a string nor a list
        (e.g. NaN) are returned unchanged so they still count as missing.
    """
    # Convert the string representation of the list of dicts to real objects.
    if isinstance(raw_genres, str):
        raw_genres = ast.literal_eval(raw_genres)
    # Missing cells cannot be iterated; leave them as they are.
    if not isinstance(raw_genres, list):
        return raw_genres
    # Entries that are already plain names (cell re-run) are kept as they are,
    # which makes this transformation safe to execute more than once.
    return [genre['name'] if isinstance(genre, dict) else genre for genre in raw_genres]


# Transform the 'genres' column from serialized dicts into lists of genre names
md['genres'] = md['genres'].apply(_genre_names)

md.head()

In [None]:
# Calculate weighted rate (IMDb formula)
def calculated_weighted_rate(vote_average, vote_count, min_vote_count=10, mean_vote=5.0):
    """Blend an item's average vote with a prior mean, weighted by vote count.

    Computes (v / (v + m)) * R + (m / (v + m)) * C, where R is the item's
    average vote, v its vote count, m the minimum vote count and C the prior
    mean. The more votes an item has, the closer the result is to R.

    Args:
        vote_average: Average vote of the item (R).
        vote_count: Number of votes the item received (v).
        min_vote_count: Weight given to the prior mean (m). Defaults to 10.
        mean_vote: Prior mean the result is pulled towards (C). Defaults to 5.0.

    Returns:
        The weighted rate. The arithmetic is element-wise, so scalars and
        array-like inputs such as pandas Series are both accepted.
    """
    total_votes = vote_count + min_vote_count
    return (vote_count / total_votes) * vote_average + (min_vote_count / total_votes) * mean_vote

# Minimum vote count to prevent skewed results: the 95th percentile of the
# non-null vote counts.
vote_counts = md['vote_count'].dropna().astype('int')
min_vote_count = vote_counts.quantile(0.95)

# Create a new column 'weighted_rate'. The formula is plain arithmetic, so it is
# applied to whole columns at once instead of row by row; rows with a missing
# vote_average or vote_count still end up as NaN.
# Assumes both columns are numeric — TODO confirm against the CSV.
md['weighted_rate'] = calculated_weighted_rate(md['vote_average'], md['vote_count'], min_vote_count)
md.head()

In [None]:
# Discard every row that has a missing value in any column.
# NOTE(review): this considers all columns, not only the ones used later —
# confirm that dropping this broadly is intended.
md = md.dropna(axis=0, how='any')

In [None]:
# Keep only the columns needed downstream and renumber the rows from 0.
selected_columns = ['genres', 'title', 'overview', 'weighted_rate']
md_final = md.loc[:, selected_columns].reset_index(drop=True)
md_final.head()

In [None]:
# Create a new column by combining 'title', 'overview', 'genres' and
# 'weighted_rate' into one descriptive string per movie.
md_final['combined_info'] = md_final.apply(
    # Each row is a Series, so fields are read with [] indexing; calling the
    # row like a function (row('title')) raises TypeError.
    lambda row: f"Title: {row['title']}, Overview: {row['overview']} Genres: {', '.join(row['genres'])}, Rating: {row['weighted_rate']}",
    axis=1,
)
# Show one example of the combined text.
md_final['combined_info'][9]

### Embeddings

In [None]:
# imports
import pandas as pd
import tiktoken
import os
import openai

# Read the API key from the environment so it is never hard-coded here.
openai.api_key = os.environ["OPENAI_API_KEY"]

from openai.embeddings_utils import get_embedding

# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base" # this is the encoding for text-embedding-ada-002
max_tokens = 8000 # the maximum for text-embedding-ada-002 is 8191

encoding = tiktoken.get_encoding(embedding_encoding)

# omit movies whose combined text is too long to embed
md_final["n_tokens"] = md_final.combined_info.apply(lambda x: len(encoding.encode(x)))
# .copy() makes the filtered frame independent, so columns added to it in later
# cells do not trigger pandas' SettingWithCopyWarning.
md_final = md_final[md_final.n_tokens <= max_tokens].copy()
len(md_final)

In [None]:
# Preview the first five rows of the filtered frame.
md_final.head(n=5)

In [None]:
import os
import openai
# Read the API key from the environment; the variable name must match the one
# used when the embedding helpers were set up ("OPENAI_API_KEY").
openai.api_key = os.environ["OPENAI_API_KEY"]

# Request one embedding per movie; this makes one API call per row.
# NOTE(review): the embedding is computed from 'overview' only, while the token
# filter was applied to 'combined_info' — confirm which text should be embedded.
md_final["embedding"] = md_final.overview.apply(lambda x: get_embedding(x, engine=embedding_model))

In [None]:
# Rename the embedding and combined-text columns to 'vector' and 'text' in one
# call, then persist the frame to disk as a pickle.
column_renames = {'embedding': 'vector', 'combined_info': 'text'}
md_final.rename(columns=column_renames, inplace=True)
md_final.to_pickle('movies.pkl')