# Movie Recommendations Using OpenAI's Embeddings

## Libraries Installation

In [None]:
pip install datasets==3.0.0 openai==1.16.2 pandas==1.5.3

## Module 1
### Task 1: Data Loading
Load the dataset titled "AIatMongoDB/embedded_movies". This dataset is a collection of movie-related details, including attributes such as the title, release year, cast, and plot. A distinctive feature of this dataset is the `plot_embedding` field stored for each movie; these embeddings were generated with OpenAI's text-embedding-ada-002 model. For now, simply load the dataset into a pandas DataFrame and look at the first few rows.

In [None]:
from datasets import load_dataset
import pandas as pd

# Fetch the embedded-movies dataset from Hugging Face
dataset = load_dataset("AIatMongoDB/embedded_movies")

# Turn the 'train' split into a pandas DataFrame
train_split = dataset['train']
dataset_df = pd.DataFrame(train_split)

# Preview the first few rows
first_rows = dataset_df.head()
print(first_rows)

### Task 2: Count Missing Values per Column

In [None]:
# Count the missing (null/NaN) entries in every column
null_values = dataset_df.isna().sum()

###  Inspect data 

In [None]:
# Display the per-column count of missing values
print(null_values)

### Task 3: Remove Rows with Missing Plots and Drop the Old Embeddings


In [None]:
# Discard every movie whose 'plot' value is missing
required_columns = ['plot']
dataset_df = dataset_df.dropna(subset=required_columns)

In [None]:
# Discard the 'plot_embedding' column
dataset_df = dataset_df.drop('plot_embedding', axis=1)

In [None]:
# Inspect the cleaned dataset.
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
dataset_df.info()

## Task 4: Create Embeddings with OpenAI
### Generate new embeddings using OpenAI's advanced model.



In [None]:
import openai

import os

# Set OpenAI API key.
# Prefer the OPENAI_API_KEY environment variable so the real key never has to
# be committed with the notebook; the placeholder is kept only as a fallback.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-api-key-here")

# Define the embedding model
EMBEDDING_MODEL = "text-embedding-3-small"

def get_embedding(text):
    """Generate an embedding for the given text using OpenAI's API.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector returned by the API, or None when ``text`` is
        None or blank, or when the API call fails.
    """
    # Nothing to embed: skip the API round trip for missing or blank text.
    if text is None or (isinstance(text, str) and not text.strip()):
        return None
    try:
        # openai>=1.0 removed `openai.Embedding`; the module-level client
        # exposes `openai.embeddings.create`, and the response is an object
        # accessed by attribute rather than a dict.
        response = openai.embeddings.create(input=text, model=EMBEDDING_MODEL)
        return response.data[0].embedding
    except Exception as e:
        # Deliberately best-effort: this function is applied row by row, so a
        # single failure is reported and mapped to None instead of aborting.
        print(f"Error generating embedding: {e}")
        return None

# Compute one embedding per movie plot and store it in a new column
plot_embeddings = dataset_df['plot'].apply(get_embedding)
dataset_df['plot_embedding_optimized'] = plot_embeddings

# Persist the enriched dataset as CSV (without the index column)
dataset_df.to_csv('datasets.csv', index=False)


## Task 5: Generating Movie Recommendations and Responses
### Calculate similarity scores and provide recommendations based on user queries.

In [None]:
import numpy as np

# Function to perform vector search based on embeddings
def vector_search(query_embedding, df):
    """Rank the rows of ``df`` by cosine similarity to ``query_embedding``.

    Args:
        query_embedding: Sequence of floats representing the query vector.
        df: DataFrame with a 'plot_embedding_optimized' column holding one
            embedding (or None) per row. It is not modified.

    Returns:
        A new DataFrame containing the rows that have an embedding, with an
        added 'similarity' column, sorted from most to least similar.
    """
    query_vec = np.asarray(query_embedding, dtype=float)

    # Rows whose embedding is missing (None) cannot be scored; leave them out
    # instead of crashing inside the dot product. Work on a copy so the
    # caller's DataFrame does not silently gain a 'similarity' column.
    has_embedding = df['plot_embedding_optimized'].notna()
    candidates = df[has_embedding].copy()

    if candidates.empty:
        # Keep the output schema stable even when there is nothing to rank.
        candidates['similarity'] = np.array([], dtype=float)
        return candidates

    # Stack the per-row embeddings into one (rows, dims) matrix so the
    # similarities are computed in a single vectorized pass.
    matrix = np.array(candidates['plot_embedding_optimized'].tolist(), dtype=float)
    dot_products = matrix @ query_vec
    norms = np.linalg.norm(matrix, axis=1) * np.linalg.norm(query_vec)
    candidates['similarity'] = dot_products / norms

    return candidates.sort_values(by='similarity', ascending=False)

# Function to handle user query
def handle_user_query(query, df):
    """Generate movie recommendations for a user query.

    The query is embedded, the five most similar movies are retrieved with
    ``vector_search``, and those matches are passed to the chat model as
    context for its answer.

    Args:
        query: The user's natural-language request.
        df: DataFrame of movies with 'title', 'plot' and
            'plot_embedding_optimized' columns.

    Returns:
        A ``(response, search_result)`` tuple: the model's answer and the
        formatted top matches it was given. If the query cannot be embedded,
        returns an error message and None.
    """
    query_embedding = get_embedding(query)
    if query_embedding is None:
        return "Invalid query or embedding generation failed.", None

    # Retrieve the five closest movies by cosine similarity.
    top_matches = vector_search(query_embedding, df).head(5)

    # Format search result
    search_result = "\n".join(
        f"Title: {row['title']}, Plot: {row['plot']}"
        for _, row in top_matches.iterrows()
    )

    # Include the retrieved movies in the prompt; without them the model
    # answers from its own knowledge and the vector search has no effect.
    prompt = (
        f"Recommend movies based on: {query}\n\n"
        f"Use the following search results as context:\n{search_result}"
    )

    # openai>=1.0 removed `openai.ChatCompletion`; use the module-level
    # `openai.chat.completions.create` and attribute access on the response.
    completion = openai.chat.completions.create(
        model="gpt-4",
        messages=[{"role": "user", "content": prompt}],
    )
    # The message content may be None; fall back to an empty string.
    content = completion.choices[0].message.content
    return (content or "").strip(), search_result

# Sample query
query = "What are the best action movies?"

# Generate recommendations
response, source_information = handle_user_query(query, dataset_df)

# Save results to a text file.
# UTF-8 is set explicitly so non-ASCII characters in titles or plots are
# written the same way on every platform.
with open("response.txt", "w", encoding="utf-8") as file:
    file.write(response)
    file.write("\n\nSource Information:\n")
    # handle_user_query returns None for the sources when the query could not
    # be embedded; write an empty section instead of raising TypeError.
    file.write(source_information or "")
