In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [25]:
class CosineSimilarityPromptModel:
    def __init__(self, data_path):
        # Load and preprocess data
        self.movies = pd.read_csv(data_path)
        self.movies = self.movies[['movie_id', 'tags']]
        self.movies['tags'] = self.movies['tags'].fillna('')
        
        # Create a TF-IDF matrix for the overview feature
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf.fit_transform(self.movies['tags'])
    
    def get_recommendations_from_prompt(self, prompt, num_recommendations=10):
        # Convert the input prompt into a TF-IDF vector
        prompt_tfidf = self.tfidf.transform([prompt])
        
        # Compute cosine similarity between the input prompt and all movie overviews
        cosine_sim_prompt = cosine_similarity(prompt_tfidf, self.tfidf_matrix)
        
        # Get the similarity scores for all movies
        sim_scores = list(enumerate(cosine_sim_prompt[0]))
        
        # Sort the movies based on the similarity scores
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        
        # Get the indices of the top 'num_recommendations' similar movies
        sim_scores = sim_scores[:num_recommendations]  # No need to exclude the prompt itself
        
        # Get the movie indices
        movie_indices = [i[0] for i in sim_scores]
        
        # Return the titles of the top 'num_recommendations' similar movies
        return self.movies['movie_id'].iloc[movie_indices].values

    def save_model(self, filename):
        # Save the model (TF-IDF vectorizer, TF-IDF matrix, and movie data) using pickle
        with open(filename, 'wb') as f:
            pickle.dump({
                'tfidf': self.tfidf,
                'tfidf_matrix': self.tfidf_matrix,
                'movies': self.movies
            }, f)

    @staticmethod
    def load_model(filename):
        with open(filename, 'rb') as f:
            data = pickle.load(f)
        
        model = CosineSimilarityPromptModel.__new__(CosineSimilarityPromptModel)
        model.tfidf = data['tfidf']
        model.tfidf_matrix = data['tfidf_matrix']
        model.movies = data['movies']
        return model

In [43]:
if __name__ == "__main__":
    df= pd.read_csv('model/data/Movies.csv')
    # Step 1: Build the model
    model = CosineSimilarityPromptModel('model/data/Movies.csv')
    
    # Step 2: Save the model to a pickle file
    model.save_model('cosine_similarity_prompt_model.pkl')
    
    # Step 3: Load the model from the pickle file
    loaded_model = CosineSimilarityPromptModel.load_model('cosine_similarity_prompt_model.pkl')
    
    # Step 4: Get recommendations from a prompt (movie description or overview)
    prompt = "christian bale"
    
    # Call the method with the correct parameters: prompt and number of recommendations
    recommendations = loaded_model.get_recommendations_from_prompt(prompt, num_recommendations=10)
    
    # Print recommended movies based on the prompt
    print(f"Movies recommended based on the prompt:")
    for movie in recommendations:
        print(df[df['movie_id']==movie]['title'])

Movies recommended based on the prompt:
1651    The New World
Name: title, dtype: object
317    The Flowers of War
Name: title, dtype: object
2255    Equilibrium
Name: title, dtype: object
3616    Saved!
Name: title, dtype: object
2534    Little Women
Name: title, dtype: object
3259    American Psycho
Name: title, dtype: object
873    Shaft
Name: title, dtype: object
4103    Harsh Times
Name: title, dtype: object
43    Terminator Salvation
Name: title, dtype: object
2245    I'm Not There.
Name: title, dtype: object
