In [5]:
# Randomly sample 500 movies from the Kaggle dataset of top-rated movies (5000 movies in total)
import pandas as pd

# Settings for the sampling step, gathered in one place.
SOURCE_CSV = 'top_rated_movies.csv'
SAMPLE_CSV = 'sampled_movies.csv'
SAMPLE_SIZE = 500  # n = 500 movies
SAMPLE_SEED = 1    # fixed seed so the same subset is drawn on every run

# Read the full dataset.
df = pd.read_csv(SOURCE_CSV)

# Show which columns the dataset provides.
print(df.columns)

# Draw the random subset and write it out without the row index.
df_sampled = df.sample(n=SAMPLE_SIZE, random_state=SAMPLE_SEED)
df_sampled.to_csv(SAMPLE_CSV, index=False)

Index(['Unnamed: 0', 'id', 'original_language', 'title', 'overview',
       'popularity', 'release_date', 'vote_average', 'vote_count'],
      dtype='object')


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_data(filepath):
    """
    Read the movie dataset (including the movie overviews) from a CSV file.

    Args:
        filepath: Location of the CSV; handed unchanged to ``pandas.read_csv``.

    Returns:
        The parsed contents as a ``pandas.DataFrame``.
    """
    movies = pd.read_csv(filepath)
    return movies

def preprocess_data(df, text_column):
    """
    Normalise the text in ``text_column`` so it is ready for vectorisation.

    Steps, applied in order:
      1. missing values become empty strings;
      2. every character that is not an ASCII letter, a digit or whitespace
         is removed;
      3. text is lower-cased and leading/trailing whitespace is stripped;
      4. runs of whitespace are collapsed into a single space.

    Args:
        df: DataFrame holding the text column. The column is overwritten on
            this very frame (no copy is made).
        text_column: Name of the column to clean.

    Returns:
        The same ``df`` object, with ``text_column`` cleaned.

    Raises:
        ValueError: If ``text_column`` is not a column of ``df``.
    """
    if text_column not in df.columns:
        raise ValueError(f"Column '{text_column}' not found in dataframe")

    # Vectorised ``.str`` methods replace the per-row ``re.sub`` call, so the
    # function no longer depends on a module-level ``import re``.
    # Raw strings keep ``\s`` from being read as a (invalid) string escape.
    df[text_column] = (
        df[text_column]
        .fillna('')
        .astype(str)  # non-string cells (e.g. numbers) have no .str methods
        .str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)
        .str.lower()
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
    )
    return df

def build_tfidf_matrix(df, text_column):
    """
    Fit a TF-IDF model on one text column and vectorise that column.

    Args:
        df: DataFrame containing the documents.
        text_column: Name of the column whose values are the documents.

    Returns:
        A ``(matrix, model)`` pair: the TF-IDF matrix produced by
        ``fit_transform`` and the fitted ``TfidfVectorizer`` (English stop
        words removed), which is needed later to vectorise new text.
    """
    model = TfidfVectorizer(stop_words='english')
    documents = df[text_column]
    matrix = model.fit_transform(documents)
    return matrix, model

def get_recommendations(user_input, tfidf_matrix, vectorizer, df, top_n=5):
    """
    Return the ``top_n`` rows of ``df`` most similar to ``user_input``.

    The query is lower-cased, vectorised with ``vectorizer`` and compared
    against every row of ``tfidf_matrix`` using cosine similarity.

    Args:
        user_input: Free-text description to match against.
        tfidf_matrix: Document matrix to compare with; its rows are looked up
            positionally in ``df``.
        vectorizer: Object whose ``transform`` turns a list of strings into
            vectors comparable with ``tfidf_matrix``.
        df: DataFrame with at least ``title`` and ``overview`` columns.
        top_n: Maximum number of rows to return (default 5). Zero or a
            negative value yields an empty result.

    Returns:
        A ``(recommendations, scores)`` pair: the ``title``/``overview``
        columns of the selected rows, best match first, and the matching
        similarity scores in the same order.
    """
    # NOTE(review): preprocess_data in this file strips punctuation from the
    # corpus, while the query is only lower-cased here — confirm whether the
    # query should receive the same clean-up.
    user_tfidf = vectorizer.transform([user_input.lower()])
    cosine_sim = cosine_similarity(user_tfidf, tfidf_matrix).flatten()

    # Sort descending first and then cut. The previous ``[-top_n:]`` slice
    # turned top_n == 0 into ``[-0:]``, i.e. *every* row instead of none.
    limit = max(top_n, 0)
    top_indices = cosine_sim.argsort()[::-1][:limit]
    return df.iloc[top_indices][['title', 'overview']], cosine_sim[top_indices]



In [4]:
if __name__ == "__main__":
    # Location of the sampled dataset; change it if the file lives elsewhere.
    filepath = 'sampled_movies.csv'

    # Read the CSV and clean the overview text in one go.
    df = preprocess_data(load_data(filepath), 'overview')

    # Vectorise the cleaned overviews.
    tfidf_matrix, vectorizer = build_tfidf_matrix(df, 'overview')

    # Ask the user what they are in the mood for.
    user_input = input("Enter a short text description of your preferences: ")

    # Rank the movies against that description.
    recommendations, scores = get_recommendations(user_input, tfidf_matrix, vectorizer, df)

    # Print a numbered list, best match first, pairing each title with its score.
    print("Top recommendations:")
    for rank, (title, score) in enumerate(zip(recommendations['title'], scores), start=1):
        print(f"{rank}. {title} (Score: {score:.4f})")


Top recommendations:
1. The Beach Bum (Score: 0.1883)
2. Becoming Jane (Score: 0.1536)
3. The Importance of Being Earnest (Score: 0.1514)
4. Rifkin’s Festival (Score: 0.1387)
5. Only Lovers Left Alive (Score: 0.1125)
