In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/movielens-20m-dataset/rating.csv
/kaggle/input/movielens-20m-dataset/link.csv
/kaggle/input/movielens-20m-dataset/genome_tags.csv
/kaggle/input/movielens-20m-dataset/genome_scores.csv
/kaggle/input/movielens-20m-dataset/tag.csv
/kaggle/input/movielens-20m-dataset/movie.csv
/kaggle/input/svd/scikitlearn/default/1/svd_model.pkl


In [5]:
import pandas as pd

# Load data
ratings = pd.read_csv('/kaggle/input/movielens-20m-dataset/rating.csv')
movies = pd.read_csv('/kaggle/input/movielens-20m-dataset/movie.csv')
tags = pd.read_csv('/kaggle/input/movielens-20m-dataset/tag.csv')
genome_tags = pd.read_csv('/kaggle/input/movielens-20m-dataset/genome_tags.csv')
genome_scores = pd.read_csv('/kaggle/input/movielens-20m-dataset/genome_scores.csv')
links = pd.read_csv('/kaggle/input/movielens-20m-dataset/link.csv')

# Merge tags with genome scores for semantic features
genome = genome_scores.merge(genome_tags, on='tagId')

# Merge movie metadata with tags and genres
movie_data = movies.merge(genome_scores, on='movieId', how='left').merge(genome_tags, on='tagId', how='left')

In [6]:
# Check for missing values
ratings.isnull().sum()
movies.isnull().sum()
tags.isnull().sum()


userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [7]:
movie_data.dropna()

Unnamed: 0,movieId,title,genres,tagId,relevance,tag
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,0.02500,007
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,0.02500,007 (series)
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,0.05775,18th century
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,0.09675,1920s
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,0.14675,1930s
...,...,...,...,...,...,...
11726643,131170,Parallels (2015),Sci-Fi,1124.0,0.58775,writing
11726644,131170,Parallels (2015),Sci-Fi,1125.0,0.01075,wuxia
11726645,131170,Parallels (2015),Sci-Fi,1126.0,0.01575,wwii
11726646,131170,Parallels (2015),Sci-Fi,1127.0,0.11450,zombie


In [50]:
ratings.head(1)

Unnamed: 0,userId,movieId,rating,timestamp,normalized_rating,hour,day_of_week,weekend
0,1,2,3.5,2005-04-02 23:53:47,-0.635279,23,5,1


In [8]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])
tags['timestamp'] = pd.to_datetime(tags['timestamp'])


In [9]:
user_rating_counts = ratings.groupby('userId').size()
active_users = user_rating_counts[user_rating_counts > 5].index
ratings = ratings[ratings['userId'].isin(active_users)]


In [10]:
ratings['normalized_rating'] = ratings.groupby('userId')['rating'].transform(
    lambda x: (x - x.mean()) / x.std()
)

In [11]:
# Fix: Replace NaN genres with empty list
movies['genres'] = movies['genres'].fillna('')  # replace NaN with empty string
movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])

In [12]:
ratings['hour'] = ratings['timestamp'].dt.hour
ratings['day_of_week'] = ratings['timestamp'].dt.dayofweek
ratings['weekend'] = ratings['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
genre_features = pd.DataFrame(mlb.fit_transform(movies['genres']), columns=mlb.classes_)
movies = pd.concat([movies, genre_features], axis=1)

In [51]:
movies.head()

Unnamed: 0,movieId,title,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",0,0,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),[Comedy],0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
!pip install scikit-surprise



In [14]:
from surprise import Dataset, Reader, SVD, accuracy
from surprise.model_selection import train_test_split

In [15]:
# Use only the columns needed
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=0.2)

In [16]:
# algo = SVD()
# algo.fit(trainset)
# predictions = algo.test(testset)

# # Evaluate performance
# from surprise import accuracy
# rmse = accuracy.rmse(predictions)
# print(f"RMSE: {rmse:.4f}")

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Build movie feature matrix
movie_features = movies.set_index('movieId')[mlb.classes_]  # genre binary vectors
cosine_sim = cosine_similarity(movie_features)

# Function to get top N similar movies
def get_similar_movies(movie_id, top_n=10):
    idx = movie_features.index.get_loc(movie_id)
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    similar_movie_ids = [movie_features.index[i] for i, _ in sim_scores[1:top_n+1]]
    return similar_movie_ids

# Example usage
similar_movies = get_similar_movies(1, top_n=5)
print(similar_movies)

[2294, 3114, 3754, 4016, 4886]


In [20]:
def rule_based_filter(movies_df, min_year=2000, exclude_genre='Horror'):
    return movies_df[
        (~movies_df[exclude_genre]) & 
        (movies_df['title'].str.extract(r'\((\d{4})\)').astype(float)[0] > min_year)
    ]

In [63]:
reco = rule_based_filter(movies)
recom = reco.iloc[:, :3]
print(recom.head(5))

      movieId                       title                             genres
3958     4052            Antitrust (2001)           [Crime, Drama, Thriller]
3959     4053          Double Take (2001)                   [Action, Comedy]
3960     4054  Save the Last Dance (2001)                   [Drama, Romance]
3962     4056          Pledge, The (2001)  [Crime, Drama, Mystery, Thriller]
3974     4068        Sugar & Spice (2001)                           [Comedy]


  return op(a, b)


In [61]:
recom = reco.iloc[:, :3]
print(recom)

       movieId                          title  \
3958      4052               Antitrust (2001)   
3959      4053             Double Take (2001)   
3960      4054     Save the Last Dance (2001)   
3962      4056             Pledge, The (2001)   
3974      4068           Sugar & Spice (2001)   
...        ...                            ...   
27270   131248          Brother Bear 2 (2006)   
27273   131254   Kein Bund für's Leben (2007)   
27274   131256  Feuer, Eis & Dosenbier (2002)   
27275   131258             The Pirates (2014)   
27276   131260            Rentun Ruusu (2001)   

                                                  genres  
3958                            [Crime, Drama, Thriller]  
3959                                    [Action, Comedy]  
3960                                    [Drama, Romance]  
3962                   [Crime, Drama, Mystery, Thriller]  
3974                                            [Comedy]  
...                                                  ... 

In [21]:
def recommend_movies(user_id, model, ratings, movies, genre_sim_df=None, hybrid=True, top_n=10):
    # 1. Get all movies the user hasn't rated
    rated_movies = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    all_movies = movies['movieId'].tolist()
    unseen_movies = [m for m in all_movies if m not in rated_movies]

    # 2. Predict ratings for unseen movies
    predictions = [model.predict(user_id, movie_id) for movie_id in unseen_movies]
    pred_df = pd.DataFrame(predictions)
    pred_df = pred_df[['iid', 'est']].rename(columns={'iid': 'movieId', 'est': 'predicted_rating'})

    # 3. Merge with movie titles
    result = pred_df.merge(movies[['movieId', 'title']], on='movieId')

    # 4. Optional: Boost scores using genre similarity to user’s favorites
    if hybrid and genre_sim_df is not None:
        # Get top liked movies by this user
        top_user_movies = ratings[(ratings['userId'] == user_id) & (ratings['rating'] >= 4)]['movieId'].tolist()
        
        # Average similarity of each unseen movie to user's top liked movies
        def genre_score(mid):
            sims = [genre_sim_df.at[mid, liked] for liked in top_user_movies if liked in genre_sim_df.index and mid in genre_sim_df.columns]
            return np.mean(sims) if sims else 0

        result['genre_score'] = result['movieId'].apply(genre_score)
        # Combine predicted CF score and genre similarity
        result['hybrid_score'] = 0.7 * result['predicted_rating'] + 0.3 * result['genre_score']
        result = result.sort_values(by='hybrid_score', ascending=False)
    else:
        result = result.sort_values(by='predicted_rating', ascending=False)

    return result.head(top_n)

In [22]:
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract genre binary matrix
genre_matrix = movies.set_index('movieId')[mlb.classes_]  # mlb.classes_ from Phase 1

# Step 2: Compute cosine similarity
cosine_sim = cosine_similarity(genre_matrix)

# Step 3: Convert to DataFrame
cosine_sim_df = pd.DataFrame(cosine_sim, index=genre_matrix.index, columns=genre_matrix.index)


In [23]:
import joblib

# Step 1: Load the trained SVD model
algo = joblib.load('/kaggle/input/svd/scikitlearn/default/1/svd_model.pkl')


In [24]:
recommendations = recommend_movies(
    user_id=2,
    model=algo,  
    ratings=ratings,
    movies=movies,
    genre_sim_df=cosine_sim_df,
    hybrid=True,
    top_n=5
)

print(recommendations[['title', 'predicted_rating']])


                                                   title  predicted_rating
1157   Raiders of the Lost Ark (Indiana Jones and the...          4.993207
3437                                    Gladiator (2000)          4.926493
6980   Lord of the Rings: The Return of the King, The...          4.926263
14531                                      Avatar (2009)          4.900830
1194                              Terminator, The (1984)          4.866769


In [25]:
import pandas as pd

def split_ratings_by_time(ratings_df, user_id, short_term_days=90):
    user_ratings = ratings_df[ratings_df['userId'] == user_id]
    now = ratings_df['timestamp'].max()
    cutoff = now - pd.Timedelta(days=short_term_days)

    short_term = user_ratings[user_ratings['timestamp'] > cutoff]
    long_term = user_ratings[user_ratings['timestamp'] <= cutoff]

    return short_term, long_term


In [26]:
def get_model_predictions(user_id, model, ratings_df, movies_df, unseen_movies):
    preds = [model.predict(user_id, mid) for mid in unseen_movies]
    pred_df = pd.DataFrame(preds)[['iid', 'est']].rename(columns={'iid': 'movieId', 'est': 'predicted_rating'})
    pred_df = pred_df.merge(movies_df[['movieId', 'title']], on='movieId', how='left')
    return pred_df


In [27]:
def blended_recommendation(user_id, model, ratings_df, movies_df, alpha=0.7, short_term_days=90, top_n=10):
    # Step 1: Unseen movies
    rated = ratings_df[ratings_df['userId'] == user_id]['movieId'].tolist()
    all_movies = movies_df['movieId'].tolist()
    unseen_movies = [mid for mid in all_movies if mid not in rated]

    # Step 2: Split ratings
    short_term, long_term = split_ratings_by_time(ratings_df, user_id, short_term_days)

    # Step 3: Short-Term and Long-Term Model Predictions
    short_scores = get_model_predictions(user_id, model, short_term, movies_df, unseen_movies)
    long_scores = get_model_predictions(user_id, model, long_term, movies_df, unseen_movies)

    # Step 4: Merge + Blend
    merged = short_scores.merge(long_scores, on='movieId', suffixes=('_short', '_long'))
    merged['blended_score'] = alpha * merged['predicted_rating_long'] + (1 - alpha) * merged['predicted_rating_short']
    merged = merged.sort_values(by='blended_score', ascending=False)

    return merged[['movieId', 'title_short', 'blended_score']].head(top_n).rename(columns={'title_short': 'title'})


In [28]:
recommendations = blended_recommendation(
    user_id=2,
    model=algo,                # your trained SVD model
    ratings_df=ratings,        # full ratings DataFrame
    movies_df=movies,          # full movies DataFrame
    alpha=0.6,                 # 60% long-term, 40% short-term
    short_term_days=90,
    top_n=5
)

print(recommendations)


      movieId                                              title  \
1157     1198  Raiders of the Lost Ark (Indiana Jones and the...   
308       318                   Shawshank Redemption, The (1994)   
3437     3578                                   Gladiator (2000)   
6980     7153  Lord of the Rings: The Return of the King, The...   
7295     7502                            Band of Brothers (2001)   

      blended_score  
1157       4.993207  
308        4.931973  
3437       4.926493  
6980       4.926263  
7295       4.918573  


In [29]:
def get_user_history(user_id, ratings_df, min_rating=4.0):
    """Return movieIds the user watched and liked"""
    return ratings_df[(ratings_df['userId'] == user_id) & (ratings_df['rating'] >= min_rating)]['movieId'].tolist()


In [30]:
def get_genre_profile(movie_ids, movies_df, genre_columns):
    """Average genre vector for a list of movies"""
    genre_data = movies_df[movies_df['movieId'].isin(movie_ids)][genre_columns]
    return genre_data.mean().values.reshape(1, -1)  # 2D vector


In [31]:
from sklearn.metrics.pairwise import cosine_similarity

def recommend_from_history(user_id, ratings_df, movies_df, genre_columns, top_n=10):
    watched = get_user_history(user_id, ratings_df)
    if not watched:
        return pd.DataFrame(columns=['movieId', 'title', 'score'])

    # Step 1: Build content profile
    user_profile = get_genre_profile(watched, movies_df, genre_columns)

    # Step 2: Filter unseen movies
    seen = set(ratings_df[ratings_df['userId'] == user_id]['movieId'])
    candidates = movies_df[~movies_df['movieId'].isin(seen)].copy()

    # Step 3: Compute similarity
    candidate_genres = candidates[genre_columns].values
    sims = cosine_similarity(user_profile, candidate_genres).flatten()
    candidates['score'] = sims

    return candidates.sort_values(by='score', ascending=False)[['movieId', 'title', 'score']].head(top_n)


In [32]:
genre_columns = list(mlb.classes_)  # genres from Phase 1
recommendations = recommend_from_history(
    user_id=1,
    ratings_df=ratings,
    movies_df=movies,
    genre_columns=genre_columns,
    top_n=5
)

print(recommendations)


       movieId                                            title     score
2532      2617                                Mummy, The (1999)  0.884538
14409    72165  Cirque du Freak: The Vampire's Assistant (2009)  0.884538
24922   117646            Dragonheart 2: A New Beginning (2000)  0.879678
2344      2429                          Mighty Joe Young (1998)  0.851835
10644    41569                                 King Kong (2005)  0.851835


In [35]:
import requests

user_query = "movie like interstellar but more romantic"

prompt = f"""
You are a movie assistant. Extract the key elements from the following query:

"{user_query}"

Return like this:
{{
    "reference_movie": "Movie Title",
    "modification": "What is added or changed (e.g. more romantic, darker)",
    "target_genres": ["List of inferred genres"],
    "tone": "Mood or tone of request"
}}
"""

# DeepSeek API call
url = "https://openrouter.ai/api/v1/chat/completions"
headers = {
    "Authorization": "Bearer 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",
    "Content-Type": "application/json"
}
data = {
    "model": "deepseek/deepseek-chat:free",
    "messages": [
        {"role": "system", "content": "You are an intelligent assistant."},
        {"role": "user", "content": prompt}
    ]
}

response = requests.post(url, headers=headers, json=data)

if response.status_code == 200:
    output = response.json()
    print("Parsed Output:\n", output["choices"][0]["message"]["content"].strip())
else:
    print(f"Error {response.status_code}: {response.text}")

Parsed Output:
 {
    "reference_movie": "Interstellar",
    "modification": "more romantic",
    "target_genres": ["Science Fiction", "Romance", "Adventure"],
    "tone": "Reflective and emotional"
}


In [39]:
def recommend_based_on_query(parsed, movies_df, svd_model, ratings_df, top_n=5):
    import joblib
    import numpy as np


    # Step 2: Get list of genres from parsed query
    target_genres = parsed.get("target_genres", [])
    tone = parsed.get("tone", "").lower()
    reference_movie = parsed.get("reference_movie", "")

    print(f"\nParsed Query:\nReference: {reference_movie}\nGenres: {target_genres}\nTone: {tone}")

    # Step 3: Filter movies based on genre (match at least one genre)
    filtered_movies = movies_df[
        movies_df['genres'].apply(
            # lambda g: any(genre.lower() in g.lower() for genre in target_genres)
            lambda g: any(genre.lower() in [x.lower() for x in g] for genre in target_genres)
        )
    ]
    print(f"Movies after genre filter: {len(filtered_movies)}")

    # Step 4: Optionally filter based on tone (if your dataset has description or keywords)
    if tone and 'description' in movies_df.columns:
        filtered_movies = filtered_movies[
            filtered_movies['description'].str.contains(tone, case=False, na=False)
        ]
        print(f"Movies after tone filter: {len(filtered_movies)}")

    # Step 5: Fallback if no movies left
    if filtered_movies.empty:
        print("⚠️ No movies matched genre/tone — falling back to top-rated similar movies.")
        filtered_movies = movies_df.copy()

    # Step 6: Predict ratings for a fake new user (user_id = 999999)
    user_id = 999999
    all_movie_ids = filtered_movies['movieId'].tolist()

    # Predict using SVD
    predictions = []
    for movie_id in all_movie_ids:
        try:
            pred = svd_model.predict(user_id, movie_id).est
            predictions.append((movie_id, pred))
        except:
            continue

    # Sort by predicted rating
    predictions.sort(key=lambda x: x[1], reverse=True)

    # Step 7: Get top N movie details
    top_recs = []
    for movie_id, pred_rating in predictions[:top_n]:
        title = movies_df[movies_df['movieId'] == movie_id]['title'].values[0]
        top_recs.append({
            'movieId': movie_id,
            'title': title,
            'predicted_rating': pred_rating
        })

    return top_recs


In [48]:
import requests

# Replace with your actual API key
DEEPSEEK_API_KEY = "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"
API_URL = "https://openrouter.ai/api/v1/chat/completions"
HEADERS = {
    "Authorization": f"Bearer {DEEPSEEK_API_KEY}",
    "Content-Type": "application/json"
}


def parse_user_query(user_query):
    """
    Uses DeepSeek to parse a natural-language movie query into structured JSON data.
    """
    prompt = f"""
You are a movie assistant. Extract the key elements from the following query:

"{user_query}"

Return like this:
{{
    "reference_movie": "Movie Title",
    "modification": "What is added or changed (e.g. more romantic, darker)",
    "target_genres": ["List of inferred genres"],
    "tone": "Mood or tone of request"
}}
"""

    data = {
        "model": "deepseek/deepseek-chat:free",
        "messages": [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(API_URL, headers=HEADERS, json=data)
    if response.status_code == 200:
        content = response.json()["choices"][0]["message"]["content"]
        return eval(content)  # Assumes DeepSeek returns valid JSON
    else:
        raise Exception(f"DeepSeek API error {response.status_code}: {response.text}")

def generate_explanation(movie_list, parsed_data):
    """
    Uses DeepSeek to generate a brief explanation of why the recommended movies match the user's intent.
    """
    # Fix: access movie title from dictionary
    titles = ", ".join([m["title"] for m in movie_list])

    prompt = f"""
A user wanted recommendations for movies like "{parsed_data['reference_movie']}" but {parsed_data['modification']}.
You recommended: {titles}.
Explain briefly why these movies match, considering tone, genre, and similarity.
"""

    data = {
        "model": "deepseek/deepseek-chat:free",
        "messages": [
            {"role": "system", "content": "You are a helpful movie assistant. Respond concisely and insightfully."},
            {"role": "user", "content": prompt}
        ]
    }

    response = requests.post(API_URL, headers=HEADERS, json=data)
    if response.status_code == 200:
        return response.json()["choices"][0]["message"]["content"].strip()
    else:
        raise Exception(f"DeepSeek API error {response.status_code}: {response.text}")


In [44]:
user_input = "movie like Interstellar but more romantic"
parsed = parse_user_query(user_input)


print("Parsed Query:", parsed)



Parsed Query: {'reference_movie': 'Interstellar', 'modification': 'more romantic', 'target_genres': ['Science Fiction', 'Romance', 'Adventure'], 'tone': 'Thoughtful and emotional'}


In [49]:
# Step 4: Get recommendations
recs = recommend_based_on_query(parsed, movies, algo, ratings, top_n=5)
explanation = generate_explanation(recs, parsed)

# Step 5: Print output
print("Top Recommendations:")
for rec in recs:
    print(f"{rec['title']} - Predicted Rating: {rec['predicted_rating']:.2f}")
print("Explanation:\n", explanation)


Parsed Query:
Reference: Interstellar
Genres: ['Science Fiction', 'Romance', 'Adventure']
Tone: thoughtful and emotional
Movies after genre filter: 6162
Top Recommendations:
Lives of Others, The (Das leben der Anderen) (2006) - Predicted Rating: 4.26
Life Is Beautiful (La Vita è bella) (1997) - Predicted Rating: 4.26
North & South (2004) - Predicted Rating: 4.23
Spirited Away (Sen to Chihiro no kamikakushi) (2001) - Predicted Rating: 4.23
Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001) - Predicted Rating: 4.21
Explanation:
 These films, while not identical to *Interstellar*, share themes of emotional depth, human connection, and a blend of realism with elements of wonder or fantasy. *The Lives of Others* explores intimate, human stories within a tense, artistic framework. *Life Is Beautiful* balances romance and heartbreak with resilience and imagination. *North & South* combines grand themes with a tender love story. *Spirited Away* and *Amelie* both infuse whimsy and magical re