<a href="https://colab.research.google.com/github/musti9311/CodTech-Internship/blob/main/Recommendation_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import zipfile
import requests
import io


In [2]:
# 1. Download MovieLens Small Dataset (Runs automatically)
print("Downloading MovieLens Dataset...")
url = "https://files.grouplens.org/datasets/movielens/ml-latest-small.zip"
r = requests.get(url)
z = zipfile.ZipFile(io.BytesIO(r.content))
z.extractall()

Downloading MovieLens Dataset...


In [3]:
# 2. Load Data into Pandas
print("Loading Data...")
ratings = pd.read_csv("ml-latest-small/ratings.csv")
movies = pd.read_csv("ml-latest-small/movies.csv")

Loading Data...


In [4]:
# Merge data to get movie titles
data = pd.merge(ratings, movies, on='movieId')

print(f"Dataset Loaded: {data.shape[0]} ratings, {data['movieId'].nunique()} movies.")
print(data[['userId', 'title', 'rating']].head())
print("-" * 30)

Dataset Loaded: 100836 ratings, 9724 movies.
   userId                        title  rating
0       1             Toy Story (1995)     4.0
1       1      Grumpier Old Men (1995)     4.0
2       1                  Heat (1995)     4.0
3       1  Seven (a.k.a. Se7en) (1995)     5.0
4       1   Usual Suspects, The (1995)     5.0
------------------------------


In [5]:
# Create a User-Item Matrix (Rows = Users, Cols = Movies, Values = Ratings)
# We fill NaN with 0 (implying the user hasn't seen the movie yet)
user_movie_matrix = data.pivot_table(index='userId', columns='title', values='rating')
user_movie_matrix_filled = user_movie_matrix.fillna(0)

print(f"Matrix Shape: {user_movie_matrix_filled.shape}")

Matrix Shape: (610, 9719)


In [6]:
print("Training SVD Model (Matrix Factorization)...")

Training SVD Model (Matrix Factorization)...


In [7]:
# Transpose matrix so rows are movies (for SVD compatibility in this specific method)
X = user_movie_matrix_filled.values.T


In [8]:
# TruncatedSVD compresses the matrix into 12 latent components (features)
SVD = TruncatedSVD(n_components=12, random_state=42)
matrix_decomposed = SVD.fit_transform(X)

In [9]:
# Calculate Correlation Matrix (Pearson Correlation)
# This measures how similar every movie is to every other movie based on user patterns
corr_mat = np.corrcoef(matrix_decomposed)

print("Model Trained successfully.")
print("-" * 30)

Model Trained successfully.
------------------------------


In [10]:
def recommend_movies(movie_title, min_correlation=0.9):
    """
    Recommends movies similar to the given movie_title based on correlation.
    """
    if movie_title not in user_movie_matrix.columns:
        return "Movie not found in dataset."

    # Get index of the movie
    movie_idx = list(user_movie_matrix.columns).index(movie_title)

    # Get correlation vector for this movie
    corr_movie = corr_mat[movie_idx]

    # Filter movies with high correlation
    recommendations = list(user_movie_matrix.columns[(corr_movie > min_correlation)])

    # Remove the movie itself from the list
    if movie_title in recommendations:
        recommendations.remove(movie_title)

    return recommendations[:10]  # Return top 10


In [11]:
# Test the system: Recommend movies like "Toy Story (1995)"
target_movie = "Toy Story (1995)"
recs = recommend_movies(target_movie)

print(f"Top Recommendations for users who liked '{target_movie}':")
for i, movie in enumerate(recs, 1):
    print(f"{i}. {movie}")

Top Recommendations for users who liked 'Toy Story (1995)':
1. Aladdin (1992)
2. Babe (1995)
3. Back to the Future (1985)
4. Forrest Gump (1994)
5. Groundhog Day (1993)
6. Home Alone (1990)
7. Independence Day (a.k.a. ID4) (1996)
8. Jumanji (1995)
9. Jurassic Park (1993)
10. Lion King, The (1994)


In [12]:
# To evaluate, we reconstruct the matrix and compare it to the original ratings
print("\nEvaluating Model Performance...")


Evaluating Model Performance...


In [13]:
# Reconstruct the matrix (Approximate original ratings)
reconstructed_matrix = np.dot(matrix_decomposed, SVD.components_)
reconstructed_df = pd.DataFrame(reconstructed_matrix, index=user_movie_matrix_filled.columns, columns=user_movie_matrix_filled.index).T

In [14]:
# Flatten matrices to calculate RMSE on non-zero ratings only
original_flat = user_movie_matrix.values.flatten()
predicted_flat = reconstructed_df.values.flatten()

In [15]:
# Filter out NaN values (only evaluate on actual ratings provided by users)
mask = ~np.isnan(original_flat)
rmse = np.sqrt(mean_squared_error(original_flat[mask], predicted_flat[mask]))

print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print("Note: Lower RMSE indicates better accuracy in reconstructing user ratings.")

Root Mean Squared Error (RMSE): 2.5052
Note: Lower RMSE indicates better accuracy in reconstructing user ratings.
