In [None]:
from google.colab import drive
import numpy as np
import pandas as pd
import math

In [None]:
# Step 1: Mount Google Drive
def mount_drive(mount_point='/content/drive'):
    """Mount Google Drive inside the Colab runtime.

    Args:
        mount_point: Directory at which Drive is attached. Defaults to
            '/content/drive'.
    """
    drive.mount(mount_point)

In [None]:
# Step 2: Load User and Movie Data
def load_data(users_path="/content/drive/MyDrive/ColabNotebooks/movie.users.txt",
              movies_path='/content/drive/MyDrive/ColabNotebooks/movie.metadata_plot_summary_test.txt'):
    """Read the user and movie tables and drop rows without a plot summary.

    Both files are parsed as header-less, tab-separated text. The user table
    has the same columns as the movie table, preceded by a 'user_id' column.

    Args:
        users_path: Location of the user file. Defaults to the Drive path
            this notebook was written against.
        movies_path: Location of the movie file. Defaults to the Drive path
            this notebook was written against.

    Returns:
        A ``(users, movies)`` tuple of DataFrames in which every remaining
        row has a non-null 'plot_summary'.
    """
    # Declared once so the two tables cannot drift apart.
    movie_columns = ['movie_id', 'freebase_id', 'movie_name', 'release_year', 'box_office', 'runtime',
                     'language', 'countries', 'genres', 'plot_summary']

    users = pd.read_csv(users_path, sep="\t", header=None, names=['user_id'] + movie_columns)
    movies = pd.read_csv(movies_path, sep="\t", header=None, names=movie_columns)

    # Rows without a summary cannot be turned into a text profile later on.
    users = users.dropna(subset=['plot_summary'])
    movies = movies.dropna(subset=['plot_summary'])
    return users, movies

In [None]:
# Step 3: Compute TF-IDF
import numpy as np
import math

def compute_tf_idf(texts):
    """Turn each text into an L2-normalised TF-IDF dictionary.

    Tokens are obtained by lower-casing the text and splitting on
    whitespace. For a term with document frequency ``df`` in a corpus of
    ``N`` texts the inverse document frequency is ``log(N / (1 + df))``.
    Note that this value is zero when ``df == N - 1`` and negative when the
    term occurs in every text.

    Args:
        texts: Sequence of strings (must support ``len``).

    Returns:
        A list with one ``{term: weight}`` dict per input text, in input
        order. A vector whose norm is zero is returned un-normalised.
    """
    corpus_size = len(texts)
    tokenised = [document.lower().split() for document in texts]

    # Document frequency: in how many texts does each term appear?
    doc_freq = {}
    for tokens in tokenised:
        for term in set(tokens):
            doc_freq[term] = doc_freq.get(term, 0) + 1

    idf = {term: np.log(corpus_size / (1 + freq)) for term, freq in doc_freq.items()}

    profiles = []
    for tokens in tokenised:
        total_tokens = len(tokens)

        # Raw occurrence counts, kept in first-occurrence order.
        counts = {}
        for term in tokens:
            counts[term] = counts.get(term, 0) + 1

        # Relative term frequency scaled by the corpus-wide IDF.
        weights = {term: (count / total_tokens) * idf[term] for term, count in counts.items()}

        length = math.sqrt(sum(weight ** 2 for weight in weights.values()))
        if length > 0:
            weights = {term: weight / length for term, weight in weights.items()}

        profiles.append(weights)

    return profiles


In [None]:
# Step 4: Create User and Movie Profiles
def create_profiles(users, movies):
    """Build TF-IDF profiles keyed by user id and by movie id.

    A user may appear on several rows of ``users``. All of that user's plot
    summaries are joined into a single text before vectorising, so every row
    contributes to the profile instead of only the last one. When each user
    id appears once, the texts handed to ``compute_tf_idf`` are exactly the
    individual summaries in row order.

    Args:
        users: DataFrame with 'user_id' and 'plot_summary' columns.
        movies: DataFrame with 'movie_id' and 'plot_summary' columns.

    Returns:
        ``(user_profiles_dict, movie_profiles_dict)`` mapping ids to the
        ``{term: weight}`` dicts produced by ``compute_tf_idf``. The user
        and movie corpora are vectorised independently of each other.
    """
    # Group summaries per user; dicts keep first-seen order, so the order of
    # users follows their first appearance in the table.
    summaries_by_user = {}
    for user_id, summary in zip(users['user_id'], users['plot_summary']):
        summaries_by_user.setdefault(user_id, []).append(summary)

    user_ids = list(summaries_by_user)
    user_texts = [" ".join(parts) for parts in summaries_by_user.values()]

    user_profiles = compute_tf_idf(user_texts)
    movie_profiles = compute_tf_idf(movies['plot_summary'].tolist())

    user_profiles_dict = dict(zip(user_ids, user_profiles))
    movie_profiles_dict = dict(zip(movies['movie_id'], movie_profiles))

    return user_profiles_dict, movie_profiles_dict

In [None]:
# Step 5: Calculate Cosine Similarity
def cosine_similarity(profile1, profile2):
    """Cosine of the angle between two sparse ``{term: weight}`` vectors.

    Only terms present in both profiles contribute to the dot product; the
    vector lengths are taken over all terms of each profile.

    Returns:
        The cosine similarity, or ``0`` when either vector has zero length.
    """
    shared_terms = set(profile1) & set(profile2)
    overlap = sum(profile1[term] * profile2[term] for term in shared_terms)

    length1 = math.sqrt(sum(weight ** 2 for weight in profile1.values()))
    length2 = math.sqrt(sum(weight ** 2 for weight in profile2.values()))

    # A zero-length vector has no direction; report no similarity.
    if not (length1 and length2):
        return 0
    return overlap / (length1 * length2)


In [None]:
# Step 6: Recommend Top 10 Movies for Each User
def recommend_movies(user_profiles, movie_profiles, top_n=10):
    """Rank every movie for every user and keep the best matches.

    Args:
        user_profiles: Mapping of user id to ``{term: weight}`` profile.
        movie_profiles: Mapping of movie id to ``{term: weight}`` profile.
        top_n: How many movies to keep per user. Defaults to 10.

    Returns:
        Dict mapping each user id to a list of ``(movie_id, similarity)``
        tuples, highest similarity first, at most ``top_n`` long.

    Raises:
        ValueError: If ``top_n`` is negative.
    """
    if top_n < 0:
        # A negative slice bound would silently drop items from the tail
        # instead of limiting the list length.
        raise ValueError("top_n must be non-negative")

    recommendations = {}
    for user_id, user_profile in user_profiles.items():
        scored = [
            (movie_id, cosine_similarity(user_profile, movie_profile))
            for movie_id, movie_profile in movie_profiles.items()
        ]
        # list.sort is stable, so equally scored movies keep their
        # movie_profiles order.
        scored.sort(key=lambda pair: pair[1], reverse=True)
        recommendations[user_id] = scored[:top_n]
    return recommendations


In [None]:
# Step 7: Save and Run
def save_and_display_recommendations(recommendations, movies, filename='part2_0615971_output.txt'):
    """Write the recommendations to a text file and echo them to stdout.

    Args:
        recommendations: Mapping of user id to a list of
            ``(movie_id, similarity)`` tuples.
        movies: DataFrame with 'movie_id' and 'movie_name' columns, used to
            translate ids into titles.
        filename: Path of the report to (over)write.

    Raises:
        IndexError: If a recommended movie id is not present in ``movies``.
    """
    # Build the id -> title lookup once instead of filtering the whole frame
    # for every output line. The first row wins when an id is repeated.
    name_by_id = {}
    for movie_id, movie_name in zip(movies['movie_id'], movies['movie_name']):
        name_by_id.setdefault(movie_id, movie_name)

    # Explicit UTF-8 so titles with non-ASCII characters are written the same
    # way regardless of the platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        for user_id, recs in recommendations.items():
            header = f"Top 10 movies for User {user_id}:\n"
            file.write(header)
            print(header.strip())

            for movie_id, score in recs:
                if movie_id not in name_by_id:
                    raise IndexError(f"movie_id {movie_id!r} not found in movies")
                line = f"Movie: {name_by_id[movie_id]}, Similarity: {score:.4f}\n"
                file.write(line)
                print(line.strip())



# Run the whole pipeline in order. Each step consumes the previous step's result.
# Drive must be mounted first because load_data() reads from /content/drive.
mount_drive()

# Load both tables; rows without a plot summary are already removed.
users, movies = load_data()

# Turn the plot summaries into TF-IDF profiles keyed by user id / movie id.
user_profiles, movie_profiles = create_profiles(users, movies)

# Score every movie against every user and keep the best matches per user.
recommendations = recommend_movies(user_profiles, movie_profiles)

# Write the report to disk and print it below the cell.
save_and_display_recommendations(recommendations, movies)

Mounted at /content/drive
Top 10 movies for User 1:
Movie: G.I. Joe: Retaliation, Similarity: 0.0459
Movie: U.F.O., Similarity: 0.0345
Movie: Safe Haven, Similarity: 0.0317
Movie: The Abandoned State, Similarity: 0.0282
Movie: Pain & Gain, Similarity: 0.0270
Movie: Machete Kills, Similarity: 0.0256
Movie: Ruby, Similarity: 0.0245
Movie: Ender's Game, Similarity: 0.0233
Movie: Kai Po Che!, Similarity: 0.0227
Movie: The Tomb, Similarity: 0.0227
Top 10 movies for User 2:
Movie: Deconstruction Red, Similarity: 0.0789
Movie: Monsters University, Similarity: 0.0737
Movie: Oz: The Great and Powerful, Similarity: 0.0717
Movie: The Railway Man, Similarity: 0.0549
Movie: Turbo, Similarity: 0.0474
Movie: After Earth, Similarity: 0.0450
Movie: Ruby, Similarity: 0.0434
Movie: The Seventh Son, Similarity: 0.0373
Movie: Machete Kills, Similarity: 0.0373
Movie: The Croods, Similarity: 0.0360
Top 10 movies for User 3:
Movie: The Seventh Son, Similarity: 0.0820
Movie: The Abandoned State, Similarity: 0.