In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os

def load_and_filter_data(ratings_path, min_ratings=3):
    """Load a tab-separated ratings file and drop sparsely rated users and movies.

    Parameters
    ----------
    ratings_path : str or path-like
        Location of the tab-separated ratings file. It must provide the
        columns ``user_id``/``movie_id`` (or the already-renamed
        ``userId``/``movieId``) plus ``rating``. Whitespace around header
        names is ignored.
    min_ratings : int, default 3
        Minimum number of rows a user and a movie must each have in the
        unfiltered file for their rows to be kept.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, restricted to ``userId``, ``movieId``, ``rating``.

    Raises
    ------
    KeyError
        If a required column is missing after the header has been normalized.

    Notes
    -----
    Both thresholds are evaluated once against the unfiltered data, so a user
    (or movie) may end up with fewer than ``min_ratings`` rows in the result
    when some of its rows are removed by the other criterion.
    """
    ratings = pd.read_csv(ratings_path, sep="\t", engine="python")

    # Report the header exactly as it appears in the file, before any cleanup.
    print("✅ Raw Columns:", ratings.columns.tolist())

    # Strip stray whitespace from header names; otherwise a column such as
    # "user_id " would silently miss the rename below and fail later on.
    ratings = ratings.rename(
        columns=lambda name: name.strip() if isinstance(name, str) else name
    )

    # Rename for consistency with the rest of the pipeline.
    ratings = ratings.rename(columns={'user_id': 'userId', 'movie_id': 'movieId'})

    required = ['userId', 'movieId', 'rating']
    missing = [col for col in required if col not in ratings.columns]
    if missing:
        raise KeyError(
            f"Missing required column(s) {missing}; found {ratings.columns.tolist()}"
        )

    # Users and movies that each have at least `min_ratings` rows.
    user_counts = ratings['userId'].value_counts()
    valid_users = user_counts[user_counts >= min_ratings].index

    item_counts = ratings['movieId'].value_counts()
    valid_items = item_counts[item_counts >= min_ratings].index

    keep = ratings['userId'].isin(valid_users) & ratings['movieId'].isin(valid_items)

    return ratings.loc[keep, required]  # only needed columns

def create_user_item_matrix(ratings_df):
    """Reshape long-format ratings into a user-by-movie table.

    Rows are labelled by ``userId`` and columns by ``movieId``; each cell holds
    the ``rating`` for that pair. Pairs that appear more than once are combined
    by ``pivot_table``'s default aggregation, and pairs with no rating are
    filled with 0.
    """
    wide = ratings_df.pivot_table(
        values='rating',
        index='userId',
        columns='movieId',
    )
    # Unrated user/movie combinations come out of the pivot as NaN.
    return wide.fillna(0)

def compute_similarity(matrix, kind='user'):
    """Return a labelled, square cosine-similarity table.

    Parameters
    ----------
    matrix : pandas.DataFrame
        Table whose rows are compared when ``kind='user'`` and whose columns
        are compared when ``kind='item'``.
    kind : {'user', 'item'}, default 'user'
        Which axis of ``matrix`` to compare.

    Returns
    -------
    pandas.DataFrame
        Similarities indexed on both axes by ``matrix.index`` (``'user'``) or
        ``matrix.columns`` (``'item'``).

    Raises
    ------
    ValueError
        If ``kind`` is neither ``'user'`` nor ``'item'``.
    """
    # Reject unknown modes up front, before any computation happens.
    if kind not in ('user', 'item'):
        raise ValueError("Choose kind='user' or kind='item'.")

    if kind == 'user':
        vectors, labels = matrix, matrix.index
    else:
        # Transpose so that each movie becomes one row vector.
        vectors, labels = matrix.T, matrix.columns

    scores = cosine_similarity(vectors)
    return pd.DataFrame(scores, index=labels, columns=labels)

def main(ratings_path=None, min_ratings=3):
    """Run the similarity pipeline end to end and print a preview of the results.

    Parameters
    ----------
    ratings_path : str or path-like, optional
        Tab-separated ratings file to process. When omitted, the original
        hard-coded local Windows path is used, so a bare ``main()`` behaves
        as before.
    min_ratings : int, default 3
        Threshold forwarded to ``load_and_filter_data``.

    Returns
    -------
    None
        Results are printed. If the file does not exist, a message is printed
        and the function returns without doing any work.
    """
    if ratings_path is None:
        # 🔧 Adjust this to your actual file path
        ratings_path = r"C:\Users\DELL\Downloads\ratings.csv"

    # Bail out early with a readable message instead of a pandas traceback.
    if not os.path.exists(ratings_path):
        print(f"❌ File not found: {ratings_path}")
        return

    filtered_ratings = load_and_filter_data(ratings_path, min_ratings=min_ratings)
    user_item_matrix = create_user_item_matrix(filtered_ratings)

    user_similarity = compute_similarity(user_item_matrix, kind='user')
    item_similarity = compute_similarity(user_item_matrix, kind='item')

    # Only the first five rows are shown; the full matrices are users x users
    # and items x items.
    print("\n👥 User-User Cosine Similarity Matrix (Top 5 rows):\n", user_similarity.round(2).head())
    print("\n🎬 Item-Item Cosine Similarity Matrix (Top 5 rows):\n", item_similarity.round(2).head())

# Entry point: run the pipeline when this cell/script is executed directly;
# the call is skipped when the code is imported as a module.
if __name__ == "__main__":
    main()


✅ Raw Columns: ['Unnamed: 0', 'user_id', 'movie_id', 'rating', 'timestamp', 'user_emb_id', 'movie_emb_id']

👥 User-User Cosine Similarity Matrix (Top 5 rows):
 userId  1     2     3     4     5     6     7     8     9     10    ...  6031  \
userId                                                              ...         
1       1.00  0.10  0.12  0.13  0.09  0.18  0.06  0.14  0.23  0.26  ...  0.17   
2       0.10  1.00  0.15  0.17  0.11  0.10  0.31  0.20  0.19  0.23  ...  0.11   
3       0.12  0.15  1.00  0.15  0.06  0.07  0.14  0.08  0.13  0.21  ...  0.09   
4       0.13  0.17  0.15  1.00  0.05  0.01  0.13  0.10  0.09  0.12  ...  0.16   
5       0.09  0.11  0.06  0.05  1.00  0.05  0.13  0.22  0.26  0.12  ...  0.10   

userId  6032  6033  6034  6035  6036  6037  6038  6039  6040  
userId                                                        
1       0.08  0.07  0.03  0.11  0.19  0.14  0.00  0.17  0.13  
2       0.09  0.27  0.01  0.18  0.23  0.21  0.07  0.07  0.22  
3       0.13  0.16  