## 09. Collaborative Filtering (NMF Based)

In this section, we implement **Collaborative Filtering using Non-negative Matrix Factorization (NMF)**.  
NMF decomposes the user–item interaction matrix into two lower-dimensional matrices — one for users and one for items.  
It learns hidden (latent) features that represent user preferences and course characteristics.  
Using these, we can predict ratings for unseen courses and generate personalized recommendations.

In [2]:
# Step 1: Mount Google Drive to access saved data
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Import required liberaries

import os
import pandas as pd
import warnings

In [5]:

project_path = "/content/drive/MyDrive/Projects/Machine_Learning/Course_Recommendation_System"
raw_path = os.path.join(project_path, "data", "raw")
raw_file = os.path.join(raw_path, "udemy_course_data.csv")

# Check folder
print("📂 Checking contents of:", raw_path)
print(os.listdir(raw_path))

# Load dataset
df = pd.read_csv(raw_file)

# Display confirmation
print("\n✅ Dataset loaded successfully!")
print("📊 Shape of data:", df.shape)
print(df.head())

📂 Checking contents of: /content/drive/MyDrive/Projects/Machine_Learning/Course_Recommendation_System/data/raw
['udemy_course_data.csv']

✅ Dataset loaded successfully!
📊 Shape of data: (3683, 18)
   course_id                                       course_title  \
0    1070968                 Ultimate Investment Banking Course   
1    1113822  Complete GST Course & Certification - Grow You...   
2    1006314  Financial Modeling for Business Analysts and C...   
3    1210588  Beginner to Pro - Financial Analysis in Excel ...   
4    1011058       How To Maximize Your Profits Trading Options   

                                                 url  is_paid  price  \
0  https://www.udemy.com/ultimate-investment-bank...     True    200   
1      https://www.udemy.com/goods-and-services-tax/     True     75   
2  https://www.udemy.com/financial-modeling-for-b...     True     45   
3  https://www.udemy.com/complete-excel-finance-c...     True     95   
4  https://www.udemy.com/how-to-maximize

In [6]:
# Step 1: Import required libraries
import pandas as pd
import numpy as np
import os

# Step 2: Define file paths
base_path = "/content/drive/MyDrive/Projects/Machine_Learning/Course_Recommendation_System/data"
raw_path = os.path.join(base_path, "raw")
processed_path = os.path.join(base_path, "processed")

# Step 3: Load actual Udemy course data
udemy_df = pd.read_csv(os.path.join(raw_path, "udemy_course_data.csv"))

# Step 4: Create dummy users and sample random courses
num_users = 50        # total simulated users
num_ratings = 500     # total random ratings to generate

# Randomly select courses
sample_courses = udemy_df.sample(200, random_state=42)["course_id"].tolist()
users = [f"user_{i}" for i in range(1, num_users + 1)]

# Step 5: Generate random (user, course, rating) tuples
np.random.seed(42)
dummy_data = {
    "user_id": np.random.choice(users, num_ratings),
    "course_id": np.random.choice(sample_courses, num_ratings),
    "rating": np.random.randint(1, 6, size=num_ratings)  # ratings from 1 to 5
}

ratings_df = pd.DataFrame(dummy_data)

# Step 6: Save processed dummy dataset
os.makedirs(processed_path, exist_ok=True)
dummy_file = os.path.join(processed_path, "dummy_user_ratings.csv")
ratings_df.to_csv(dummy_file, index=False)

# Step 7: Display confirmation
print("✅ Dummy user-course rating dataset created successfully!")
print(f"📁 Saved at: {dummy_file}")
print(f"📊 Shape: {ratings_df.shape}")
print(ratings_df.head())

✅ Dummy user-course rating dataset created successfully!
📁 Saved at: /content/drive/MyDrive/Projects/Machine_Learning/Course_Recommendation_System/data/processed/dummy_user_ratings.csv
📊 Shape: (500, 3)
   user_id  course_id  rating
0  user_39    1193536       4
1  user_29     228934       3
2  user_15     258894       1
3  user_43     297742       3
4   user_8     474150       3


In [7]:
# Step 1: Import required libraries
import pandas as pd
import os

# Step 2: Load the dummy user-course rating data
processed_path = "/content/drive/MyDrive/Projects/Machine_Learning/Course_Recommendation_System/data/processed"
ratings_file = os.path.join(processed_path, "dummy_user_ratings.csv")

ratings_df = pd.read_csv(ratings_file)
print("✅ Dummy Ratings Data Loaded Successfully!")
print(ratings_df.head())

# Step 3: Create a User–Item Matrix (rows = users, columns = courses)
user_item_matrix = ratings_df.pivot_table(
    index="user_id",
    columns="course_id",
    values="rating",
    fill_value=0  # fill missing ratings with 0
)

# Step 4: Display matrix summary
print("\n📊 User–Item Matrix Created!")
print(f"Shape: {user_item_matrix.shape}")
print(user_item_matrix.head())

✅ Dummy Ratings Data Loaded Successfully!
   user_id  course_id  rating
0  user_39    1193536       4
1  user_29     228934       3
2  user_15     258894       1
3  user_43     297742       3
4   user_8     474150       3

📊 User–Item Matrix Created!
Shape: (50, 186)
course_id  19653    41295    45136    46010    46524    64585    98066    \
user_id                                                                    
user_1         0.0      0.0      0.0      0.0      0.0      0.0      0.0   
user_10        3.0      0.0      0.0      0.0      0.0      0.0      0.0   
user_11        0.0      0.0      0.0      0.0      4.0      2.0      0.0   
user_12        0.0      0.0      0.0      0.0      0.0      0.0      0.0   
user_13        0.0      0.0      5.0      0.0      0.0      0.0      0.0   

course_id  125806   134144   140168   ...  1201054  1208638  1216752  1217778  \
user_id                               ...                                       
user_1         1.0      0.0      0.0 

In [8]:
# Step 1: Import KNN model tools
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Step 2: Initialize KNN model (using cosine similarity)
knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
knn_model.fit(user_item_matrix.T)  # transpose for course-based similarity

print("✅ KNN Model Trained for Course Similarity!")

# Step 3: Choose a random course and find similar ones
random_course_id = np.random.choice(user_item_matrix.columns)
course_index = list(user_item_matrix.columns).index(random_course_id)

# Step 4: Get 6 nearest courses (top 5 + itself)
distances, indices = knn_model.kneighbors(
    user_item_matrix.T.iloc[course_index, :].values.reshape(1, -1),
    n_neighbors=6
)

# Step 5: Display results
print(f"\n Target Course ID: {random_course_id}")
print("\n Top 5 Similar Courses:")
for i in range(1, len(distances.flatten())):
    similar_course_id = user_item_matrix.columns[indices.flatten()[i]]
    print(f"{i}. Course ID: {similar_course_id} | Similarity Score: {1 - distances.flatten()[i]:.2f}")


✅ KNN Model Trained for Course Similarity!

 Target Course ID: 1009452

 Top 5 Similar Courses:
1. Course ID: 419308 | Similarity Score: 0.66
2. Course ID: 943386 | Similarity Score: 0.51
3. Course ID: 352748 | Similarity Score: 0.51
4. Course ID: 1109736 | Similarity Score: 0.51
5. Course ID: 770486 | Similarity Score: 0.50




In [11]:
# Step 1: Define a recommendation function
import warnings
warnings.filterwarnings("ignore", message="X does not have valid feature names")

def recommend_courses(user_id, n_recommendations=5):
    # Get user’s ratings vector
    user_vector = user_item_matrix.loc[user_id].values.reshape(1, -1)

    # Find courses user hasn't rated yet
    unrated_courses = user_item_matrix.columns[user_item_matrix.loc[user_id] == 0]

    # Predict scores for unrated courses
    scores = []
    for course_id in unrated_courses:
        course_index = list(user_item_matrix.columns).index(course_id)
        distances, indices = knn_model.kneighbors(
            user_item_matrix.T.iloc[course_index, :].values.reshape(1, -1),
            n_neighbors=6
        )
        similarity = 1 - distances.flatten()[1:]  # skip self
        scores.append((course_id, np.mean(similarity)))  # average similarity score

    # Sort by similarity score
    recommended = sorted(scores, key=lambda x: x[1], reverse=True)[:n_recommendations]

    # Display top recommendations
    print(f"\n Recommended Courses for {user_id}:")
    for i, (course_id, score) in enumerate(recommended, 1):
        print(f"{i}. Course ID: {course_id} | Predicted Similarity: {score:.2f}")

# Step 2: Test recommendation for any user
recommend_courses("user_5")


 Recommended Courses for user_5:
1. Course ID: 140168 | Predicted Similarity: 0.93
2. Course ID: 709160 | Predicted Similarity: 0.93
3. Course ID: 792703 | Predicted Similarity: 0.93
4. Course ID: 1193536 | Predicted Similarity: 0.93
5. Course ID: 294292 | Predicted Similarity: 0.91


In [12]:
# Step 1: Search course by title
def get_course_id_by_name(course_name, df):
    """
    Finds course_id using partial name match.
    """
    result = df[df['course_title'].str.contains(course_name, case=False, na=False)]
    return result[['course_id', 'course_title']]

# 🔍 Example test
get_course_id_by_name("Python", df)

Unnamed: 0,course_id,course_title
14,1196544,Python Algo Trading: Sentiment Trading with News
30,1170894,Python Algo Stock Trading: Automate Your Trading!
41,1035472,Python for Finance: Investment Fundamentals & ...
149,1070886,Python Algo Trading: FX Trading with Oanda
336,815482,Stock Technical Analysis with Python
538,529828,Python for Trading & Investing
764,1088656,Quantitative Trading Analysis with Python
866,902888,Investment Portfolio Analysis with Python
1686,546848,Learn to code in Python and learn Adobe Photos...
2502,16646,Web Programming with Python


In [17]:
# ✅ Auto Smart Recommender — Just type course name like: Python, Excel, Finance, etc.

from sklearn.neighbors import NearestNeighbors

# ensure model and data are ready
try:
    knn_model
except NameError:
    print("⏳ Training KNN model first...")
    knn_model = NearestNeighbors(metric='cosine', algorithm='brute')
    knn_model.fit(user_item_matrix.T)

def get_recommendations(course_name):
    """
    Type course name only — and get top 5 similar courses instantly.
    """
    matches = df[df['course_title'].str.contains(course_name, case=False, na=False)]
    if matches.empty:
        print("❌ No matching course found. Try a different keyword.")
        return

    selected_course = matches.iloc[0]
    selected_course_id = selected_course['course_id']
    print(f"\n🎯 Selected Course: {selected_course['course_title']} (ID: {selected_course_id})")

    if selected_course_id not in user_item_matrix.columns:
        print("⚠️ This course not found in user-item matrix. Try another one.")
        return

    distances, indices = knn_model.kneighbors(
        user_item_matrix[selected_course_id].values.reshape(1, -1),
        n_neighbors=6
    )

    similar_course_ids = user_item_matrix.columns[indices.flatten()[1:]]
    similar_courses = df[df['course_id'].isin(similar_course_ids)][['course_id', 'course_title']]

    print("\n📘 Top Recommended Courses:\n")
    for i, row in enumerate(similar_courses.itertuples(), 1):
        print(f"{i}. {row.course_title} (Course ID: {row.course_id})")

    print("\n✅ Recommendation complete!\n")

# 🔹 Auto interactive part — user just types the course name
course_query = input("🎓 Enter Course Name: ")
get_recommendations(course_query)


🎓 Enter Course Name: python

🎯 Selected Course: Python Algo Trading: Sentiment Trading with News (ID: 1196544)

📘 Top Recommended Courses:

1. Option Trading for Rookies: Make & Manage Profitable Trades (Course ID: 941120)
2. Accounting for Depreciation (Collage Level) (Course ID: 258174)
3. 5 Exotic Guitar Scales and How to Use Them Effectively (Course ID: 830568)
4. Learn to play and improve 12 bar blues harmonica solos (Course ID: 1236576)
5. Build CRUD Application - PHP & Mysql (Course ID: 1201054)

✅ Recommendation complete!

