In [1]:
# Install required packages
!pip install -r requirements.txt

# Base directory for the project.
# Set the LEARNING_PATH_BASE_DIR environment variable to run the notebook from
# a different location; when it is unset the original local path is used, so
# existing setups keep working unchanged.
import os  # needed here because BASE_DIR is resolved before the main import block

BASE_DIR = os.environ.get(
    "LEARNING_PATH_BASE_DIR",
    "/Users/moazam_a12/Learning Path Recommendation System",
)

# Import libraries
import os
import pandas as pd
import difflib
from utils.preprocess import load_and_preprocess
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import accuracy

# Both input files live under <BASE_DIR>/data: the interaction CSV sits
# directly in that folder, the edX catalogue in its "Course Content Metadata"
# sub-folder.
data_dir = os.path.join(BASE_DIR, "data")
data_path = os.path.join(data_dir, "personalized_learning_dataset.csv")
course_metadata_path = os.path.join(data_dir, "Course Content Metadata", "edx.csv")



In [3]:
# Run the project's preprocessing helper on the two input files. Its four
# return values are unpacked, in order, into interaction_df, user_encoder,
# course_encoder and course_df, which later cells rely on.
(
    interaction_df,
    user_encoder,
    course_encoder,
    course_df,
) = load_and_preprocess(data_path, course_metadata_path)

# Show the first rows of each frame as a quick sanity check.
print("Processed interaction data:")
display(interaction_df.head())

print("\nCourse metadata:")
display(course_df.head())

Processed interaction data:


Unnamed: 0,user_id,course_id,rating
0,0,2,51
1,1,3,92
2,2,3,45
3,3,1,59
4,4,3,93



Course metadata:


Unnamed: 0,title,link,institution,subject,level,prerequisites,language,videotranscript,associatedprograms,associatedskills
0,How to Learn Online,https://www.edx.org/learn/how-to-learn/edx-how...,edX,Education & Teacher Training,Introductory,,English,English,,Learning Design
1,The Science of Happiness,https://www.edx.org/learn/happiness/university...,"University of California, Berkeley",Social Sciences,Introductory,None.,English,English,,"Evolutionary Biology, Empathy, Psychology"
2,Remote Work Revolutionfor Everyone,https://www.edx.org/learn/remote-work/harvard-...,Harvard University,Business & Management,Introductory,,English,,Professional Certificate inLeading in a Remote...,"Customer Relationship Building, Telecommuting"
3,CS50's Introduction toComputer Science,https://www.edx.org/learn/computer-science/har...,Harvard University,Computer Science,Introductory,None.,English,English,Professional Certificate inComputer Science fo...,"Resource Management, JavaScript (Programming L..."
4,Data Visualization andBuilding Dashboards with...,https://www.edx.org/learn/data-visualization/i...,IBM,Data Analysis & Statistics,Introductory,,English,,Professional CertificateinData Analysis and Vi...,"Data Visualization, Data Analysis, IBM Cognos ..."


In [7]:
# Import training function
from models.train_model import train_svd_model
import pandas as pd

# Summarise the Final_Exam_Score column read straight from the raw CSV.
student_df = pd.read_csv(data_path)
exam_score_summary = student_df['Final_Exam_Score'].describe().to_string()
print("\n=== Dataset Statistics ===")
print(exam_score_summary)
print("=========================\n")

# Train and evaluate the SVD model with parameter tuning switched on;
# verbose=False keeps the cell output short.
# NOTE(review): the saved output of this cell reports RMSE 19.7442 while
# Final_Exam_Score has std 20.096417 — the two are nearly equal, so confirm the
# model is learning more than a constant prediction before trusting the
# recommendations.
print("Initiating model training and evaluation...")
training_outputs = train_svd_model(interaction_df, tune_params=True, verbose=False)
model, trainset, testset, rmse, precision, recall = training_outputs


=== Dataset Statistics ===
count    10000.000000
mean        64.697000
std         20.096417
min         30.000000
25%         47.000000
50%         65.000000
75%         82.000000
max         99.000000

Initiating model training and evaluation...

=== Model Training Summary ===
Number of Interactions: 10000
Training Set Size: 8000 ratings
Test Set Size: 2000 ratings
Best Parameters: {'n_factors': 100, 'lr_all': 0.01, 'reg_all': 0.1}
RMSE: 19.7442
Precision@3: 0.3333
Recall@3: 1.0000



In [17]:
# Cell 4
# Import required libraries
import difflib
import pandas as pd
import numpy as np
from models.recommend import get_recommendations

# Setup fuzzy matching for course names to edx.csv titles.
# Result: course_name_to_title maps every decoded course name found in
# interaction_df to the closest title in course_df, or to the name itself
# when no title is similar enough.
print("Setting up fuzzy matching...")

# Decode each distinct course_id once, in a single inverse_transform call,
# instead of invoking the encoder for every interaction row and de-duplicating
# afterwards. Series.unique() keeps first-appearance order, so the decoded names
# come out in the same order as before.
# Assumes course_encoder.inverse_transform accepts an array of ids (as
# scikit-learn's LabelEncoder does) — TODO confirm against utils.preprocess.
course_names = course_encoder.inverse_transform(interaction_df['course_id'].unique())
course_titles = course_df['title'].dropna().astype(str).tolist()  # Ensure strings, drop NaN

course_name_to_title = {}
for cn in course_names:
    if isinstance(cn, str):
        # Keep only the single best candidate (n=1) at similarity >= 0.6;
        # fall back to the raw course name when nothing is close enough.
        matches = difflib.get_close_matches(cn, course_titles, n=1, cutoff=0.6)
        course_name_to_title[cn] = matches[0] if matches else cn
    else:
        course_name_to_title[cn] = str(cn)  # Convert non-string to string as fallback
print(f"Fuzzy matching completed. Mapped {len(course_name_to_title)} courses.\n")

Setting up fuzzy matching...
Fuzzy matching completed. Mapped 5 courses.



In [15]:
# Select three random students
np.random.seed(42)  # For reproducibility: the same students are drawn on every run
random_users = np.random.choice(interaction_df['user_id'].unique(), size=3, replace=False)
# .tolist() converts the NumPy scalars to plain Python values, so the printed
# list stays e.g. [6252, 4684, 1731] under NumPy >= 2.0 as well, where
# list(array) would render each element as np.int64(...).
print(f"Selected random students: {random_users.tolist()}\n")

# Generate recommendations for random students
top_n = 3  # number of courses requested per student
print("=== Course Recommendations for Random Students ===")
for user_id in random_users:
    print(f"\nStudent ID: {user_id}")
    recommendations = get_recommendations(
        user_id, model, interaction_df, course_encoder, course_name_to_title, course_df, top_n=top_n
    )
    if recommendations:
        # Each recommendation is read as a mapping with 'course_title',
        # 'predicted_rating' and a nested 'metadata' mapping holding
        # 'subject' and 'level'.
        for i, rec in enumerate(recommendations, 1):
            print(f"{i}. Course: {rec['course_title']}")
            print(f"   Predicted Rating: {rec['predicted_rating']:.2f}")
            print(f"   Subject: {rec['metadata']['subject']}")
            print(f"   Level: {rec['metadata']['level']}")
    else:
        print("   No recommendations available.")
print("=====================================\n")

Selected random students: [6252, 4684, 1731]

=== Course Recommendations for Random Students ===

Student ID: 6252
1. Course: Python Basics for DataScience
   Predicted Rating: 66.38
   Subject: Data Analysis & Statistics
   Level: Introductory
2. Course: New Product Development
   Predicted Rating: 63.22
   Subject: Business & Management
   Level: Introductory
3. Course: Cybersecurity Basics
   Predicted Rating: 62.19
   Subject: Computer Science
   Level: Introductory

Student ID: 4684
1. Course: Data Science: MachineLearning
   Predicted Rating: 67.35
   Subject: Data Analysis & Statistics
   Level: Introductory
2. Course: New Product Development
   Predicted Rating: 65.39
   Subject: Business & Management
   Level: Introductory
3. Course: Python Basics for DataScience
   Predicted Rating: 64.13
   Subject: Data Analysis & Statistics
   Level: Introductory

Student ID: 1731
1. Course: Data Science: MachineLearning
   Predicted Rating: 65.11
   Subject: Data Analysis & Statistics
   