In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import OneHotEncoder

# GENERATING SYNTHETIC DATA

In [36]:
# Seed NumPy's global RNG so the np.random draws in the cells below are
# repeatable when the notebook is run top to bottom.
np.random.seed(20)

In [47]:
# --- Synthetic student roster ---
num_students = 1000
courses = [
    'Computer Science',
    'Mechanical Engineering',
    'Electrical Engineering',
    'Civil Engineering',
    'Chemical Engineering',
]
years = [1, 2, 3, 4]
interests = [
    'AI', 'Blockchain', 'Data Science', 'Environmental Science', 'Fintech',
    'Robotics', 'IOT', 'Sustainablity', 'Embedded Systems',
]

# Keyword arguments are evaluated left to right, so the random draws happen in
# the order course -> year -> interest -> avg_quiz_score.
students_data = pd.DataFrame(
    {'student_id': np.arange(1, num_students + 1)}
).assign(
    course=np.random.choice(courses, num_students),
    year=np.random.choice(years, num_students),
    # One to three distinct interests per student, stored as one comma-joined string.
    interest=[
        ",".join(
            np.random.choice(interests, size=np.random.randint(1, 4), replace=False)
        )
        for _ in range(num_students)
    ],
    # randint's upper bound is exclusive: scores fall in 50..99.
    avg_quiz_score=np.random.randint(50, 100, num_students),
)


In [48]:
# Preview the first rows of the generated student table.
students_data.head()

Unnamed: 0,student_id,course,year,interest,avg_quiz_score
0,1,Civil Engineering,3,Data Science,73
1,2,Civil Engineering,1,"Embedded Systems,AI,Sustainablity",51
2,3,Civil Engineering,1,"Blockchain,Embedded Systems,Sustainablity",62
3,4,Mechanical Engineering,4,"Environmental Science,Embedded Systems",64
4,5,Civil Engineering,1,"Blockchain,Robotics,Sustainablity",54


In [49]:
# --- Synthetic study-material catalogue ---
num_materials = 100
subjects = [
    'AI',
    'Blockchain',
    'Data Science',
    'Robotics',
    'Environmental Science',
]
difficulty_levels = [1, 2, 3, 4, 5]

# Keyword arguments are evaluated left to right, so the random draws happen in
# the order subject -> difficulty_level -> popularity_score -> content_length.
materials_data = pd.DataFrame(
    {'material_id': np.arange(101, 101 + num_materials)}  # ids start at 101
).assign(
    subject=np.random.choice(subjects, num_materials),
    difficulty_level=np.random.choice(difficulty_levels, num_materials),
    # randint's upper bound is exclusive: popularity is 1..99, length is 500..1999.
    popularity_score=np.random.randint(1, 100, num_materials),
    content_length=np.random.randint(500, 2000, num_materials),
)


In [50]:
# Preview the first rows of the generated materials table.
materials_data.head()

Unnamed: 0,material_id,subject,difficulty_level,popularity_score,content_length
0,101,Robotics,1,34,1355
1,102,AI,4,8,561
2,103,Robotics,4,10,799
3,104,Robotics,4,48,902
4,105,Blockchain,5,55,1454


In [51]:
# Simulated interaction log: each row pairs a randomly chosen student with a
# randomly chosen material, a viewed flag, and an optional 1-5 rating.
# Student/material pairs are drawn independently, so a pair can occur more
# than once.
num_engagements = num_materials * 3

engagement_data = pd.DataFrame({
    'student_id': np.random.choice(students_data['student_id'], num_engagements),
    'material_id': np.random.choice(materials_data['material_id'], num_engagements),
    'viewed': np.random.choice([0, 1], num_engagements),
    # Half of the interactions carry no rating. Using np.nan instead of None
    # keeps `rating` a float64 column; with None the column becomes object
    # dtype, and calling .fillna(0) on it later triggers a pandas warning.
    'rating': np.random.choice(
        [np.nan, 1, 2, 3, 4, 5],
        num_engagements,
        p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1],
    ),
})


In [52]:
# Preview the first rows of the simulated engagement log.
engagement_data.head()

Unnamed: 0,student_id,material_id,viewed,rating
0,909,179,0,
1,944,147,1,
2,515,133,0,
3,193,143,1,1.0
4,585,191,0,5.0


# FEATURE ENGINEERING

In [53]:
def calculate_interest_similarity(students_df, materials_df):
    """Cosine similarity between each student's interests and each material's subject.

    The subject vocabulary is learned from ``materials_df['subject']`` with a
    one-hot encoder. Every student is encoded as a multi-hot vector over that
    same vocabulary, so interests that no material covers contribute nothing.

    Parameters
    ----------
    students_df : pandas.DataFrame
        Needs an ``interest`` column holding comma-separated interest names.
        Whitespace around each name is ignored. A missing (non-string) value
        is treated as "no interests".
    materials_df : pandas.DataFrame
        Needs a ``subject`` column with one subject per material. Subjects are
        matched exactly as stored.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(students_df), len(materials_df))``. Rows and
        columns follow the *positional* order of the two frames, not their
        index labels.
    """
    onehot_encoder = OneHotEncoder()
    material_subjects = onehot_encoder.fit_transform(materials_df[['subject']]).toarray()

    # Without students there is nothing to compare; return an empty matrix
    # that still has one column per material.
    if len(students_df) == 0:
        return np.zeros((0, material_subjects.shape[0]))

    # Map each known subject to its column in the one-hot layout.
    subject_column = {
        subject: col for col, subject in enumerate(onehot_encoder.categories_[0])
    }

    # Preallocating keeps the matrix 2-D even when every row is all zeros.
    student_interests = np.zeros((len(students_df), len(subject_column)))
    for row, raw_interests in enumerate(students_df['interest']):
        if not isinstance(raw_interests, str):
            continue  # None / NaN: leave the row as all zeros
        for token in raw_interests.split(','):
            col = subject_column.get(token.strip())
            if col is not None:
                student_interests[row, col] = 1

    return cosine_similarity(student_interests, material_subjects)

# ALGORITHM BUILDING

In [54]:
def recommend_study_materials(students_df, materials_df, engagement_df, top_n=5,
                              interest_weight=0.3, performance_weight=0.3,
                              popularity_weight=0.2, engagement_weight=0.1):
    """Rank study materials for every student and return each student's top picks.

    Each (student, material) pair is scored as a weighted sum of:

    * interest    - the cosine similarity from ``calculate_interest_similarity``
    * performance - ``1 - |difficulty_level - avg_quiz_score / 20| / 5``
    * popularity  - ``popularity_score / 100``
    * engagement  - ``rating / 5`` taken from the first row of
      ``engagement_df`` for that student and material; 0 when there is no
      such row or its rating is missing

    Parameters
    ----------
    students_df : pandas.DataFrame
        Columns used: ``student_id``, ``avg_quiz_score`` (plus whatever
        ``calculate_interest_similarity`` reads).
    materials_df : pandas.DataFrame
        Columns used: ``material_id``, ``difficulty_level``,
        ``popularity_score``.
    engagement_df : pandas.DataFrame
        Columns used: ``student_id``, ``material_id``, ``rating``.
    top_n : int, default 5
        Number of materials to keep per student.
    interest_weight, performance_weight, popularity_weight, engagement_weight : float
        Weights of the four components. The defaults reproduce the original
        fixed weights (0.3, 0.3, 0.2, 0.1).

    Returns
    -------
    list of (student_id, list of material_id)
        One entry per student in row order, with materials sorted from the
        highest to the lowest score. Equal scores keep the row order of
        ``materials_df``.
    """
    interest_similarity = calculate_interest_similarity(students_df, materials_df)

    # Per-material quantities do not depend on the student, so compute them once.
    material_ids = materials_df['material_id'].tolist()
    difficulty_levels = materials_df['difficulty_level'].to_numpy(dtype=float)
    popularity_component = (
        materials_df['popularity_score'].to_numpy(dtype=float) / 100
    ) * popularity_weight

    # First logged rating per (student, material); a missing rating counts as 0.
    first_rating = {}
    engagement_rows = zip(
        engagement_df['student_id'].tolist(),
        engagement_df['material_id'].tolist(),
        engagement_df['rating'].tolist(),
    )
    for rated_student, rated_material, rating in engagement_rows:
        first_rating.setdefault(
            (rated_student, rated_material), 0 if pd.isna(rating) else rating
        )

    recommendations = []
    student_rows = zip(
        students_df['student_id'].tolist(),
        students_df['avg_quiz_score'].tolist(),
    )
    # The similarity matrix is positional, so index it by row position rather
    # than by DataFrame index label (labels need not be 0..n-1).
    for row_pos, (student_id, student_performance) in enumerate(student_rows):
        interest_component = interest_similarity[row_pos] * interest_weight

        difficulty_match = 1 - np.abs(difficulty_levels - (student_performance / 20)) / 5
        performance_component = difficulty_match * performance_weight

        engagement_component = np.array(
            [
                (first_rating.get((student_id, material_id), 0) / 5) * engagement_weight
                for material_id in material_ids
            ],
            dtype=float,
        )

        total_scores = (
            interest_component
            + performance_component
            + popularity_component
            + engagement_component
        )

        # A stable sort on the negated scores gives descending order and keeps
        # tied materials in their original row order.
        ranked = np.argsort(-total_scores, kind='stable')[:top_n]
        recommendations.append((student_id, [material_ids[pos] for pos in ranked]))

    return recommendations

In [55]:
# Score every student against every material and keep each student's top five.
recommendations = recommend_study_materials(
    students_data,
    materials_data,
    engagement_data,
    top_n=5,
)

# Spot-check the recommendation lists of the first ten students.
for student_id, material_ids in recommendations[:10]:
    print(f"Student {student_id} -> Recommended Material IDs: {material_ids}")

  engagement_score = (engagement_row['rating'].fillna(0).values[0] / 5) * 0.1


Student 1 -> Recommended Material IDs: [148, 117, 106, 193, 122]
Student 2 -> Recommended Material IDs: [161, 134, 194, 156, 192]
Student 3 -> Recommended Material IDs: [120, 127, 115, 152, 180]
Student 4 -> Recommended Material IDs: [112, 116, 123, 198, 135]
Student 5 -> Recommended Material IDs: [169, 147, 177, 153, 151]
Student 6 -> Recommended Material IDs: [177, 153, 179, 120, 115]
Student 7 -> Recommended Material IDs: [116, 148, 125, 135, 112]
Student 8 -> Recommended Material IDs: [124, 175, 107, 169, 147]
Student 9 -> Recommended Material IDs: [125, 161, 114, 190, 197]
Student 10 -> Recommended Material IDs: [169, 147, 177, 153, 151]


In [59]:
# Function to recommend study materials for each student with adjusted weights
def recommend_study_materials_v2(students_df, materials_df, engagement_df, top_n=5,
                                 interest_weight=0.4, performance_weight=0.3,
                                 popularity_weight=0.2, engagement_weight=0.1):
    """Rank study materials per student, weighting interest match at 40% by default.

    Each (student, material) pair is scored as a weighted sum of:

    * interest    - the cosine similarity from ``calculate_interest_similarity``
    * performance - ``1 - |difficulty_level - avg_quiz_score / 20| / 5``
    * popularity  - ``popularity_score / 100``
    * engagement  - ``rating / 5`` taken from the first row of
      ``engagement_df`` for that same student and material; 0 when there is
      no such row or its rating is missing

    Parameters
    ----------
    students_df : pandas.DataFrame
        Columns used: ``student_id``, ``avg_quiz_score`` (plus whatever
        ``calculate_interest_similarity`` reads).
    materials_df : pandas.DataFrame
        Columns used: ``material_id``, ``difficulty_level``,
        ``popularity_score``.
    engagement_df : pandas.DataFrame
        Columns used: ``student_id``, ``material_id``, ``rating``.
    top_n : int, default 5
        Number of materials to keep per student.
    interest_weight, performance_weight, popularity_weight, engagement_weight : float
        Weights of the four components. The defaults are 0.4, 0.3, 0.2 and
        0.1, which sum to 1.0.

    Returns
    -------
    list of (student_id, list of material_id)
        One entry per student in row order, with materials sorted from the
        highest to the lowest score. Equal scores keep the row order of
        ``materials_df``.
    """
    # Calculate interest similarity (rows: students, columns: materials)
    interest_similarity = calculate_interest_similarity(students_df, materials_df)

    # Per-material quantities do not depend on the student, so compute them once.
    material_ids = materials_df['material_id'].tolist()
    difficulty_levels = materials_df['difficulty_level'].to_numpy(dtype=float)
    popularity_component = (
        materials_df['popularity_score'].to_numpy(dtype=float) / 100
    ) * popularity_weight

    # First logged rating per (student, material); a missing rating counts as 0.
    first_rating = {}
    engagement_rows = zip(
        engagement_df['student_id'].tolist(),
        engagement_df['material_id'].tolist(),
        engagement_df['rating'].tolist(),
    )
    for rated_student, rated_material, rating in engagement_rows:
        first_rating.setdefault(
            (rated_student, rated_material), 0 if pd.isna(rating) else rating
        )

    recommendations = []
    student_rows = zip(
        students_df['student_id'].tolist(),
        students_df['avg_quiz_score'].tolist(),
    )
    # The similarity matrix is positional, so index it by row position rather
    # than by DataFrame index label (labels need not be 0..n-1).
    for row_pos, (student_id, student_performance) in enumerate(student_rows):
        # Interest score
        interest_component = interest_similarity[row_pos] * interest_weight

        # Performance-based score: closeness of difficulty to quiz score / 20
        difficulty_match = 1 - np.abs(difficulty_levels - (student_performance / 20)) / 5
        performance_component = difficulty_match * performance_weight

        # Engagement score: the student's own rating of this exact material
        engagement_component = np.array(
            [
                (first_rating.get((student_id, material_id), 0) / 5) * engagement_weight
                for material_id in material_ids
            ],
            dtype=float,
        )

        # Total score
        total_scores = (
            interest_component
            + performance_component
            + popularity_component
            + engagement_component
        )

        # Sort by score and select top N. A stable sort on the negated scores
        # keeps tied materials in their original row order.
        ranked = np.argsort(-total_scores, kind='stable')[:top_n]
        recommendations.append((student_id, [material_ids[pos] for pos in ranked]))

    return recommendations

### 3. Re-run the Evaluation with Updated Weights ###

# Recompute every student's top-5 list with the v2 scoring function.
recommendations_v2 = recommend_study_materials_v2(
    students_data,
    materials_data,
    engagement_data,
    top_n=5,
)

# Evaluate the v2 lists (presumably Mean Average Precision, per the label printed below).
# NOTE(review): evaluate_recommendations is not defined in any cell visible
# here - confirm it is defined and run before this cell, otherwise a fresh
# Restart & Run All stops with a NameError.
map_score_v2 = evaluate_recommendations(
    students_data,
    materials_data,
    engagement_data,
    recommendations_v2,
)

# Show the updated recommendations for the first ten students.
for student_id, material_ids in recommendations_v2[:10]:
    print(f"Student {student_id} -> Recommended Material IDs: {material_ids}")

# Report the updated MAP score.
print(f"\nUpdated Mean Average Precision (MAP): {map_score_v2:.4f}")


  engagement_score = (engagement_row['rating'].fillna(0).values[0] / 5) * 0.1


Student 1 -> Recommended Material IDs: [135, 127, 126, 104, 119]
Student 2 -> Recommended Material IDs: [115, 135, 126, 131, 106]
Student 3 -> Recommended Material IDs: [107, 114, 121, 141, 124]
Student 4 -> Recommended Material IDs: [116, 135, 139, 150, 126]
Student 5 -> Recommended Material IDs: [142, 122, 129, 149, 109]
Student 6 -> Recommended Material IDs: [110, 125, 119, 138, 123]
Student 7 -> Recommended Material IDs: [135, 107, 126, 145, 146]
Student 8 -> Recommended Material IDs: [107, 115, 139, 148, 127]
Student 9 -> Recommended Material IDs: [135, 126, 127, 142, 104]
Student 10 -> Recommended Material IDs: [107, 121, 113, 128, 120]

Updated Mean Average Precision (MAP): 0.0278


