### **Content-based Course Recommender System Using User Profile and Course Genres**

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing

#### Generating course recommendations based on user profile and course genre vectors


In [2]:
course_genres_df = pd.read_csv('course_genre.csv')
course_genres_df

Unnamed: 0,COURSE_ID,TITLE,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,ML0201EN,Robots Are Coming Build Iot Apps With Watson ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
1,ML0122EN,Accelerating Deep Learning With Gpu,0,1,0,0,0,1,0,1,0,0,0,0,0,0
2,GPXX0ZG0EN,Consuming Restful Services Using The Reactive ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
3,RP0105EN,Analyzing Big Data In R Using Apache Spark,1,0,0,1,0,0,0,0,1,0,1,0,0,0
4,GPXX0Z2PEN,Containerizing Packaging And Running A Sprin...,0,0,0,0,1,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
302,excourse89,Javascript Jquery And Json,0,0,0,0,0,0,0,0,0,0,0,1,1,0
303,excourse90,Programming Foundations With Javascript Html ...,0,0,0,0,0,0,0,0,0,0,0,1,1,0
304,excourse91,Front End Web Development With React,0,0,0,0,0,0,0,0,0,0,0,0,1,0
305,excourse92,Introduction To Web Development,0,0,0,0,0,0,0,0,0,0,0,1,1,0


In [3]:
users_df = pd.read_csv('ratings.csv')
users_df.columns = ['User_ID', 'COURSE_ID', 'Rating']
users_df

Unnamed: 0,User_ID,COURSE_ID,Rating
0,1889878,CC0101EN,3.0
1,1342067,CL0101EN,3.0
2,1990814,ML0120ENv3,3.0
3,380098,BD0211EN,3.0
4,779563,DS0101EN,3.0
...,...,...,...
233301,1540125,DS0101EN,3.0
233302,1250651,PY0101EN,3.0
233303,1003832,CB0105ENv1,3.0
233304,922065,BD0141EN,3.0


In [4]:
course_genres_matrix = course_genres_df.iloc[:,2:].to_numpy()

In [5]:
profile_df = pd.DataFrame(columns = ['User_ID'] + list(course_genres_df.columns[2:]))

for user_id in users_df['User_ID'].sort_values().unique():
    user = np.array(
        pd.merge(course_genres_df, users_df[users_df['User_ID'] == user_id], how='left', on='COURSE_ID')['Rating'].fillna(0.0)
    ).reshape(1, -1)

    user_profile = pd.DataFrame(
        np.matmul(user, course_genres_matrix),
        columns=course_genres_df.columns[2:]
    )

    user_profile['User_ID'] = user_id  
    profile_df = pd.concat([profile_df, user_profile], ignore_index=True)

  profile_df = pd.concat([profile_df, user_profile], ignore_index=True)


In [6]:
profile_df

Unnamed: 0,User_ID,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
0,2,52.0,14.0,6.0,43.0,3.0,33.0,0.0,29.0,41.0,2.0,18.0,34.0,9.0,6.0
1,4,40.0,2.0,4.0,28.0,0.0,14.0,0.0,20.0,24.0,0.0,6.0,6.0,0.0,2.0
2,5,24.0,8.0,18.0,24.0,0.0,30.0,0.0,22.0,14.0,2.0,14.0,26.0,4.0,6.0
3,7,2.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33896,2102054,3.0,3.0,3.0,6.0,0.0,0.0,0.0,6.0,0.0,0.0,6.0,6.0,0.0,3.0
33897,2102356,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
33898,2102680,3.0,6.0,6.0,0.0,0.0,14.0,0.0,9.0,0.0,0.0,3.0,0.0,0.0,0.0
33899,2102983,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


The profile dataframe contains the course interests for each user, for example, user 8 is very interested in R, data analysis, database, and big data:


In [7]:
profile_df[profile_df['User_ID'] == 8]

Unnamed: 0,User_ID,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
4,8,6.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,6.0,0.0,2.0,0.0,0.0,0.0


In [8]:
test_users_df = pd.read_csv('ratings_test.csv')
test_users_df

Unnamed: 0,user,item,rating
0,1502801,RP0105EN,3.0
1,1609720,CNSC02EN,2.0
2,1347188,CO0301EN,3.0
3,755067,ML0103EN,3.0
4,538595,BD0115EN,3.0
...,...,...,...
9397,1385217,EE0101EN,3.0
9398,1864644,DA0101EN,3.0
9399,435858,TMP0105EN,3.0
9400,1888188,DB0101EN,3.0


In [9]:
print(f"Total numbers of test users {len(test_users_df['user'].unique())}")

Total numbers of test users 1000


In [10]:
test_user_profile = profile_df[profile_df['User_ID'] == 1078030]
test_user_profile

Unnamed: 0,User_ID,Database,Python,CloudComputing,DataAnalysis,Containers,MachineLearning,ComputerVision,DataScience,BigData,Chatbot,R,BackendDev,FrontendDev,Blockchain
18204,1078030,0.0,12.0,0.0,9.0,0.0,12.0,0.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
test_user_vector = test_user_profile.iloc[0, 1:].values
test_user_vector

array([0.0, 12.0, 0.0, 9.0, 0.0, 12.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0,
       0.0], dtype=object)

In [12]:
all_courses = set(course_genres_df['COURSE_ID'].values)
enrolled_courses = set(test_users_df[test_users_df['user'] == 1078030]['item'].to_list())
unknown_courses = all_courses.difference(enrolled_courses)

In [13]:
unknown_course_genres = course_genres_df[course_genres_df['COURSE_ID'].isin(unknown_courses)]
course_matrix = unknown_course_genres.iloc[:, 2:].values
course_matrix

array([[0, 0, 0, ..., 1, 1, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 1, 1, 0],
       [0, 0, 0, ..., 1, 1, 0]], dtype=int64)

In [14]:
score = np.dot(course_matrix, test_user_vector)
score

array([0.0, 30.0, 0.0, 9.0, 0.0, 0.0, 6.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0,
       18.0, 12.0, 12.0, 0.0, 0.0, 0.0, 12.0, 0.0, 12.0, 0.0, 6.0, 12.0,
       0.0, 18.0, 0.0, 0.0, 9.0, 18.0, 0.0, 9.0, 0.0, 0.0, 0.0, 0.0, 9.0,
       27.0, 0.0, 9.0, 0.0, 0.0, 12.0, 12.0, 6.0, 12.0, 9.0, 9.0, 12.0,
       21.0, 12.0, 0.0, 9.0, 9.0, 0.0, 0.0, 18.0, 18.0, 18.0, 12.0, 12.0,
       0.0, 18.0, 12.0, 9.0, 9.0, 18.0, 12.0, 0.0, 0.0, 0.0, 0.0, 24.0,
       21.0, 12.0, 0.0, 0.0, 18.0, 0.0, 0.0, 0.0, 0.0, 0.0, 9.0, 9.0, 0.0,
       0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0, 24.0,
       0.0, 9.0, 0.0, 0.0, 0.0, 9.0, 9.0, 0.0, 12.0, 0.0, 0.0, 12.0, 0.0,
       0.0, 0.0, 0.0, 12.0, 0.0, 9.0, 9.0, 9.0, 0.0, 0.0, 0.0, 9.0, 0.0,
       0.0, 0.0, 6.0, 6.0, 6.0, 0.0, 6.0, 6.0, 0.0, 0.0, 0.0, 0.0, 18.0,
       0.0, 0.0, 0.0, 12.0, 0.0, 0.0, 0.0, 18.0, 0.0, 12.0, 0.0, 0.0,
       15.0, 30.0, 0.0, 12.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 0.0,
       0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 12.0, 9.0, 6.0

In [15]:
def generate_recommendation_scores(score_threshold):

    course_recommended = pd.DataFrame(columns = ['USER_ID', 'COURSE_ID', 'SCORE'])

    for user_id in test_users_df['user'].unique():
        test_user_vector = profile_df[profile_df['User_ID'] == user_id].iloc[0, 1:].values

        enrolled_courses = set(test_users_df[test_users_df['user'] == user_id]['item'].to_list())
        unknown_courses = all_courses.difference(enrolled_courses)
        unknown_course_df = course_genres_df[course_genres_df['COURSE_ID'].isin(unknown_courses)]
        unknown_course_ids = unknown_course_df['COURSE_ID'].values

        recommendation_scores = np.dot(unknown_course_df.iloc[:, 2:].values, test_user_vector)

        for i in range(0, len(unknown_course_ids)):
            score = recommendation_scores[i]
            if score >= score_threshold:
                course_score = pd.DataFrame({'USER_ID' : [user_id], 'COURSE_ID' : [unknown_course_ids[i]], 'SCORE' : [score]})
                course_recommended = pd.concat([course_recommended, course_score], ignore_index= True)

    return course_recommended

In [16]:
df = generate_recommendation_scores(10)
df = df.sort_values(by = ['USER_ID','SCORE'] , ascending = [True, False])
df

  course_recommended = pd.concat([course_recommended, course_score], ignore_index= True)


Unnamed: 0,USER_ID,COURSE_ID,SCORE
27779,37465,RP0105EN,27.0
27784,37465,TMP0105EN,27.0
27788,37465,BD0212EN,27.0
27789,37465,SC0103EN,27.0
27834,37465,excourse31,27.0
...,...,...,...
26796,2087663,excourse64,12.0
26798,2087663,excourse66,12.0
26805,2087663,excourse77,12.0
26806,2087663,excourse78,12.0


In [17]:
profile_df.to_csv('User_profile.csv', index=False)