In [129]:
import os
from google.colab import userdata

In [130]:
userdata = userdata.get('github')

os.environ['GITHUB_TOKEN'] = userdata

In [131]:
!git clone https://{userdata}@github.com/miguroi/sistech.git

Cloning into 'sistech'...
remote: Enumerating objects: 93, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Compressing objects: 100% (2/2), done.[K
remote: Total 93 (delta 1), reused 1 (delta 1), pack-reused 90 (from 1)[K
Receiving objects: 100% (93/93), 49.28 MiB | 19.56 MiB/s, done.
Resolving deltas: 100% (32/32), done.


In [132]:
cd sistech

/content/sistech/sistech/sistech/sistech


In [133]:
!ls

data  model  README.md


In [134]:
!git status

On branch main
Your branch is up to date with 'origin/main'.

nothing to commit, working tree clean


In [135]:
!git add .

### Setup

In [136]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re

import random
from datetime import datetime, timedelta

In [137]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

In [138]:
clean_df = pd.read_csv('data/csv/coursera_courses_cleaned.csv')
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1704 non-null   object 
 1   organization  1704 non-null   object 
 2   rating        1704 non-null   float64
 3   review_count  1704 non-null   int64  
 4   difficulty    1704 non-null   object 
 5   course_type   1704 non-null   object 
 6   duration      1704 non-null   object 
 7   skills        1704 non-null   object 
 8   url           1704 non-null   object 
 9   is_free       1704 non-null   bool   
 10  course_id     1704 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 134.9+ KB


In [139]:
clean_df = clean_df.rename(columns={'rating': 'course_rating'})

In [140]:
clean_df.head()

Unnamed: 0,title,organization,course_rating,review_count,difficulty,course_type,duration,skills,url,is_free,course_id
0,google data analytics,Google,4.76339,170291,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,data storytelling rmarkdown data_literacy_data visualization_data_presentation data_ethics_data cleansing interactive_data_visualization data_validation ggplot tableau_software sampling_statistics presentation spreadsheet_software_data analysis_data_visualization software stakeholder_communications linkedin interviewing_skills applicant_tracking_systems,/professional-certificates/google-data-analytics,False,s12n~kr43OcbTEeqeNBKhfgCLyw
1,google cybersecurity,Google,4.82285,53304,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,threat_modeling network security_incident response vulnerability_management computer security_incident_management hardening intrusion_detection prevention cyber_threat_intelligence threat_management cyber_attacks cybersecurity_network protocol cloud_security vulnerability_assessments bash_scripting_language debugging linux interviewing_skills python_programming sql,/professional-certificates/google-cybersecurity,False,s12n~Dy6K-2UKEe2PIRJn6nL9pQ
2,google project_management,Google,4.842615,130704,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,quality management_project_management life_cycle requirements_analysis project scoping project_closure project_management_project planning agile_project_management continuous_improvement_process project control backlog stakeholder_communications milestones_project_management quality assessment team_management agile_methodology_project documentation change_management interviewing_skills applicant_tracking_systems,/professional-certificates/google-project-management,False,s12n~fq9UWMbTEeqpthJ2RmWGow
3,google ai essential,Google,4.835467,2749,Beginner,Specialization,ONE_TO_THREE_MONTHS,prompt_engineering_generative ai artificial_intelligence machine_learning_aiml large_language_modeling process_optimization productivity_software workforce_development digital_transformation innovation technical_writing emerging_technologies operational_efficiency business solution machine learning_software data_security critical_thinking analysis_data_analysis data_quality,/specializations/ai-essentials-google,False,s12n~3tzIujTqTk-YdcEFZ9r3sQ
4,google digital_marketing ecommerce,Google,4.80108,40653,Beginner,Professional Certificate,THREE_TO_SIX_MONTHS,data storytelling search_engine_marketing medium planning social_media_marketing google ad email_marketing social_media_strategy search_engine_optimization order fulfillment social_media management_performance_measurement spreadsheet_software testing customer_retention ecommerce campaign_management loyalty program marketing interviewing_skills applicant_tracking_systems,/professional-certificates/google-digital-marketing-ecommerce,False,s12n~aYYrIEl-EeyCjQ5Y8Mzdsw


### Create Synthetic User Dataset

In [141]:
course_ids = clean_df['course_id'].tolist()

num_users = 500
user_ids = [f"user_{i}" for i in range(1, num_users + 1)]

In [142]:
data = []
start_date = datetime(2025, 1, 1)
end_date = datetime(2025, 12, 31)

In [143]:
# buat generate synthetic user dataset

for user_id in user_ids:
    num_interactions = random.randint(5, 50)
    for _ in range(num_interactions):
        course_id = random.choice(course_ids)

        is_click = random.choices([True, False], weights=[0.7, 0.3], k=1)[0]

        is_enroll = False
        if is_click:
            is_enroll = random.choices([True, False], weights=[0.5, 0.5], k=1)[0]

        rating = np.nan
        if is_enroll:
            rating = random.randint(1, 5)

        random_days = random.randint(0, (end_date - start_date).days)
        interaction_date = start_date + timedelta(days=random_days)

        data.append({
            "user_id": user_id,
            "course_id": course_id,
            "is_click": is_click,
            "is_enroll": is_enroll,
            "user_rating": rating,
            "interaction_date": interaction_date
        })

In [144]:
synthetic_user_df = pd.DataFrame(data)

In [145]:
synthetic_user_df = pd.merge(synthetic_user_df, clean_df[['course_id', 'title']], on='course_id', how='left')
synthetic_user_df.head()

Unnamed: 0,user_id,course_id,is_click,is_enroll,user_rating,interaction_date,title
0,user_1,course~nUm5Ry-iEeWUWxIOAnoaFQ,False,False,,2025-01-05,clinical kidney pancreas islet transplantation
1,user_1,course~fi8dQEAhEe-xLxIpRr2Opw,False,False,,2025-10-25,strategy game_theory management
2,user_1,course~JlKWCXH0Ee-Quwr_1ABTmQ,False,False,,2025-01-03,sustainability reporting regulation implementation
3,user_1,course~kPVg0fKPEe-lDA7RWcxbZw,True,False,,2025-06-20,analysis shallow foundation
4,user_1,course~jvtkQ7rdEeqBKA5NvH_vlw,True,False,,2025-07-13,measuring maximizing impact covid contact tracing


### Text Vectorization

In [146]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import hstack
from sklearn.preprocessing import MinMaxScaler

In [147]:
# title
tfidf_title = TfidfVectorizer(stop_words='english')
tfidf_matrix_title = tfidf_title.fit_transform(clean_df['title'])

# skills
tfidf_skills = TfidfVectorizer(stop_words='english')
tfidf_matrix_skills = tfidf_skills.fit_transform(clean_df['skills'])

# organization
tfidf_organization = TfidfVectorizer(stop_words='english')
tfidf_matrix_organization = tfidf_organization.fit_transform(clean_df['organization'])

### Features Normalization

In [148]:
scaler = MinMaxScaler()

numerical_features = ['course_rating', 'review_count']
numerical_normalized = scaler.fit_transform(clean_df[numerical_features])

In [149]:
combined_features = hstack([
    tfidf_matrix_title,
    tfidf_matrix_skills,
    tfidf_matrix_organization,
    numerical_normalized
])

In [150]:
combined_features.shape

(1704, 4833)

### Calculate Similarity

In [151]:
cosine_sim = cosine_similarity(combined_features)

In [152]:
cosine_sim.shape

(1704, 1704)

In [153]:
similarity_df = pd.DataFrame(cosine_sim, index=clean_df['course_id'], columns=clean_df['title'])
similarity_df.head()

title,google data analytics,google cybersecurity,google project_management,google ai essential,google digital_marketing ecommerce,google support,google ux design,machine_learning,google prompting essential,ai everyone,...,health_care innovation,drug safety pharmacovigilance,version_control git,supply_chain_management strategy,environmental_management ethic,america unwritten constitution,introduction ableton live,fixing healthcare delivery advanced lean,simulation modeling natural process,operation patient_safety healthcare staff
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
s12n~kr43OcbTEeqeNBKhfgCLyw,1.0,0.586268,0.634764,0.544732,0.574757,0.632852,0.602229,0.241426,0.532713,0.243976,...,0.208765,0.22459,0.226501,0.228576,0.213023,0.219911,0.218696,0.215965,0.19501,0.221259
s12n~Dy6K-2UKEe2PIRJn6nL9pQ,0.586268,1.0,0.603276,0.580715,0.569589,0.596878,0.582896,0.24118,0.55805,0.238366,...,0.220127,0.236939,0.229156,0.231583,0.224576,0.231897,0.230285,0.227826,0.217391,0.239202
s12n~fq9UWMbTEeqpthJ2RmWGow,0.634764,0.603276,1.0,0.567781,0.58243,0.628606,0.602691,0.245193,0.545455,0.245895,...,0.216099,0.232524,0.225625,0.23329,0.220492,0.227643,0.226268,0.22359,0.201844,0.22905
s12n~3tzIujTqTk-YdcEFZ9r3sQ,0.544732,0.580715,0.567781,1.0,0.550964,0.53783,0.555186,0.250569,0.668225,0.366091,...,0.229647,0.238647,0.23031,0.247922,0.226128,0.233525,0.23176,0.229464,0.228553,0.235006
s12n~aYYrIEl-EeyCjQ5Y8Mzdsw,0.574757,0.569589,0.58243,0.550964,1.0,0.55979,0.56551,0.239193,0.535156,0.235816,...,0.219579,0.236361,0.228477,0.237001,0.224012,0.231321,0.229679,0.22727,0.205056,0.232777


### Recommendation System Modeling

In [154]:
synthetic_user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13971 entries, 0 to 13970
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   user_id           13971 non-null  object        
 1   course_id         13971 non-null  object        
 2   is_click          13971 non-null  bool          
 3   is_enroll         13971 non-null  bool          
 4   user_rating       4798 non-null   float64       
 5   interaction_date  13971 non-null  datetime64[ns]
 6   title             13971 non-null  object        
dtypes: bool(2), datetime64[ns](1), float64(1), object(3)
memory usage: 573.2+ KB


In [155]:
clean_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1704 entries, 0 to 1703
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   title          1704 non-null   object 
 1   organization   1704 non-null   object 
 2   course_rating  1704 non-null   float64
 3   review_count   1704 non-null   int64  
 4   difficulty     1704 non-null   object 
 5   course_type    1704 non-null   object 
 6   duration       1704 non-null   object 
 7   skills         1704 non-null   object 
 8   url            1704 non-null   object 
 9   is_free        1704 non-null   bool   
 10  course_id      1704 non-null   object 
dtypes: bool(1), float64(1), int64(1), object(8)
memory usage: 134.9+ KB


In [156]:
sample_course_id = clean_df['course_id'].iloc[0]
sample_course_name = clean_df['title'].iloc[0]
sample_user_id = synthetic_user_df['user_id'].iloc[0]

print(f"Sample Course ID: {sample_course_id}")
print(f"Sample Course Name: {sample_course_name}")
print(f"Sample User ID: {sample_user_id}")

print(f"Sample User Preferences for {sample_user_id}: ")
synthetic_user_df[synthetic_user_df['user_id'] == sample_user_id]

Sample Course ID: s12n~kr43OcbTEeqeNBKhfgCLyw
Sample Course Name: google data analytics
Sample User ID: user_1
Sample User Preferences for user_1: 


Unnamed: 0,user_id,course_id,is_click,is_enroll,user_rating,interaction_date,title
0,user_1,course~nUm5Ry-iEeWUWxIOAnoaFQ,False,False,,2025-01-05,clinical kidney pancreas islet transplantation
1,user_1,course~fi8dQEAhEe-xLxIpRr2Opw,False,False,,2025-10-25,strategy game_theory management
2,user_1,course~JlKWCXH0Ee-Quwr_1ABTmQ,False,False,,2025-01-03,sustainability reporting regulation implementation
3,user_1,course~kPVg0fKPEe-lDA7RWcxbZw,True,False,,2025-06-20,analysis shallow foundation
4,user_1,course~jvtkQ7rdEeqBKA5NvH_vlw,True,False,,2025-07-13,measuring maximizing impact covid contact tracing
5,user_1,s12n~FxNjYF2rEeytrA6P1gaieQ,True,False,,2025-10-02,fundamental flight mechanic
6,user_1,s12n~KUeTF8VkEeumMBLHmUEnqw,True,False,,2025-03-03,omnichannel retail strategy
7,user_1,course~111igE2DEe6t5QqZGaqvvw,False,False,,2025-09-26,operation excellence
8,user_1,s12n~iQ74s3ezTiaqBIxAvADy5g,False,False,,2025-08-10,oracle database_administration zero hero
9,user_1,s12n~lXyKQUpTSeKvQZHVE_bROA,True,True,3.0,2025-09-14,grant writing health researcher


#### Content-based Filtering Recommendation System

In [157]:
course_id_to_index = {course_id: idx for idx, course_id in enumerate(clean_df['course_id'])}
course_index = course_id_to_index[sample_course_id]

list(course_id_to_index.items())[:10]

[('s12n~kr43OcbTEeqeNBKhfgCLyw', 0),
 ('s12n~Dy6K-2UKEe2PIRJn6nL9pQ', 1),
 ('s12n~fq9UWMbTEeqpthJ2RmWGow', 2),
 ('s12n~3tzIujTqTk-YdcEFZ9r3sQ', 3),
 ('s12n~aYYrIEl-EeyCjQ5Y8Mzdsw', 4),
 ('s12n~7lHCSlFIEeeffRIHljDI_g', 5),
 ('s12n~Z-5wCcbTEeqeNBKhfgCLyw', 6),
 ('s12n~nOOfCDWeEeuiZgo2K4rorQ', 7),
 ('s12n~1a_Qa92eQkOSjt6VLZ6lsg', 8),
 ('course~daG-a-O1EeijKBISCWxf6g', 9)]

In [158]:
similarity_scores = cosine_sim[course_index]
course_similarities = pd.Series(similarity_scores, index=clean_df['course_id'])

top_similar = course_similarities.sort_values(ascending=False).head(11)[1:]
top_similar

Unnamed: 0_level_0,0
course_id,Unnamed: 1_level_1
s12n~kx6DXkl-EeyQ6Qp2PlG9FQ,0.729613
course~F3InFyfJEe2-5Aqhzcov3w,0.667512
course~m0fACXB0EeulIxJCZb_vVQ,0.646216
s12n~fq9UWMbTEeqpthJ2RmWGow,0.634764
s12n~7lHCSlFIEeeffRIHljDI_g,0.632852
s12n~agEcxHgeRASq8yHxWPRSWA,0.605815
s12n~Z-5wCcbTEeqeNBKhfgCLyw,0.602229
course~iLNlSQp9Eeun_RJEc0KNDw,0.600765
s12n~elzGL0l-EeyHXRKqb0U9Hw,0.591336
s12n~Dy6K-2UKEe2PIRJn6nL9pQ,0.586268


In [159]:
recommendations = []
for course_id, similarity_score in top_similar.items():
    course_info = clean_df[clean_df['course_id'] == course_id].iloc[0]
    recommendations.append({
        'course_id': course_id,
        'title': course_info['title'],
        'similarity_score': round(similarity_score, 4),
        'course_rating': course_info['course_rating'],
        'organization': course_info['organization']
    })

content_recommendations_df = pd.DataFrame(recommendations)
content_recommendations_df

Unnamed: 0,course_id,title,similarity_score,course_rating,organization
0,s12n~kx6DXkl-EeyQ6Qp2PlG9FQ,google advanced data analytics,0.7296,4.739401,Google
1,course~F3InFyfJEe2-5Aqhzcov3w,google advanced data analytics capstone,0.6675,4.851438,Google
2,course~m0fACXB0EeulIxJCZb_vVQ,google data analytics capstone complete case study,0.6462,4.782299,Google
3,s12n~fq9UWMbTEeqpthJ2RmWGow,google project_management,0.6348,4.842615,Google
4,s12n~7lHCSlFIEeeffRIHljDI_g,google support,0.6329,4.783729,Google
5,s12n~agEcxHgeRASq8yHxWPRSWA,google cloud data analytics,0.6058,4.426667,Google Cloud
6,s12n~Z-5wCcbTEeqeNBKhfgCLyw,google ux design,0.6022,4.815436,Google
7,course~iLNlSQp9Eeun_RJEc0KNDw,share data art visualization,0.6008,4.604054,Google
8,s12n~elzGL0l-EeyHXRKqb0U9Hw,google business_intelligence,0.5913,4.759131,Google
9,s12n~Dy6K-2UKEe2PIRJn6nL9pQ,google cybersecurity,0.5863,4.82285,Google


#### Collaborative Filtering Recommendation System

In [160]:
course_user_merged = pd.merge(synthetic_user_df, clean_df, on='course_id', how='left')
course_user_merged.head(1)

Unnamed: 0,user_id,course_id,is_click,is_enroll,user_rating,interaction_date,title_x,title_y,organization,course_rating,review_count,difficulty,course_type,duration,skills,url,is_free
0,user_1,course~nUm5Ry-iEeWUWxIOAnoaFQ,False,False,,2025-01-05,clinical kidney pancreas islet transplantation,clinical kidney pancreas islet transplantation,"Universiteit Leiden, Leiden University Medical Center",4.816712,371,Intermediate,Course,ONE_TO_THREE_MONTHS,nephrology surgery dialysis chronic_diseases clinical_trial clinical_research patient_evaluation patient_education counseling patient_treatment pharmacology infectious_disease,/learn/clinical-kidney-transplantation,True


In [161]:
ratings_matrix = synthetic_user_df.pivot_table(index=['course_id'],columns=['user_id'],values='user_rating', fill_value=0)
ratings_matrix.head(3)

user_id,user_1,user_10,user_100,user_101,user_102,user_103,user_104,user_105,user_106,user_107,...,user_90,user_91,user_92,user_93,user_94,user_95,user_96,user_97,user_98,user_99
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
course~-1YwAnTLEeSjmyIAC0aXFg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-3PuGcl0Eeuiewq9VRQV-Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-6EEg0s4Ee-4ug4_QNLLSQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [162]:
course_similarity = cosine_similarity(ratings_matrix.values)
np.fill_diagonal(course_similarity, 0)
course_similarity

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [163]:
course_similarity_df = pd.DataFrame(course_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)
course_similarity_df.head()

course_id,course~-1YwAnTLEeSjmyIAC0aXFg,course~-3PuGcl0Eeuiewq9VRQV-Q,course~-6EEg0s4Ee-4ug4_QNLLSQ,course~-7ZymCvBEee6gA5XksfBbg,course~-8cOpydnEeaTvRKF2VtmxQ,course~-A_ToPNPEeSAEiIAC9TCSQ,course~-CoDwy4CEe-GuAr_9hcaqw,course~-Eu38u08EeSKeyIACwQXPg,course~-FCmAR6kEfCpIQr_6QDb7Q,course~-FpKiRmcEem6cA6kboa4Lg,...,s12n~zQ29Z0cJEey1jhLpt2nhVQ,s12n~zRnwN2HTSqeM6PwLXqKLRg,s12n~zb8J0mWKEeaomwoYfGdF7Q,s12n~zgGHK0SjRIaYm5j9ptmjfA,s12n~zgK_5Q8CEeu-sxLx_g1Pbw,s12n~zh7400XtEeWUww73KBYvPw,s12n~zj2VppjQEeWh0Q5bBaG7rw,s12n~zjWaXRMhEeudwgrn4wF8EQ,s12n~zkV7-RZKEea3nArFC-QKGw,s12n~zn_b0Y_lEeqE2Q5NANyNFw
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
course~-1YwAnTLEeSjmyIAC0aXFg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-3PuGcl0Eeuiewq9VRQV-Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.095491,0.0,0.0,0.0
course~-6EEg0s4Ee-4ug4_QNLLSQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-7ZymCvBEee6gA5XksfBbg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-8cOpydnEeaTvRKF2VtmxQ,0.0,0.0,0.0,0.0,0.0,0.353381,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331295,0.0,0.0


In [164]:
course_similarities = course_similarity_df.loc[sample_course_id].sort_values(ascending=False)
course_similarities.head()

Unnamed: 0_level_0,s12n~kr43OcbTEeqeNBKhfgCLyw
course_id,Unnamed: 1_level_1
s12n~inm9VWBZEeigDQ6PCFb-lg,0.559888
course~15Vdj48GEe6xzArT8MnIvQ,0.512148
s12n~fwiX71ycToGvHvSczQmgig,0.512148
s12n~ZtbS2tDqEemmzBL13JFZTg,0.482857
s12n~FLy-fOLlQ2auYW-Tay7VEA,0.467525


In [165]:
top_similar = course_similarities.sort_values(ascending=False).head(11)[1:]
top_similar

Unnamed: 0_level_0,s12n~kr43OcbTEeqeNBKhfgCLyw
course_id,Unnamed: 1_level_1
course~15Vdj48GEe6xzArT8MnIvQ,0.512148
s12n~fwiX71ycToGvHvSczQmgig,0.512148
s12n~ZtbS2tDqEemmzBL13JFZTg,0.482857
s12n~FLy-fOLlQ2auYW-Tay7VEA,0.467525
s12n~P9vATaxqQpKfPM3vYkCYjw,0.447039
course~7dBeI8uGEeWoOwoxwyRjgQ,0.439163
s12n~5xcDP2tDRXylIi_FVNo7jw,0.432844
s12n~tsjfjljzTMapT5CirlZjmg,0.384111
course~1h1TW8f-Ee2EthIjloVnZw,0.384111
s12n~GS8LvZDYRMiCtefJMi8YZw,0.362143


In [166]:
recommendations = []
for course_id, similarity_score in top_similar.items():
  course_info = clean_df[clean_df['course_id'] == course_id]
  if not course_info.empty:
      recommendations.append({
          'course_id': course_id,
          'title': course_info['title'].iloc[0],
          'similarity_score': round(similarity_score, 4),
          'course_rating': course_info['course_rating'].iloc[0],
          'organization': course_info['organization'].iloc[0]
      })

collaborative_recommendations_df = pd.DataFrame(recommendations)
collaborative_recommendations_df

Unnamed: 0,course_id,title,similarity_score,course_rating,organization
0,course~15Vdj48GEe6xzArT8MnIvQ,business_intelligence bi analyst capstone project,0.5121,4.735294,SkillUp EdTech
1,s12n~fwiX71ycToGvHvSczQmgig,adp entrylevel payroll specialist,0.5121,4.722177,"Automatic Data Processing, Inc. (ADP)"
2,s12n~ZtbS2tDqEemmzBL13JFZTg,data_science foundation using r,0.4829,4.555969,Johns Hopkins University
3,s12n~FLy-fOLlQ2auYW-Tay7VEA,advanced tableau,0.4675,4.529412,Corporate Finance Institute
4,s12n~P9vATaxqQpKfPM3vYkCYjw,comptia security full training guide,0.447,4.136364,Packt
5,course~7dBeI8uGEeWoOwoxwyRjgQ,recovering humankind past saving universal heritage,0.4392,4.408638,Sapienza University of Rome
6,s12n~5xcDP2tDRXylIi_FVNo7jw,microsoft azure security engineer associate,0.4328,4.605042,Microsoft
7,s12n~tsjfjljzTMapT5CirlZjmg,chatgpt project_management leveraging ai success,0.3841,4.811311,Vanderbilt University
8,course~1h1TW8f-Ee2EthIjloVnZw,advanced creative_thinking ai tool success,0.3841,4.682927,Imperial College London
9,s12n~GS8LvZDYRMiCtefJMi8YZw,sap business analyst,0.3621,4.285714,SAP


#### Hybrid Systems Recommendation System

In [167]:
content_weight = 0.6
collab_weight = 0.4

In [168]:
course_user_merged = pd.merge(synthetic_user_df, clean_df, on='course_id', how='left')
ratings_matrix = synthetic_user_df.pivot_table(index=['course_id'],columns=['user_id'],values='user_rating', fill_value=0)

course_similarity = cosine_similarity(ratings_matrix.values)
np.fill_diagonal(course_similarity, 0)

course_similarity_df = pd.DataFrame(course_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)
course_similarity_df.head()

course_id,course~-1YwAnTLEeSjmyIAC0aXFg,course~-3PuGcl0Eeuiewq9VRQV-Q,course~-6EEg0s4Ee-4ug4_QNLLSQ,course~-7ZymCvBEee6gA5XksfBbg,course~-8cOpydnEeaTvRKF2VtmxQ,course~-A_ToPNPEeSAEiIAC9TCSQ,course~-CoDwy4CEe-GuAr_9hcaqw,course~-Eu38u08EeSKeyIACwQXPg,course~-FCmAR6kEfCpIQr_6QDb7Q,course~-FpKiRmcEem6cA6kboa4Lg,...,s12n~zQ29Z0cJEey1jhLpt2nhVQ,s12n~zRnwN2HTSqeM6PwLXqKLRg,s12n~zb8J0mWKEeaomwoYfGdF7Q,s12n~zgGHK0SjRIaYm5j9ptmjfA,s12n~zgK_5Q8CEeu-sxLx_g1Pbw,s12n~zh7400XtEeWUww73KBYvPw,s12n~zj2VppjQEeWh0Q5bBaG7rw,s12n~zjWaXRMhEeudwgrn4wF8EQ,s12n~zkV7-RZKEea3nArFC-QKGw,s12n~zn_b0Y_lEeqE2Q5NANyNFw
course_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
course~-1YwAnTLEeSjmyIAC0aXFg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-3PuGcl0Eeuiewq9VRQV-Q,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.095491,0.0,0.0,0.0
course~-6EEg0s4Ee-4ug4_QNLLSQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-7ZymCvBEee6gA5XksfBbg,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
course~-8cOpydnEeaTvRKF2VtmxQ,0.0,0.0,0.0,0.0,0.0,0.353381,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.331295,0.0,0.0


In [169]:
if course_id in course_similarity_df.index:
    collab_similarities = course_similarity_df.loc[course_id]
else:
    collab_similarities = pd.Series(0, index=clean_df['course_id'])

In [170]:
hybrid_scores = {}
for cid in clean_df['course_id']:
    if cid != course_id:
        content_score = course_similarities.get(cid, 0)
        collab_score = collab_similarities.get(cid, 0)

        hybrid_score = ((content_weight * content_score) + (collab_weight * collab_score))
        hybrid_scores[cid] = hybrid_score

In [171]:
sorted_recommendations = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)
list(sorted_recommendations)[:10]

[('s12n~ZtbS2tDqEemmzBL13JFZTg', np.float64(0.556381054002726)),
 ('s12n~FLy-fOLlQ2auYW-Tay7VEA', np.float64(0.5387136390744718)),
 ('s12n~P9vATaxqQpKfPM3vYkCYjw', np.float64(0.515108762928361)),
 ('course~7dBeI8uGEeWoOwoxwyRjgQ', np.float64(0.5060333500909682)),
 ('course~sK74dCWgEeW8-A6tkjXxWQ', np.float64(0.4172857905020444)),
 ('s12n~8xjb0SjZT32EQ_gU2mscFw', np.float64(0.40615616371749175)),
 ('course~Zqm9TFXfEeuDVRIP_gaKjw', np.float64(0.37323175744292303)),
 ('s12n~inm9VWBZEeigDQ6PCFb-lg', np.float64(0.33593282015328235)),
 ('s12n~fwiX71ycToGvHvSczQmgig', np.float64(0.30728851183895034)),
 ('course~15Vdj48GEe6xzArT8MnIvQ', np.float64(0.30728851183895034))]

In [172]:
sorted_recommendations = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

recommendations = []
for course_id, score in sorted_recommendations[:10]:
    course_info = clean_df[clean_df['course_id'] == course_id]
    if not course_info.empty:
        recommendations.append({
            'course_id': course_id,
            'title': course_info['title'].iloc[0],
            'hybrid_score': round(score, 4),
            'course_rating': course_info['course_rating'].iloc[0],
            'organization': course_info['organization'].iloc[0]
        })

hybrid_recommendations_df = pd.DataFrame(recommendations)
hybrid_recommendations_df

Unnamed: 0,course_id,title,hybrid_score,course_rating,organization
0,s12n~ZtbS2tDqEemmzBL13JFZTg,data_science foundation using r,0.5564,4.555969,Johns Hopkins University
1,s12n~FLy-fOLlQ2auYW-Tay7VEA,advanced tableau,0.5387,4.529412,Corporate Finance Institute
2,s12n~P9vATaxqQpKfPM3vYkCYjw,comptia security full training guide,0.5151,4.136364,Packt
3,course~7dBeI8uGEeWoOwoxwyRjgQ,recovering humankind past saving universal heritage,0.506,4.408638,Sapienza University of Rome
4,course~sK74dCWgEeW8-A6tkjXxWQ,leadership century organization,0.4173,4.755832,Copenhagen Business School
5,s12n~8xjb0SjZT32EQ_gU2mscFw,ibm java developer,0.4062,4.648842,"IBM, SkillUp EdTech"
6,course~Zqm9TFXfEeuDVRIP_gaKjw,dynamic programming greedy algorithm,0.3732,4.635135,University of Colorado Boulder
7,s12n~inm9VWBZEeigDQ6PCFb-lg,clinical_data science,0.3359,4.458898,University of Colorado System
8,s12n~fwiX71ycToGvHvSczQmgig,adp entrylevel payroll specialist,0.3073,4.722177,"Automatic Data Processing, Inc. (ADP)"
9,course~15Vdj48GEe6xzArT8MnIvQ,business_intelligence bi analyst capstone project,0.3073,4.735294,SkillUp EdTech


### Pipeline

In [173]:
import json
from scipy.sparse import hstack, save_npz, load_npz

In [174]:
def content_recommendation(user_id, course_id, save_files=True):
    tfidf_title = TfidfVectorizer(stop_words='english')
    tfidf_matrix_title = tfidf_title.fit_transform(clean_df['title'])

    tfidf_skills = TfidfVectorizer(stop_words='english')
    tfidf_matrix_skills = tfidf_skills.fit_transform(clean_df['skills'])

    tfidf_organization = TfidfVectorizer(stop_words='english')
    tfidf_matrix_organization = tfidf_organization.fit_transform(clean_df['organization'])

    scaler = MinMaxScaler()
    numerical_features = ['course_rating', 'review_count']
    numerical_normalized = scaler.fit_transform(clean_df[numerical_features])

    combined_features = hstack([
        tfidf_matrix_title,
        tfidf_matrix_skills,
        tfidf_matrix_organization,
        numerical_normalized
    ])

    cosine_sim = cosine_similarity(combined_features)

    if save_files:
        save_npz('tfidf_title_matrix.npz', tfidf_matrix_title)
        save_npz('tfidf_skills_matrix.npz', tfidf_matrix_skills)
        save_npz('tfidf_organization_matrix.npz', tfidf_matrix_organization)
        save_npz('combined_features_matrix.npz', combined_features)

        numerical_df = pd.DataFrame(numerical_normalized, columns=numerical_features)
        numerical_df.to_csv('numerical_features_normalized.csv', index=False)

        similarity_df = pd.DataFrame(cosine_sim, index=clean_df['course_id'], columns=clean_df['course_id'])
        similarity_df.to_csv('content_similarity_matrix.csv')

        vocabularies = {
            'title_vocabulary': tfidf_title.vocabulary_,
            'skills_vocabulary': tfidf_skills.vocabulary_,
            'organization_vocabulary': tfidf_organization.vocabulary_
        }
        with open('tfidf_vocabularies.json', 'w') as f:
            json.dump(vocabularies, f, indent=2)

    course_id_to_index = {course_id: idx for idx, course_id in enumerate(clean_df['course_id'])}
    course_index = course_id_to_index[course_id]

    similarity_scores = cosine_sim[course_index]
    course_similarities = pd.Series(similarity_scores, index=clean_df['course_id'])

    top_similar = course_similarities.sort_values(ascending=False).head(11)[1:]

    recommendations = []
    for cid, similarity_score in top_similar.items():
        course_info = clean_df[clean_df['course_id'] == cid].iloc[0]
        recommendations.append({
            'course_id': cid,
            'title': course_info['title'],
            'similarity_score': round(similarity_score, 4),
            'course_rating': course_info['course_rating'],
            'organization': course_info['organization']
        })

    content_recommendations_df = pd.DataFrame(recommendations)

    if save_files:
        content_recommendations_df.to_csv('content_recommendations.csv', index=False)

    return content_recommendations_df

In [175]:
def collaborative_recommendation(user_id, course_id, save_files=True):
    course_user_merged = pd.merge(synthetic_user_df, clean_df, on='course_id', how='left')
    ratings_matrix = synthetic_user_df.pivot_table(index=['course_id'],columns=['user_id'],values='user_rating', fill_value=0)

    course_similarity = cosine_similarity(ratings_matrix.values)
    np.fill_diagonal(course_similarity, 0)

    course_similarity_df = pd.DataFrame(course_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

    if save_files:
        ratings_matrix.to_csv('user_ratings_matrix.csv')

        course_similarity_df.to_csv('collaborative_similarity_matrix.csv')

        user_interactions = {}
        for user in synthetic_user_df['user_id'].unique():
            user_data = synthetic_user_df[synthetic_user_df['user_id'] == user]
            user_interactions[user] = {
                'interactions': user_data[['course_id', 'is_click', 'is_enroll', 'user_rating']].to_dict('records')
            }

        with open('user_interactions.json', 'w') as f:
            json.dump(user_interactions, f, indent=2)

    course_similarities = course_similarity_df.loc[course_id].sort_values(ascending=False)
    top_similar = course_similarities.sort_values(ascending=False).head(11)[1:]

    recommendations = []
    for cid, similarity_score in top_similar.items():
        course_info = clean_df[clean_df['course_id'] == cid]
        if not course_info.empty:
            recommendations.append({
                'course_id': cid,
                'title': course_info['title'].iloc[0],
                'similarity_score': round(similarity_score, 4),
                'course_rating': course_info['course_rating'].iloc[0],
                'organization': course_info['organization'].iloc[0]
            })

    collaborative_recommendations_df = pd.DataFrame(recommendations)

    if save_files:
        collaborative_recommendations_df.to_csv('collaborative_recommendations.csv', index=False)

    return collaborative_recommendations_df

In [176]:
def hybrid_recommendation(user_id, course_id, content_weight, collab_weight, save_files=True):
    tfidf_title = TfidfVectorizer(stop_words='english')
    tfidf_matrix_title = tfidf_title.fit_transform(clean_df['title'])

    tfidf_skills = TfidfVectorizer(stop_words='english')
    tfidf_matrix_skills = tfidf_skills.fit_transform(clean_df['skills'])

    tfidf_organization = TfidfVectorizer(stop_words='english')
    tfidf_matrix_organization = tfidf_organization.fit_transform(clean_df['organization'])

    scaler = MinMaxScaler()
    numerical_features = ['course_rating', 'review_count']
    numerical_normalized = scaler.fit_transform(clean_df[numerical_features])

    combined_features = hstack([
        tfidf_matrix_title,
        tfidf_matrix_skills,
        tfidf_matrix_organization,
        numerical_normalized
    ])

    content_cosine_sim = cosine_similarity(combined_features)

    course_user_merged = pd.merge(synthetic_user_df, clean_df, on='course_id', how='left')
    ratings_matrix = synthetic_user_df.pivot_table(index=['course_id'],columns=['user_id'],values='user_rating', fill_value=0)

    course_similarity = cosine_similarity(ratings_matrix.values)
    np.fill_diagonal(course_similarity, 0)

    course_similarity_df = pd.DataFrame(course_similarity, index=ratings_matrix.index, columns=ratings_matrix.index)

    if course_id in course_similarity_df.index:
        collab_similarities = course_similarity_df.loc[course_id]
    else:
        collab_similarities = pd.Series(0, index=clean_df['course_id'])

    course_id_to_index = {cid: idx for idx, cid in enumerate(clean_df['course_id'])}
    course_index = course_id_to_index[course_id]
    content_similarities = pd.Series(content_cosine_sim[course_index], index=clean_df['course_id'])

    hybrid_scores = {}
    for cid in clean_df['course_id']:
        if cid != course_id:
            content_score = content_similarities.get(cid, 0)
            collab_score = collab_similarities.get(cid, 0)

            hybrid_score = ((content_weight * content_score) + (collab_weight * collab_score))
            hybrid_scores[cid] = hybrid_score

    if save_files:
        hybrid_matrix = pd.DataFrame(index=clean_df['course_id'], columns=clean_df['course_id'])
        for i, cid1 in enumerate(clean_df['course_id']):
            for j, cid2 in enumerate(clean_df['course_id']):
                if cid1 != cid2:
                    content_score = content_cosine_sim[i][j]
                    collab_score = course_similarity_df.loc[cid1, cid2] if cid1 in course_similarity_df.index and cid2 in course_similarity_df.columns else 0
                    hybrid_score = (content_weight * content_score) + (collab_weight * collab_score)
                    hybrid_matrix.loc[cid1, cid2] = hybrid_score
                else:
                    hybrid_matrix.loc[cid1, cid2] = 0

        hybrid_matrix.to_csv('hybrid_similarity_matrix.csv')

        hybrid_params = {
            'content_weight': content_weight,
            'collaborative_weight': collab_weight,
            'user_id': user_id,
            'course_id': course_id,
            'timestamp': pd.Timestamp.now().isoformat()
        }

        with open('hybrid_parameters.json', 'w') as f:
            json.dump(hybrid_params, f, indent=2)

    sorted_recommendations = sorted(hybrid_scores.items(), key=lambda x: x[1], reverse=True)

    recommendations = []
    for cid, score in sorted_recommendations:
        course_info = clean_df[clean_df['course_id'] == cid]
        if not course_info.empty:
            recommendations.append({
                'course_id': cid,
                'title': course_info['title'].iloc[0],
                'hybrid_score': round(score, 4),
                'course_rating': course_info['course_rating'].iloc[0],
                'organization': course_info['organization'].iloc[0]
            })
            if len(recommendations) >= 10:
                break

    hybrid_recommendations_df = pd.DataFrame(recommendations)

    if save_files:
        hybrid_recommendations_df.to_csv('hybrid_recommendations.csv', index=False)

    return hybrid_recommendations_df

In [177]:
sample_course_id = clean_df['course_id'].iloc[1]
sample_course_name = clean_df['title'].iloc[1]
sample_user_id = synthetic_user_df['user_id'].iloc[1]

content_weight = 0.6
collab_weight = 0.4

print(f"Sample Course ID: {sample_course_id}")
print(f"Sample Course Name: {sample_course_name}")
print(f"Sample User ID: {sample_user_id}")

print(f"Sample User Preferences for {sample_user_id}: ")
synthetic_user_df[synthetic_user_df['user_id'] == sample_user_id]

Sample Course ID: s12n~Dy6K-2UKEe2PIRJn6nL9pQ
Sample Course Name: google cybersecurity
Sample User ID: user_1
Sample User Preferences for user_1: 


Unnamed: 0,user_id,course_id,is_click,is_enroll,user_rating,interaction_date,title
0,user_1,course~nUm5Ry-iEeWUWxIOAnoaFQ,False,False,,2025-01-05,clinical kidney pancreas islet transplantation
1,user_1,course~fi8dQEAhEe-xLxIpRr2Opw,False,False,,2025-10-25,strategy game_theory management
2,user_1,course~JlKWCXH0Ee-Quwr_1ABTmQ,False,False,,2025-01-03,sustainability reporting regulation implementation
3,user_1,course~kPVg0fKPEe-lDA7RWcxbZw,True,False,,2025-06-20,analysis shallow foundation
4,user_1,course~jvtkQ7rdEeqBKA5NvH_vlw,True,False,,2025-07-13,measuring maximizing impact covid contact tracing
5,user_1,s12n~FxNjYF2rEeytrA6P1gaieQ,True,False,,2025-10-02,fundamental flight mechanic
6,user_1,s12n~KUeTF8VkEeumMBLHmUEnqw,True,False,,2025-03-03,omnichannel retail strategy
7,user_1,course~111igE2DEe6t5QqZGaqvvw,False,False,,2025-09-26,operation excellence
8,user_1,s12n~iQ74s3ezTiaqBIxAvADy5g,False,False,,2025-08-10,oracle database_administration zero hero
9,user_1,s12n~lXyKQUpTSeKvQZHVE_bROA,True,True,3.0,2025-09-14,grant writing health researcher


In [178]:
content_recommendation(sample_user_id, sample_course_id, save_files=False)

Unnamed: 0,course_id,title,similarity_score,course_rating,organization
0,s12n~oEiB92pjRHOlsATyw2FzuQ,google cloud cybersecurity,0.6625,4.593293,Google Cloud
1,course~f6gZrWUIEe2piwrmyBNtEQ,foundation cybersecurity,0.6601,4.852938,Google
2,s12n~fq9UWMbTEeqpthJ2RmWGow,google project_management,0.6033,4.842615,Google
3,s12n~7_nEU3iaEeiVXgoT1iWlYg,google automation python,0.6031,4.750087,Google
4,course~-TDPq2UJEe2piwrmyBNtEQ,put work prepare cybersecurity job,0.6001,4.848463,Google
5,s12n~7lHCSlFIEeeffRIHljDI_g,google support,0.5969,4.783729,Google
6,s12n~elzGL0l-EeyHXRKqb0U9Hw,google business_intelligence,0.5957,4.759131,Google
7,course~7LHOTGUJEe21jBLFGcIQ1w,automate cybersecurity task python,0.5882,4.766086,Google
8,s12n~kr43OcbTEeqeNBKhfgCLyw,google data analytics,0.5863,4.76339,Google
9,s12n~Z-5wCcbTEeqeNBKhfgCLyw,google ux design,0.5829,4.815436,Google


In [179]:
collaborative_recommendation(sample_user_id, sample_course_id, save_files=False)

Unnamed: 0,course_id,title,similarity_score,course_rating,organization
0,course~-1YwAnTLEeSjmyIAC0aXFg,supply_chain_management learning perspective,0.686,4.705948,Korea Advanced Institute of Science and Technology(KAIST)
1,course~YReFaj2LEeiTjQ4kFvy-Pg,density functional theory,0.416,4.869198,École Polytechnique
2,course~uYPONw9REeeYSwoRrxflRA,life health radiation,0.3922,4.868347,The University of Sydney
3,course~3oznDIM5EeeW1A7RTAITCg,ioefficient algorithm,0.378,4.616667,EIT Digital
4,course~Afw0WV4PEe6sPwqSOa_9fQ,advanced data_analysis collaboration qlik sense,0.3706,5.0,Coursera Instructor Network
5,course~x_7YCR_1Ee6lxgqp2XjTZQ,responsible medication prescribing older adult,0.3638,4.806452,Icahn School of Medicine at Mount Sinai
6,s12n~_Psa5V0DEeiCpAqWIh3KHA,music business,0.2774,4.842049,Berklee
7,course~UR_feZ1MEeq-VApbJVRrTw,increase seo traffic wordpress,0.2425,4.507653,Coursera Project Network
8,s12n~UIEF9fgAQn-w0vtnJcYF7Q,mental_health care fundamental,0.2132,4.786692,MedCerts
9,s12n~pofy9I8EEeqAihIVddil_w,ibm cybersecurity analyst,0.2063,4.596965,IBM


In [180]:
hybrid_recommendation(sample_user_id, sample_course_id, content_weight, collab_weight, save_files=False)

Unnamed: 0,course_id,title,hybrid_score,course_rating,organization
0,s12n~VEskWWftEeeRcArRQWecQg,introduction discrete mathematics computer_science,0.4158,4.463896,University of California San Diego
1,course~-1YwAnTLEeSjmyIAC0aXFg,supply_chain_management learning perspective,0.4107,4.705948,Korea Advanced Institute of Science and Technology(KAIST)
2,s12n~oEiB92pjRHOlsATyw2FzuQ,google cloud cybersecurity,0.3975,4.593293,Google Cloud
3,course~f6gZrWUIEe2piwrmyBNtEQ,foundation cybersecurity,0.3961,4.852938,Google
4,s12n~fq9UWMbTEeqpthJ2RmWGow,google project_management,0.362,4.842615,Google
5,s12n~7_nEU3iaEeiVXgoT1iWlYg,google automation python,0.3619,4.750087,Google
6,course~-TDPq2UJEe2piwrmyBNtEQ,put work prepare cybersecurity job,0.36,4.848463,Google
7,s12n~7lHCSlFIEeeffRIHljDI_g,google support,0.3581,4.783729,Google
8,s12n~elzGL0l-EeyHXRKqb0U9Hw,google business_intelligence,0.3574,4.759131,Google
9,course~7LHOTGUJEe21jBLFGcIQ1w,automate cybersecurity task python,0.3529,4.766086,Google


### Result

In [181]:
!ls model/csv

collaborative_recommendations.csv    hybrid_recommendations.csv
collaborative_similarity_matrix.csv  hybrid_similarity_matrix.csv
content_recommendations.csv	     numerical_features_normalized.csv
content_similarity_matrix.csv	     user_ratings_matrix.csv


In [182]:
!ls model/json

hybrid_parameters.json	tfidf_vocabularies.json  user_interactions.json


In [183]:
!ls model/npz

combined_features_matrix.npz   tfidf_skills_matrix.npz
tfidf_organization_matrix.npz  tfidf_title_matrix.npz


In [184]:
collaborative_recommendation = pd.read_csv('model/csv/collaborative_recommendations.csv')
collaborative_recommendation

Unnamed: 0,course_id,title,similarity_score,course_rating,organization
0,course~T6G7p4vcEe--oAr_y6Q6Xw,capstone project applying business_analysis skill,0.8559,4.75,IBM
1,s12n~9FGXduzMEeaPUAr6dzYJCA,software_development lifecycle,0.5819,4.718841,University of Minnesota
2,s12n~3RDP-YtuEeixCgp16I3FUg,realtime embedded_systems,0.4305,3.925714,University of Colorado Boulder
3,course~B8E2c3blEear9RKoJLO5Cw,divide conquer sorting searching randomized algorithm,0.4305,4.762776,Stanford University
4,course~R-r2uwp-Eeuf7w5EwYPThw,start ux design_process empathize define ideate,0.3706,4.808546,Google
5,course~gftBfIl0EeqZsg72h675gw,supervised machine_learning regression,0.3101,4.67246,IBM
6,course~KiBHguBOEemNmw4gGersHg,machine translation,0.2378,4.5,Karlsruhe Institute of Technology
7,course~tGbJuel4EeSnMSIACzuFJw,introduction negotiation strategic playbook becoming principled persuasive negotiator,0.1961,4.847206,Yale University
8,course~ik8m6awxEeuLCQ6P36XDuw,serverless data_processing dataflow operation,0.1601,3.588235,Google Cloud
9,s12n~HT4T71eFSpyZj6gZNsoaHQ,generative ai customer_support,0.1569,4.718868,"IBM, SkillUp EdTech"
