In [1]:
import numpy as np
import pandas as pd
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Load the raw edX course catalogue (path is relative to the notebook's working directory).
df = pd.read_csv("dataset/edx_courses.csv")
df.info()

# drop_duplicates() returns a new frame instead of modifying `df`, so the result
# has to be assigned for the duplicates to actually be removed. The index is
# rebuilt so that index labels keep matching row positions, which the
# similarity-matrix lookups further down rely on.
df = df.drop_duplicates().reset_index(drop=True)

# Per-column count of missing values, used to decide which columns need filling.
missing_values = df.isnull().sum()
print(missing_values)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 975 entries, 0 to 974
Data columns (total 16 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               975 non-null    object
 1   summary             922 non-null    object
 2   n_enrolled          855 non-null    object
 3   course_type         975 non-null    object
 4   institution         975 non-null    object
 5   instructors         972 non-null    object
 6   Level               975 non-null    object
 7   subject             975 non-null    object
 8   language            975 non-null    object
 9   subtitles           972 non-null    object
 10  course_effort       975 non-null    object
 11  course_length       975 non-null    object
 12  price               975 non-null    object
 13  course_description  935 non-null    object
 14  course_syllabus     414 non-null    object
 15  course_url          975 non-null    object
dtypes: object(16)
memory usage

In [2]:
# Free-text columns that contain nulls; they get a placeholder string so they
# can be used in string operations without producing NaN.
text_columns = ['summary', 'instructors', 'subtitles', 'course_description', 'course_syllabus']
df[text_columns] = df[text_columns].fillna('not available')

# Only the last expression of a cell is displayed, so a single trailing check is
# enough to confirm the fill. Columns outside `text_columns` (e.g. n_enrolled)
# are intentionally left untouched here.
df.isnull().sum()

title                   0
summary                 0
n_enrolled            120
course_type             0
institution             0
instructors             0
Level                   0
subject                 0
language                0
subtitles               0
course_effort           0
course_length           0
price                   0
course_description      0
course_syllabus         0
course_url              0
dtype: int64

In [3]:
# Build one space-separated document per course from these columns, in order.
text_parts = ['title', 'summary', 'course_description', 'course_url']
combined_text = df[text_parts[0]]
for part in text_parts[1:]:
  combined_text = combined_text + ' ' + df[part]
df['text'] = combined_text

# TF-IDF representation of every course document, English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text'])

## compute cosine similarity matrix
# linear_kernel is a plain dot product; it equals cosine similarity provided the
# TF-IDF rows are unit-normalised (presumably the vectorizer default - confirm
# against the scikit-learn documentation).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Persist the matrix as nested JSON lists.
cosine_sim_list = cosine_sim.tolist()
with open('cosine_sim.json', 'w') as out_file:
  json.dump(cosine_sim_list, out_file)

# Persist the course records (including the combined 'text' column) as JSON.
df.to_json('course.json', orient='records')

# save cosine similarity matrix
np.save('cosine_sim.npy', cosine_sim)

# save dataframe as csv
df.to_csv('edx_courses_processed.csv', index=False)

In [4]:
def get_recommendations_by_title(title, cosine_sim=cosine_sim, top_n=3):
  """Return the courses most similar to the course with the given title.

  Parameters
  ----------
  title : str
      Exact course title to look up in ``df['title']``. If several rows share
      the title, the first one is used.
  cosine_sim : array-like
      Square similarity matrix whose row/column ``i`` corresponds to row
      position ``i`` of ``df``. Defaults to the notebook-level matrix.
  top_n : int, default 3
      Non-negative number of recommendations to return.

  Returns
  -------
  pandas.DataFrame or str
      Up to ``top_n`` rows of ``df`` (title, summary, course_description,
      course_url) ordered by decreasing similarity, or a "not found" message
      string when no course has that title.
  """
  # Work with row *positions*, not index labels: the similarity matrix and
  # .iloc are both positional, so this stays correct for any DataFrame index.
  matches = np.flatnonzero((df['title'] == title).values)
  if matches.size == 0:
    return f"Course titled '{title}' not found"
  pos = int(matches[0])

  scores = np.asarray(cosine_sim[pos])
  # Stable descending sort: tied scores keep their original row order.
  ranked = np.argsort(-scores, kind='stable')
  # Drop the query course explicitly instead of assuming it is ranked first;
  # a course with identical text at a lower row would otherwise displace it
  # and the course would end up recommending itself.
  ranked = ranked[ranked != pos][:top_n]
  return df[['title', 'summary', 'course_description', 'course_url']].iloc[ranked]

In [9]:
# Example lookup: courses closest to one known title in the catalogue.
query_title = 'Network and Security Foundations'
recommended_courses = get_recommendations_by_title(query_title)
print(recommended_courses)

                                                 title  \
728                         IoT Networks and Protocols   
721  Introduction to Open Source Networking Technol...   
295                    Preparing to Network in English   

                                               summary  \
728  Learn about IoT networks and the protocols and...   
721  Learn technical fundamentals needed to adopt S...   
295  Learn the basics of networking while you grow ...   

                                    course_description  \
728  The Internet of Things (IoT) is expanding at a...   
721  Explore open source networking projects, from ...   
295  Networking in the business world is key to car...   

                                            course_url  
728  https://www.edx.org/course/iot-networks-and-pr...  
721  https://www.edx.org/course/introduction-to-ope...  
295  https://www.edx.org/course/preparing-to-networ...  


In [6]:
# List the distinct subject labels present in the catalogue.
subject_column = df['subject']
unique_values = subject_column.unique()
print(unique_values)

['Education & Teacher Training' 'Computer Science'
 'Data Analysis & Statistics' 'Business & Management' 'Communication'
 'Health & Safety' 'Math' 'Humanities' 'Ethics' 'Economics & Finance'
 'Food & Nutrition' 'Biology & Life Sciences' 'Social Sciences'
 'Architecture' 'Medicine' 'Environmental Studies' 'Chemistry'
 'Art & Culture' 'Language' 'Engineering' 'Philosophy & Ethics'
 'Electronics' 'History' 'Literature' 'Physics' 'Law' 'Design' 'Science'
 'Music' 'Energy & Earth Sciences' 'Philanthropy']


In [7]:
def get_recommenations_by_subject(subject, cosine_sim=cosine_sim, top_n=3):
  """Recommend similar courses for every course within a subject.

  Courses whose ``subject`` contains the given text (case-insensitive,
  literal substring match) are selected, a TF-IDF model is fitted on their
  ``text`` column only, and each course is paired with its most similar
  courses from the same subset.

  Parameters
  ----------
  subject : str
      Text to look for in ``df['subject']``.
  cosine_sim : array-like
      Unused; similarities are recomputed on the subject subset. Kept so
      existing callers that pass it keep working.
  top_n : int, default 3
      Non-negative number of recommendations per course.

  Returns
  -------
  dict or str
      Mapping of course title to a DataFrame (title, summary,
      course_description) of up to ``top_n`` similar courses, or a message
      string when no course matches the subject. Courses sharing a title
      share a key, so the later one overwrites the earlier one.
  """
  # regex=False: treat the subject as plain text so characters such as '+',
  # '(' or '.' in user input are matched literally instead of being parsed
  # as a regular expression.
  subject_mask = df['subject'].str.contains(subject, case=False, na=False, regex=False)
  subject_df = df[subject_mask].reset_index(drop=True)
  if subject_df.empty:
    return f"No courses found for the subject '{subject}'"

  # Similarities are recomputed within the subset, so term weights reflect
  # only the courses of this subject.
  subject_tfidf = TfidfVectorizer(stop_words='english')
  subject_matrix = subject_tfidf.fit_transform(subject_df['text'])
  cosine_sim_subject = linear_kernel(subject_matrix, subject_matrix)

  display_columns = ['title', 'summary', 'course_description']
  recommendations = {}
  for pos, course_title in enumerate(subject_df['title']):
    # Stable descending sort: tied scores keep their original row order.
    ranked = np.argsort(-cosine_sim_subject[pos], kind='stable')
    # Exclude the course itself explicitly rather than assuming it is
    # ranked first (ties with identical text would break that assumption).
    ranked = ranked[ranked != pos][:top_n]
    recommendations[course_title] = subject_df[display_columns].iloc[ranked]

  return recommendations


# Correctly spelled alias; the original (misspelled) name stays available for
# existing callers.
get_recommendations_by_subject = get_recommenations_by_subject

In [11]:
# Example: print the per-course recommendations for every course matching 'Math'.
subject_recommendations = get_recommenations_by_subject('Math')
for title, recs in subject_recommendations.items():
  print(f"Recommendations for course '{title}' ")
  # One blank line after each table, same as a separate empty print().
  print(recs, end='\n\n')

Recommendations for course 'Fat Chance: Probability from the Ground Up' 
                                                title  \
26                            MathTrackX: Probability   
23                             MathTrackX: Statistics   
25  A-level Mathematics for Year 13 - Course 2: Ge...   

                                              summary  \
26  Understand probability and how it manifests in...   
23  Understand fundamental concepts relating to st...   
25  Develop your thinking skills, fluency and conf...   

                                   course_description  
26  This course is part five of the MathTrackX XSe...  
23  This course is part six of the MathTrackX XSer...  
25  This course by Imperial College London is desi...  

Recommendations for course 'Introduction to Linear Models and Matrix Algebra' 
                                       title  \
11  The Math of Data Science: Linear Algebra   
22          Data for Effective Policy Making   
28        Linear Alge