In [2]:
import numpy as np
import pandas as pd

In [1]:
import kagglehub

# Fetch the most recent version of the edX courses dataset from Kaggle and
# keep the local folder that kagglehub reports for it.
path = kagglehub.dataset_download(
    "khusheekapoor/edx-courses-dataset-2021"
)

# Echo the location so the reader can see where the files ended up.
print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Path to dataset files: /home/ajtaboada/.cache/kagglehub/datasets/khusheekapoor/edx-courses-dataset-2021/versions/1


In [6]:
import os

# Work out where the dataset lives and which file holds the course table.
dataset_dir = path  # folder returned by the kagglehub download
csv_file = os.path.join(dataset_dir, 'EdX.csv')  # the only file in the listing

# Show the folder contents before loading anything from it.
files = os.listdir(dataset_dir)
print("Files in dataset directory:", files)

# Load the course table into a DataFrame.
data = pd.read_csv(csv_file)

Files in dataset directory: ['EdX.csv']


In [10]:
# Re-read the course table from disk so `data` starts from the raw file,
# then preview its first five rows.
data = pd.read_csv(csv_file)
data.head(n=5)

Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...


In [11]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Assuming 'data' is the course DataFrame loaded from EdX.csv.
#
# This cell produces four notebook-level objects used further down:
#   data         - cleaned course table with an extra 'combined_features' column
#   vectorizer   - TF-IDF vectorizer fitted on 'combined_features'
#   tfidf_matrix - one TF-IDF row per row of `data` (same order)
#   cosine_sim   - course-to-course cosine similarity matrix

# Step 1: Clean the data.
# Build a new frame rather than mutating with inplace=True, so re-running the
# cell never acts on a half-modified object.
data = data.dropna().drop_duplicates()  # drop incomplete rows, then exact duplicates

# Steps 2 + 3: Combine the text fields and normalize the combined text only.
# Lowercasing is applied to the combined string (so 'Name' is normalized like
# the other fields) and the original 'About' / 'Course Description' columns are
# left untouched for display. TfidfVectorizer lowercases its input by default,
# so the resulting TF-IDF features are the same as lowercasing the columns.
combined_text = (
    data['Name'] + ' ' + data['About'] + ' ' + data['Course Description']
).str.lower()
data = data.assign(combined_features=combined_text)

# Step 4: Vectorization
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['combined_features'])

# Later lookups are positional (data.iloc), so rows must line up one-to-one.
assert tfidf_matrix.shape[0] == len(data), "TF-IDF rows are out of sync with data"

# Step 5: Create a similarity matrix (course vs. course)
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# 'cosine_sim' holds course-to-course similarity; free-text queries are scored
# against 'tfidf_matrix' using the fitted 'vectorizer'.

In [14]:
# Print a heading followed by the course-to-course similarity matrix
# (large arrays are abbreviated with "..." in the printed form).
print("Cosine Similarity Matrix:", cosine_sim, sep="\n")

Cosine Similarity Matrix:
[[1.         0.10890569 0.16681949 ... 0.09328029 0.07217271 0.06965867]
 [0.10890569 1.         0.19848065 ... 0.09119756 0.10173518 0.13744779]
 [0.16681949 0.19848065 1.         ... 0.12668423 0.09110504 0.10398242]
 ...
 [0.09328029 0.09119756 0.12668423 ... 1.         0.08495621 0.10044155]
 [0.07217271 0.10173518 0.09110504 ... 0.08495621 1.         0.06437253]
 [0.06965867 0.13744779 0.10398242 ... 0.10044155 0.06437253 1.        ]]


In [16]:
# Function to recommend courses based on user input
def recommend_courses(user_input, data, cosine_sim, top_n=5):
    """Return the courses whose text is most similar to a free-text query.

    The query is transformed with the notebook-level ``vectorizer`` and scored
    against every row of the notebook-level ``tfidf_matrix``. Both globals must
    already exist, and ``tfidf_matrix`` must have been built from ``data`` so
    that row positions line up.

    Parameters
    ----------
    user_input : str
        Free-text description of what the learner is looking for.
    data : pandas.DataFrame
        Course table; row ``i`` must correspond to row ``i`` of
        ``tfidf_matrix``.
    cosine_sim : object
        Not used by this function. Kept so existing three-argument calls
        continue to work.
    top_n : int, default 5
        Maximum number of courses to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``data``, best match first. Courses with zero
        similarity to the query are never returned, so the result can be
        shorter than ``top_n`` or empty (e.g. blank query, or a query with no
        known vocabulary).

    Raises
    ------
    ValueError
        If ``data`` and ``tfidf_matrix`` have a different number of rows.
    """
    # Normalize the user input
    query = user_input.strip().lower()

    # Nothing to search for: return an empty frame with the same columns
    # instead of an arbitrary handful of courses.
    if not query or top_n <= 0:
        return data.iloc[0:0]

    # The lookup below is positional, so both objects must describe the same rows.
    if tfidf_matrix.shape[0] != len(data):
        raise ValueError(
            "data has %d rows but tfidf_matrix has %d; rebuild the TF-IDF "
            "matrix from this DataFrame" % (len(data), tfidf_matrix.shape[0])
        )

    # Vectorize the query with the already-fitted vocabulary
    user_tfidf = vectorizer.transform([query])

    # Similarity between the query and every course (single query row -> 1-D)
    scores = cosine_similarity(user_tfidf, tfidf_matrix)[0]

    # Highest score first; a stable sort keeps earlier rows first on ties so
    # the ranking is deterministic.
    ranked = (-scores).argsort(kind="stable")[:top_n]

    # Discard courses that share no vocabulary with the query
    ranked = ranked[scores[ranked] > 0]

    # Return the recommended courses
    return data.iloc[ranked]

# Try the recommender with a sample learner query
user_input = "I want to learn programming basics"
recommended_courses = recommend_courses(
    user_input,
    data=data,
    cosine_sim=cosine_sim,
)

# Show only the identifying columns of the matches
print("Recommended Courses:")
recommended_courses.loc[:, ['Name', 'University', 'Link']]

Recommended Courses:


Unnamed: 0,Name,University,Link
62,Basics of Computing and Programming,New York University,https://www.edx.org/course/basics-of-computing...
137,Programming Basics,IITBombay,https://www.edx.org/course/programming-basics
273,Object-Oriented Programming,IITBombay,https://www.edx.org/course/object-oriented-pro...
1,Programming for Everybody (Getting Started wit...,The University of Michigan,https://www.edx.org/course/programming-for-eve...
309,Analytics in Python,Columbia University,https://www.edx.org/course/analytics-in-python


In [18]:
# Second sample query, this time on a narrower topic
user_input = "I want to learn computer vision"
recommended_courses = recommend_courses(
    user_input,
    data=data,
    cosine_sim=cosine_sim,
)

# Show only the identifying columns of the matches
print("Recommended Courses:")
recommended_courses.loc[:, ['Name', 'University', 'Link']]

Recommended Courses:


Unnamed: 0,Name,University,Link
382,Computer Vision Fundamentals with Watson and O...,IBM,https://www.edx.org/course/computer-vision-fun...
306,Computer Hardware and Operating Systems,New York University,https://www.edx.org/course/computer-hardware-a...
137,Programming Basics,IITBombay,https://www.edx.org/course/programming-basics
69,Computer Science 101,Stanford University,https://www.edx.org/course/computer-science-101
629,Discover Political Science,Université catholique de Louvain,https://www.edx.org/course/discover-political-...
