In [1]:
import pandas as pd

# Read the edX course catalogue into a DataFrame.
# The path is relative to the notebook's working directory; replace it with
# the correct file path if the CSV lives elsewhere.
df = pd.read_csv('edx.csv')

# Quick inspection, printed one after another:
#   1. the first few rows,
#   2. the column names,
#   3. the number of missing values per column.
print(
    df.head(),
    df.columns,
    df.isnull().sum(),
    sep='\n',
)

                                                Name  \
0                                How to Learn Online   
1  Programming for Everybody (Getting Started wit...   
2            CS50's Introduction to Computer Science   
3                                 The Analytics Edge   
4  Marketing Analytics: Marketing Measurement Str...   

                              University Difficulty Level  \
0                                    edX         Beginner   
1             The University of Michigan         Beginner   
2                     Harvard University         Beginner   
3  Massachusetts Institute of Technology     Intermediate   
4     University of California, Berkeley         Beginner   

                                                Link  \
0     https://www.edx.org/course/how-to-learn-online   
1  https://www.edx.org/course/programming-for-eve...   
2  https://www.edx.org/course/cs50s-introduction-...   
3      https://www.edx.org/course/the-analytics-edge   
4  https://www.e

In [2]:
# Combine relevant text columns into a single feature.
# String concatenation propagates NaN, so a course that is missing its name,
# description or difficulty level ends up with a NaN combined_text.
df['combined_text'] = df['Name'] + ' ' + df['Course Description'] + ' ' + df['Difficulty Level']

# Drop rows with missing combined_text, then renumber the index 0..n-1.
# The TF-IDF and cosine-similarity matrices built from this frame are indexed
# by row *position*; without the reset, df.index keeps gaps where rows were
# dropped and its labels no longer line up with those positions.
df = df.dropna(subset=['combined_text']).reset_index(drop=True)

# Display the combined text
print(df['combined_text'].head())

0    How to Learn Online Designed for those who are...
1    Programming for Everybody (Getting Started wit...
2    CS50's Introduction to Computer Science This i...
3    The Analytics Edge In the last decade, the amo...
4    Marketing Analytics: Marketing Measurement Str...
Name: combined_text, dtype: object


In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Set up the TF-IDF vectorizer: English stop words are removed and the
# vocabulary is capped at 5000 terms.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
)

# Learn the vocabulary from the combined course text and vectorize it in one
# pass; row i of the result corresponds to the i-th row of df.
tfidf_matrix = tfidf.fit_transform(df['combined_text'])

# Sanity check: the shape is (num_courses, num_features)
print(tfidf_matrix.shape)

(720, 5000)


In [4]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of courses. With a single
# argument the matrix is compared against itself, so entry [i, j] is the
# similarity between course i and course j.
cosine_sim = cosine_similarity(tfidf_matrix)

# Sanity check: the shape is (num_courses, num_courses)
print(cosine_sim.shape)

(720, 720)


In [10]:
# Create a mapping from course title to the course's row *position* in df.
# cosine_sim is indexed by position (row i of the TF-IDF matrix is the i-th
# row of df), so the mapping must hold positions rather than df.index labels:
# labels are not guaranteed to be 0..n-1 once rows have been dropped, and a
# label used as a position silently selects a different course.
indices = pd.Series(range(len(df)), index=df['Name'])

# Keep only the first occurrence of each title so that a lookup always yields
# a single integer. (Series.drop_duplicates() would de-duplicate the *values*,
# which are already unique, and leave duplicate titles in place.)
indices = indices[~indices.index.duplicated(keep='first')]


# Function to get recommendations
def get_recommendations(Name, cosine_sim=cosine_sim, top_n=10):
    """Return the ``top_n`` courses most similar to the course titled ``Name``.

    Parameters
    ----------
    Name : str
        Exact course title as it appears in ``df['Name']``. If several rows
        share the title, the first one is used as the query.
    cosine_sim : array-like of shape (len(df), len(df))
        Pairwise similarity matrix whose rows and columns follow the row
        order of ``df``. Defaults to the matrix computed earlier in the
        notebook.
    top_n : int, default 10
        Maximum number of recommendations to return. Values below 1 yield an
        empty result.

    Returns
    -------
    pandas.DataFrame
        The recommended courses, most similar first, with the columns
        ``Name``, ``University``, ``Difficulty Level``, ``Link``, ``About``
        and ``Course Description``.

    Raises
    ------
    KeyError
        If ``Name`` is not a course title in the dataset.
    """
    if Name not in indices.index:
        raise KeyError(f"Course title not found in dataset: {Name!r}")

    # Positional row of the query course
    idx = int(indices[Name])

    # Pair every other course's position with its similarity to the query.
    # The query is excluded by position instead of by dropping the first
    # sorted entry: when another course ties at the top score, the first
    # entry is not necessarily the query itself.
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Most similar first; the sort is stable, so ties keep dataset order
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    # Positions of the top-N similar courses
    course_indices = [pos for pos, _ in sim_scores[:max(top_n, 0)]]

    # .iloc because course_indices are positions, matching cosine_sim
    return df[['Name', 'University', 'Difficulty Level', 'Link', 'About', 'Course Description']].iloc[course_indices]

In [17]:
# Smoke-test the recommender with a title that exists in the dataset
# (replace it with any other value from df['Name'] to try a different course).
course_title = "Python Basics for Data Science"

# Look up the most similar courses and show them
recommendations = get_recommendations(Name=course_title)
print(recommendations)

                                                  Name  \
575  Introduction to Cloud Development with HTML5, ...   
110            Programming for the Web with JavaScript   
104                            JavaScript Introduction   
125  CS50's Computer Science for Business Professio...   
445  Human-Computer Interaction I: Fundamentals & D...   
596  User Experience (UX) Design: Human Factors and...   
19   CS50's Introduction to Artificial Intelligence...   
276                                    Creative Coding   
382  Computer Vision Fundamentals with Watson and O...   
334                      Designing the User Experience   

                                            University Difficulty Level  \
575                                                IBM         Beginner   
110                         University of Pennsylvania     Intermediate   
104                The World Wide Web Consortium (W3C)         Beginner   
125                                 Harvard University       