In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import kaggle
import matplotlib.pyplot as plt
import re
import numpy as np


# Data Collection

In [3]:
# !pip install kaggle

In [4]:
# # Download dataset
# !kaggle datasets download -d khusheekapoor/edx-courses-dataset-2021 --unzip


In [5]:
# Load the dataset into a pandas dataframe
kaggle_df = pd.read_csv('../raw_data/EdX.csv')
kaggle_df.head(10)

Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...
5,Introduction to Project Management,University of Adelaide,Beginner,https://www.edx.org/course/introduction-to-pro...,Learn the principles of project management and...,Project management is an essential skill-set f...
6,Leading High-Performing Teams,The University of Queensland,Intermediate,https://www.edx.org/course/leading-high-perfor...,"Learn how to motivate, engage and empower peop...",This course will blend business theory and rea...
7,The Foundations of Happiness at Work,"University of California, Berkeley",Beginner,https://www.edx.org/course/the-foundations-of-...,Learn why happiness at work matters and how to...,This course explains what happiness at work lo...
8,"Managing Study, Stress and Mental Health at Un...",Curtin University,Beginner,https://www.edx.org/course/managing-study-stre...,Develop a clearer understanding of what mental...,This short course is a German-Australian colla...
9,Six Sigma: Define and Measure,Technische Universität München,Beginner,https://www.edx.org/course/six-sigma-define-an...,An introduction to the Six Sigma methodology a...,Understand the background and meaning of Six S...


# Data Cleaning

In [6]:
kaggle_df.columns

Index(['Name', 'University', 'Difficulty Level', 'Link', 'About',
       'Course Description'],
      dtype='object')

In [7]:
# Define keywords for the subjects we want
keywords = [
    "business", "finance", "entrepreneur", "marketing", "management",
    "math", "statistics", "calculus", "algebra", "probability",
    "computer science", "programming", "software", "coding", "data science",
    "machine learning", "AI", "artificial intelligence", "deep learning",
    "data analytics", "big data", "SQL", "Python", "R", "Excel",
    "design", "graphic design", "UX", "UI", "web design"
]

# Combine into regex pattern
pattern = re.compile(r"\b(" + "|".join(keywords) + r")\b", re.IGNORECASE)


In [8]:
# Apply filter: Keep rows where the course title or subject contains a keyword
filtered_df = kaggle_df[kaggle_df["Name"].astype(str).apply(lambda x: bool(pattern.search(x))) |
                 kaggle_df["Course Description"].astype(str).apply(lambda x: bool(pattern.search(x)))]

# Reset index after filtering
filtered_df.reset_index(drop=True, inplace=True)

filtered_df

Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t..."
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct..."
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab..."
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...
...,...,...,...,...,...,...
460,The Power of Data,Rolls-Royce,Beginner,https://www.edx.org/course/the-power-of-data,'The power of data’ is an interactive introduc...,Key course outcomes:High level overview of the...
461,Drones and Autonomous Systems 2: Applications ...,University of Maryland Global Campus-Universit...,Intermediate,https://www.edx.org/course/drones-and-autonomo...,Learn the latest applications of unmanned aeri...,If you want to be the technology specialist wh...
462,MathTrackX: Differential Calculus,University of Adelaide,Beginner,https://www.edx.org/course/mathtrackx-differen...,Discover concepts and techniques relating to d...,This course is part three of the MathTrackX XS...
463,Leaders in Citizen Security and Justice Manage...,Inter-American Development Bank,Intermediate,https://www.edx.org/course/leaders-in-citizen-...,"Learn about the latest in prevention, police a...",The high rates of crime and violence are two o...


In [9]:
# Define category mapping with keywords
category_keywords = {
    "Business": ["business", "finance", "entrepreneur", "management", "accounting", "economics", "project management", "leadership", "strategy", "investment"],
    
    "Math": [
        "math", "mathematics", "statistics", "calculus", "algebra", "geometry", "probability", "trigonometry",
        "differential equations", "linear algebra", "discrete math", "topology", "combinatorics", "set theory",
        "real analysis", "complex analysis", "abstract algebra", "number theory", "graph theory", "logic",
        "game theory", "measure theory", "mathematical modeling", "stochastic processes", "numerical analysis",
        "multivariable calculus", "optimization", "vector calculus", "applied mathematics"
    ],
    
    "Computer Science": [
        "computer science", "programming", "software", "coding", "java", "python", "C++", "AI",
        "artificial intelligence", "web development", "cs50", "technology", "algorithms", "autonomous systems",
        "systems programming", "cybersecurity", "blockchain", "cloud computing", "machine learning",
        "deep learning", "neural networks", "operating systems", "computational thinking", "networking",
        "computer architecture", "embedded systems", "database systems", "theory of computation"
    ],
    
    "Data Analytics": [
        "data analytics", "big data", "SQL", "machine learning", "deep learning", "data science",
        "excel", "r programming", "data", "predictive modeling", "business intelligence", "data mining",
        "data visualization", "data engineering", "time series analysis", "ETL", "hadoop", "spark"
    ],
    
    "Design": [
        "design", "graphic design", "ux", "ui", "web design", "visual", "animation", "illustration",
        "motion graphics", "product design", "typography", "brand design", "3D modeling", "video editing",
        "industrial design", "color theory", "interaction design"
    ],
    
    "Marketing": [
        "marketing", "advertising", "seo", "branding", "digital marketing", "social media",
        "consumer behavior", "market research", "public relations", "copywriting", "growth hacking",
        "email marketing", "content marketing", "performance marketing"
    ]
}


# Create regex patterns for each category
category_patterns = {category: re.compile(r"\b(" + "|".join(keywords) + r")\b", re.IGNORECASE)
                     for category, keywords in category_keywords.items()}

In [10]:

# Function to assign category based on keyword frequency
def assign_category(title, description):
    text = f"{title} {description}".lower()  # Combine title and description for matching
    category_counts = {category: len(pattern.findall(text)) for category, pattern in category_patterns.items()}
    
    # Get the category with the highest match count
    best_category = max(category_counts, key=category_counts.get)

    # If no keywords are found, default to "Other"
    return best_category if category_counts[best_category] > 0 else "Other"

# Apply function to DataFrame
filtered_df["Category"] = filtered_df.apply(lambda row: assign_category(row["Name"], row["Course Description"]), axis=1)

filtered_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["Category"] = filtered_df.apply(lambda row: assign_category(row["Name"], row["Course Description"]), axis=1)


Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description,Category
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t...",Design
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...,Computer Science
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct...",Computer Science
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab...",Data Analytics
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...,Marketing
...,...,...,...,...,...,...,...
460,The Power of Data,Rolls-Royce,Beginner,https://www.edx.org/course/the-power-of-data,'The power of data’ is an interactive introduc...,Key course outcomes:High level overview of the...,Data Analytics
461,Drones and Autonomous Systems 2: Applications ...,University of Maryland Global Campus-Universit...,Intermediate,https://www.edx.org/course/drones-and-autonomo...,Learn the latest applications of unmanned aeri...,If you want to be the technology specialist wh...,Business
462,MathTrackX: Differential Calculus,University of Adelaide,Beginner,https://www.edx.org/course/mathtrackx-differen...,Discover concepts and techniques relating to d...,This course is part three of the MathTrackX XS...,Math
463,Leaders in Citizen Security and Justice Manage...,Inter-American Development Bank,Intermediate,https://www.edx.org/course/leaders-in-citizen-...,"Learn about the latest in prevention, police a...",The high rates of crime and violence are two o...,Business


In [11]:
filtered_df.drop_duplicates(inplace=True)
filtered_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df.drop_duplicates(inplace=True)


Unnamed: 0,Name,University,Difficulty Level,Link,About,Course Description,Category
0,How to Learn Online,edX,Beginner,https://www.edx.org/course/how-to-learn-online,Learn essential strategies for successful onli...,"Designed for those who are new to elearning, t...",Design
1,Programming for Everybody (Getting Started wit...,The University of Michigan,Beginner,https://www.edx.org/course/programming-for-eve...,"This course is a ""no prerequisite"" introductio...",This course aims to teach everyone the basics ...,Computer Science
2,CS50's Introduction to Computer Science,Harvard University,Beginner,https://www.edx.org/course/cs50s-introduction-...,An introduction to the intellectual enterprise...,"This is CS50x , Harvard University's introduct...",Computer Science
3,The Analytics Edge,Massachusetts Institute of Technology,Intermediate,https://www.edx.org/course/the-analytics-edge,"Through inspiring examples and stories, discov...","In the last decade, the amount of data availab...",Data Analytics
4,Marketing Analytics: Marketing Measurement Str...,"University of California, Berkeley",Beginner,https://www.edx.org/course/marketing-analytics...,This course is part of a MicroMasters® Program,Begin your journey in a new career in marketin...,Marketing
...,...,...,...,...,...,...,...
460,The Power of Data,Rolls-Royce,Beginner,https://www.edx.org/course/the-power-of-data,'The power of data’ is an interactive introduc...,Key course outcomes:High level overview of the...,Data Analytics
461,Drones and Autonomous Systems 2: Applications ...,University of Maryland Global Campus-Universit...,Intermediate,https://www.edx.org/course/drones-and-autonomo...,Learn the latest applications of unmanned aeri...,If you want to be the technology specialist wh...,Business
462,MathTrackX: Differential Calculus,University of Adelaide,Beginner,https://www.edx.org/course/mathtrackx-differen...,Discover concepts and techniques relating to d...,This course is part three of the MathTrackX XS...,Math
463,Leaders in Citizen Security and Justice Manage...,Inter-American Development Bank,Intermediate,https://www.edx.org/course/leaders-in-citizen-...,"Learn about the latest in prevention, police a...",The high rates of crime and violence are two o...,Business


# Write to CSV

In [12]:
filtered_df.to_csv("../input_data/kaggle_filtered_courses.csv", index=False)