In [1]:
import pandas as pd
import ftfy
import spacy
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import ast
import pickle

  from .autonotebook import tqdm as notebook_tqdm


## Cleaning Data

In [2]:
# Read the raw course data into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='data/courses_data_raw.csv')
df.head(n=5)

Unnamed: 0,Title,Institution,Metadata,Rating,Skills,Description,Link,Category,Subcategory,Modules Name,Modules Description
0,.NET FullStack Developer,Board Infinity,Intermediate · Specialization · 1 - 3 Months,4.1,"HTML and CSS, Web Services, Javascript, Micros...",Build Fullstack webapp with .NET technologies....,https://www.coursera.org/specializations/dot-n...,Computer Science,Mobile and Web Development,"['.Net Full Stack Foundation', 'Frontend Devel...",['Build dynamic web applications using ASP.NET...
1,3D Interaction Design in Virtual Reality,University of London,Intermediate · Course · 1 - 4 Weeks,4.6,"Prototyping, Human Computer Interaction, Usabi...",This course is part ofVirtual Reality Speciali...,https://www.coursera.org/learn/3d-interaction-...,Computer Science,Design and Product,"['Interaction in VR', 'Moving around in VR', '...","[""Welcome to Week 1! In this week, we will cov..."
2,3D Modeling for 3D Printing and Laser Cutting ...,Packt,Intermediate · Course · 1 - 3 Months,5.0,"Autodesk, Engineering Tolerance, Visualization...",Instructor:Packt - Course Instructors,https://www.coursera.org/learn/packt-3d-modeli...,Physical Science and Engineering,Mechanical Engineering,"['Getting started', '3D Modeling Fundamentals'...","['In this module, we will explore the foundati..."
3,3D Printing and Additive Manufacturing,University of Illinois Urbana-Champaign,Beginner · Specialization · 3 - 6 Months,4.6,"3D Modeling, Hardware Troubleshooting, Design ...",Turn Your Ideas into Objects with 3D Printing....,https://www.coursera.org/specializations/3d-pr...,Computer Science,Design and Product,"['The 3D Printing Revolution', '3D Printing Ap...","['Obtain a rich understanding of 3D printing, ..."
4,3D Printing Hardware,University of Illinois Urbana-Champaign,Beginner · Course · 1 - 3 Months,4.2,"Electronic Components, Hardware Design, Manufa...",This course is part of3D Printing and Additive...,https://www.coursera.org/learn/3d-print-hardware,Business,Entrepreneurship,['Course Orientation & Module 1 Origins of Des...,"['In this welcome module, you will become fami..."


In [3]:
# Number of rows that exactly repeat an earlier row across every column.
duplicate_count = df.duplicated().sum()
print(f"Total duplicate rows: {duplicate_count}")

Total duplicate rows: 0


The Metadata column packs three attributes (Level, Type, and Duration) into a single string, separated by a middle dot (·). For all three attributes to be present, a row must contain exactly two dots.

In [4]:
# Display the rows whose Metadata contains a single '·' separator.
df.loc[df['Metadata'].str.count('·').eq(1)]

Unnamed: 0,Title,Institution,Metadata,Rating,Skills,Description,Link,Category,Subcategory,Modules Name,Modules Description
66,Advanced iOS Development,Meta,Beginner · Course,,,Instructor:Taught by Meta Staff,https://www.coursera.org/learn/advanced-ios-de...,Computer Science,Mobile and Web Development,['When will I have access to the lectures and ...,['']
217,Android Mobile Lifecycle and Software Developm...,Meta,Beginner · Course,,,Instructor:Taught by Meta Staff,https://www.coursera.org/learn/android-mobile-...,Computer Science,Mobile and Web Development,['When will I have access to the lectures and ...,['']


Rows with only one dot indicate that one of the attributes is missing or the format is incomplete. Therefore, these rows are removed to maintain data consistency.

In [5]:
# Keep only rows whose Metadata holds all three attributes, i.e. exactly two
# '·' separators. Comparing with `== 2` (rather than `!= 1`) also removes rows
# with no separator at all and rows with a missing Metadata value (NaN never
# equals 2); the list-comprehension split in the next step cannot iterate NaN.
# `.copy()` makes the result an independent frame, so adding columns to it in
# later cells does not trigger SettingWithCopyWarning.
df = df[df['Metadata'].str.count('·') == 2].copy()

The value in the Metadata column is split into three attributes (Level, Type, and Duration), using the middle dot (·) as the separator.

In [6]:
# Level labels that may appear as one of the Metadata pieces
levels = ['Beginner', 'Intermediate', 'Advanced', 'Mixed']


def _has_digit(text):
    """Return True when at least one character of ``text`` is a digit."""
    return any(ch.isdigit() for ch in text)


def _first_matching(pieces, predicate):
    """Return the first item of ``pieces`` accepted by ``predicate``, or None."""
    for piece in pieces:
        if predicate(piece):
            return piece
    return None


# Break each Metadata string on '·' and trim whitespace around every piece
df['Metadata_Split'] = df['Metadata'].str.split('·').apply(
    lambda pieces: [piece.strip() for piece in pieces]
)

# Level: the first piece that is one of the known level labels
df['Level'] = df['Metadata_Split'].apply(
    lambda pieces: _first_matching(pieces, lambda piece: piece in levels)
)

# Duration: the first piece that contains a digit
df['Duration'] = df['Metadata_Split'].apply(
    lambda pieces: _first_matching(pieces, _has_digit)
)

# Type: the first piece that is neither a level label nor digit-bearing
df['Type'] = df['Metadata_Split'].apply(
    lambda pieces: _first_matching(
        pieces, lambda piece: piece not in levels and not _has_digit(piece)
    )
)

# The combined column and its intermediate split are no longer needed
df = df.drop(columns=['Metadata', 'Metadata_Split'])

In [7]:
# Per-column count of cells that are either NaN or an empty string
(df.isna() | df.eq('')).sum()

Title                    0
Institution              0
Rating                 871
Skills                   2
Description              0
Link                     0
Category                 0
Subcategory              0
Modules Name             0
Modules Description      0
Level                    0
Duration                 0
Type                     0
dtype: int64

In [8]:
# Discard courses whose Skills value is missing (NaN).
df = df[df['Skills'].notna()].copy()

In [9]:
# Tidy the course description in one pass:
#   1. restore the space that is missing after "This course is part of";
#   2. blank out descriptions that start with an instructor credit
#      ("Instructors:" or "Instructor:");
#   3. remove a trailing "Learn more", together with any period/whitespace
#      directly before it.
df['Description'] = (
    df['Description']
    .str.replace(r'(This course is part of)(\S)', r'\1 \2', regex=True)
    .apply(
        lambda text: ''
        if isinstance(text, str) and text.startswith(('Instructors:', 'Instructor:'))
        else text
    )
    .str.replace(r'\.?\s*Learn more$', '', regex=True)
)

In [10]:
# Replace missing ratings with an explicit placeholder. After this step the
# Rating column can hold both the original rating values and the string
# 'No rating'.
df['Rating'] = df['Rating'].apply(lambda x: 'No rating' if pd.isna(x) else x)

# Replace missing descriptions with a placeholder. Blank strings count as
# missing too: the description clean-up sets instructor-only descriptions to
# '' (not NaN), so a NaN-only check would never apply the placeholder to them.
df['Description'] = df['Description'].apply(
    lambda x: 'No Description'
    if pd.isna(x) or (isinstance(x, str) and not x.strip())
    else x
)

In [11]:
# Fix encoding issues in every string cell; non-string cells pass through.
# DataFrame.applymap is deprecated in favour of DataFrame.map (pandas >= 2.1),
# so use the new name when the installed pandas provides it and fall back to
# applymap on older versions. The check is made on the class so that a column
# that happens to be called 'map' cannot be mistaken for the method.
_fix_cell = lambda x: ftfy.fix_text(x) if isinstance(x, str) else x
df = df.map(_fix_cell) if hasattr(pd.DataFrame, 'map') else df.applymap(_fix_cell)

  df = df.applymap(lambda x: ftfy.fix_text(x) if isinstance(x, str) else x)


In [12]:
# Write the cleaned catalogue to CSV: only the listed columns, in this order,
# without the index, encoded as UTF-8 with a BOM.
df.to_csv(
    "data/courses_data.csv",
    columns=[
        'Title', 'Institution', 'Type', 'Level', 'Duration', 'Description',
        'Rating', 'Category', 'Subcategory', 'Skills',
        'Modules Name', 'Modules Description',
    ],
    index=False,
    encoding="utf-8-sig",
)

In [13]:
def _parse_list_literal(value):
    """Convert a stringified Python literal (e.g. "['a', 'b']") into the object it denotes.

    Values that are not strings are returned unchanged, so re-running this
    cell after the columns have already been parsed does not raise.
    """
    return ast.literal_eval(value) if isinstance(value, str) else value


# The module columns are stored as text that looks like Python lists;
# turn them into real lists so they serialise as JSON arrays.
for _column in ("Modules Name", "Modules Description"):
    df[_column] = df[_column].apply(_parse_list_literal)

# Write plain UTF-8 without a byte-order mark: a leading BOM is not valid JSON
# and Python's json module refuses to parse text that starts with one.
# Readers that open the file with 'utf-8-sig' still work, since that codec
# accepts input without a BOM.
with open("data/courses_data.json", "w", encoding="utf-8") as f:
    f.write(df.to_json(orient="records", force_ascii=False))

## Tokenization

In [17]:
# spaCy pipeline that clean_text uses for tokens, lemmas and the
# stop-word / punctuation flags.
nlp = spacy.load("en_core_web_trf")

# Lemmas that clean_text removes from the text in addition to stop words
# and punctuation.
generic_terms = {
    'final',
    'assessment',
    'submission',
    'week',
    'course',
    'module',
    'introduction',
    'intro',
    'chapter',
    'phase',
    'lesson',
    'welcome',
    'foundations',
}

def clean_text(text):
    """Reduce free text to a space-separated string of meaningful lemmas.

    Args:
        text: The text to clean. Anything that is not a string (None, NaN,
            numbers, lists, ...) is treated as having no content.

    Returns:
        The lemmas of the lower-cased text joined by single spaces, with stop
        words, punctuation, whitespace tokens and ``generic_terms`` removed.
        Returns '' for non-string input, blank strings and the
        'No Description' placeholder (compared case-insensitively).
    """
    # Check the type first: calling pd.isna on a list/array returns an array,
    # whose truth value is ambiguous and would raise. A str is never NA, so
    # the isinstance check alone also covers None and NaN.
    if not isinstance(text, str):
        return ''

    stripped = text.strip()
    if not stripped or stripped.lower() == 'no description':
        return ''

    doc = nlp(text.lower())

    # Keep meaningful tokens only. Whitespace tokens (spaCy emits them for
    # newlines and runs of spaces) are neither stop words nor punctuation, so
    # they must be excluded explicitly or they end up in the joined output.
    tokens = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
        and token.lemma_ not in generic_terms
    ]

    return ' '.join(tokens)

In [21]:
def _clean_text_items(items):
    """Run clean_text on every string in ``items`` and join the results with spaces.

    Returns '' when ``items`` is not a list; non-string elements are skipped.
    """
    if not isinstance(items, list):
        return ''
    return ' '.join([clean_text(item) for item in items if isinstance(item, str)])


# Main course description
df['Clean_Description'] = df['Description'].apply(clean_text)

# Module names and module descriptions: each is a list cleaned item by item
df['Clean_Module_Name'] = df['Modules Name'].apply(_clean_text_items)
df['Clean_Module_Description'] = df['Modules Description'].apply(_clean_text_items)

# Concatenate the cleaned fields and the skills with '. ' between them, then
# lower-case the result and trim surrounding whitespace.
df['Token_Text'] = (
    df['Clean_Description'].fillna('')
    .str.cat(
        [
            df['Clean_Module_Name'].fillna(''),
            df['Clean_Module_Description'].fillna(''),
            df['Skills'].fillna(''),
        ],
        sep='. ',
    )
    .str.lower()
    .str.strip()
)

## Vectorization

In [23]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model and store a copy of it
# under models/sentence_model.
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
model.save(path='models/sentence_model')

In [24]:
# Encode each course's Token_Text. The texts are passed as a plain list, so
# the i-th embedding belongs to the i-th row of df by position (not by index
# label).
embeddings = model.encode(list(df['Token_Text']), show_progress_bar=True)

# Persist the embeddings with pickle.
with open('course_embeddings.pkl', 'wb') as embeddings_file:
    pickle.dump(embeddings, embeddings_file)

Batches: 100%|██████████| 105/105 [00:28<00:00,  3.62it/s]
