In [1]:
import os
import re
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

In [2]:
# Load CSV file
# The location can be overridden with the COURSE_DATA_CSV environment variable
# so the notebook is not tied to one machine's directory layout; when the
# variable is unset, the original absolute path is used unchanged.
DEFAULT_COURSE_DATA_CSV = "/Users/claudiapastores/Documents/FHTW_MWI/SS-2025-DDP/Master-Thesis/scraper/master_programs_data_cleaned.csv"
course_data_csv = os.environ.get("COURSE_DATA_CSV", DEFAULT_COURSE_DATA_CSV)
df = pd.read_csv(course_data_csv)

# Keep only the non-missing entries of the "Course Title" column, as strings.
course_titles = df["Course Title"].dropna().astype(str).tolist()

# Keyword check: collect every course title that contains the keyword as a
# substring, ignoring case, then print the hits followed by their count.
keyword_to_check = "innovation"
keyword_lowered = keyword_to_check.lower()

matching_courses = []
for candidate_title in course_titles:
    if keyword_lowered in candidate_title.lower():
        matching_courses.append(candidate_title)

print(f"\nCourses containing the keyword '{keyword_to_check}':")
for hit in matching_courses:
    print("-", hit)

print(f"\nTotal matches: {len(matching_courses)}")

# Words excluded from the frequency count: function words plus generic
# course-title vocabulary.
stopwords = {
    'to', 'and', 'of', 'in', 'for', 'on', 'with', 'the', 'a', 'an',
    'introduction', 'advanced', 'principles', 'i', 'ii', 'iii', 'basics',
    'course', 'topics', 'project', 'seminar', 'block', 'lecture',
}

# Split every lowercased title into word tokens and keep the tokens that are
# longer than two characters and are not stopwords.
# NOTE(review): the length filter also discards two-letter tokens such as
# "ai" or "it" -- confirm that this is intended for the analysis.
word_pattern = re.compile(r'\b\w+\b')
tokens = [
    token
    for course_title in course_titles
    for token in word_pattern.findall(course_title.lower())
    if len(token) > 2 and token not in stopwords
]

# Count the remaining tokens and keep the 50 most common (word, count) pairs.
word_freq = Counter(tokens)
top_words = word_freq.most_common(50)


Courses containing the keyword 'innovation':
- Digital Innovation
- Digital Innovation
- IT Innovation and Change ITIaC Discontinued*

Total matches: 3


In [17]:
# Word frequency table: the first ten (word, count) pairs of `top_words`,
# shown in the notebook and written to a CSV file in the working directory.
ten_most_frequent = top_words[:10]
df_top_words = pd.DataFrame(data=ten_most_frequent, columns=["Word", "Frequency"])

print("\nMOST FREQUENT WORDS IN COURSE TITLES")
display(df_top_words)

df_top_words.to_csv(path_or_buf="word_frequencies.csv", index=False)


MOST FREQUENT WORDS IN COURSE TITLES


Unnamed: 0,Word,Frequency
0,computer,46
1,systems,36
2,data,31
3,software,25
4,learning,21
5,algorithms,20
6,security,20
7,development,19
8,programming,17
9,design,15


In [5]:
# Initial category mapping: category name -> keywords that place a word in
# that category. Insertion order is significant to the matching step, which
# stops at the first category whose list contains the word.
category_mapping = {
    "Programming": [
        "programming", "development", "design", "software",
    ],
    "Data Science": [
        "data", "learning", "algorithms", "mining", "analysis",
    ],
    "Artificial Intelligence": [
        "intelligence", "artificial", "intelligent",
    ],
    "Systems and Architecture": [
        "systems", "computer", "architecture",
    ],
    "Security and Privacy": [
        "security", "cyber",
    ],
    "Business and Management": [
        "management", "business", "decision",
    ],
    "User Experience and Interaction": [
        "human", "interaction", "interface",
    ],
    "Web Technologies": [
        "web", "internet", "stack", "technologies",
    ],
    "Cloud and Distributed Systems": [
        "distributed", "computing", "parallel", "cloud",
    ],
    "Computational and Optimization Methods": [
        "computation", "numerical", "optimization", "theory", "methods",
    ],
    "Visual Computing": [
        "graphics", "vision", "image", "animation",
    ],
    "Research Skills and Thesis": [
        "thesis", "research", "capstone", "master",
    ],
}

# Assign each top word to the first category (in mapping order) whose keyword
# list contains it; words listed by no category are collected separately.
grouped_keywords = defaultdict(list)
unmatched_keywords = []

for word, _ in top_words:
    owning_category = next(
        (name for name, members in category_mapping.items() if word in members),
        None,
    )
    if owning_category is None:
        unmatched_keywords.append(word)
    else:
        grouped_keywords[owning_category].append(word)

# Convert to DataFrame: one row per matched category, with that category's
# keywords de-duplicated, sorted alphabetically and joined into one string.
category_data = [
    {"Category": category, "Keywords": ", ".join(sorted(set(keywords)))}
    for category, keywords in grouped_keywords.items()
]

# The columns are passed explicitly so the frame, and therefore the CSV header,
# still has "Category" and "Keywords" when no top word matched any category;
# an empty record list alone would produce a frame without columns.
df_categories = pd.DataFrame(category_data, columns=["Category", "Keywords"])

# Output
print("CATEGORY & KEYWORDS")
print(df_categories.to_string(index=False))

print("\nUNMATCHED KEYWORDS")
print(", ".join(unmatched_keywords))

# Save results
df_categories.to_csv("category_keywords.csv", index=False)

CATEGORY & KEYWORDS
                              Category                                     Keywords
              Systems and Architecture              architecture, computer, systems
                          Data Science algorithms, analysis, data, learning, mining
                           Programming   design, development, programming, software
                  Security and Privacy                              cyber, security
         Cloud and Distributed Systems                       computing, distributed
                      Web Technologies                         internet, stack, web
       User Experience and Interaction                           human, interaction
               Business and Management               business, decision, management
Computational and Optimization Methods      computation, methods, numerical, theory
               Artificial Intelligence        artificial, intelligence, intelligent
            Research Skills and Thesis                  