### Purpose: Find Class Central courses that are similar to Skill Up courses by matching Skill Up keywords against Class Central course titles, so that popular Skill Up courses can be published on Class Central to generate leads.

In [1]:
# Import modules
import pandas as pd
import re
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore", "This pattern has match groups")

In [2]:
# Show which files sit in the notebook's working directory
os.listdir(os.curdir)

['classCentral_26Aug.xlsx',
 '.ipynb_checkpoints',
 'classcentralVsSkillupCourseSimilarity.ipynb',
 'skillUp.xlsx']

In [3]:
# Load the Class Central export into `df` and preview the first rows
classcentral_file = "classCentral_26Aug.xlsx"
df = pd.read_excel(classcentral_file)
df.head()

Unnamed: 0,courseTitle,courseLink,courseProvider,cat,unitSold,certificatePrice,freeOrAudit,review
0,Elements of AI,https://www.classcentral.com/course/independen...,University of Helsinki and Reaktor Education v...,Computer Science\n\n\n\n\nArtificial Intelligence,31.2k,Certificate Available,Free Online Course,574 reviews
1,Machine Learning,https://www.classcentral.com/course/machine-le...,Stanford University via Coursera,Computer Science\n\n\n\n\nMachine Learning,62.3k,Paid Certificate Available,Free Online Course (Audit),369 reviews
2,Unlocking Information Security: Part Ⅰ,https://www.classcentral.com/course/edx-unlock...,Tel Aviv University via edX,Computer Science\n\n\n\n\nCybersecurity,3.5k,"5 weeks long, 4-6 hours a week",Free Online Course (Audit),151 reviews
3,Introduction to Computer Science and Programmi...,https://www.classcentral.com/course/edx-introd...,Massachusetts Institute of Technology via edX,Computer Science,16.4k,$75 Certificate Available,Free Online Course (Audit),126 reviews
4,CS50's Introduction to Computer Science,https://www.classcentral.com/course/edx-cs50-s...,Harvard University via edX,Computer Science,26.6k,$90 Certificate Available,Free Online Course (Audit),96 reviews


In [4]:
# Split the raw category text once: the first piece is the broad category and
# the last piece is the subcategory (both are the same piece when the text has
# no line breaks).
category_parts = df["cat"].str.split("\n")
df["subCat"] = category_parts.str[-1].str.strip()
df["broadCat"] = category_parts.str[0].str.strip()

# Keep only the review count, e.g. "574 reviews" -> "574" and
# "1,234 reviews" -> "1234". The character class captures the whole number
# including thousands separators (the previous pattern stopped after the first
# comma), and expand=False returns a Series rather than a one-column frame.
df["review"] = (
    df["review"]
    .str.extract(r"([\d,]+)", expand=False)
    .str.replace(",", "", regex=False)
)

In [5]:
# Convert unit sales such as "31.2k" or "196" to whole numbers.
# Values carrying a "k" suffix are multiplied by 1000; missing or unparsable
# values become 0. Thousands separators are removed first, and the product is
# rounded (not truncated) so float noise cannot shave one unit off the result.
unit_text = (
    df["unitSold"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .str.strip()
    .str.lower()
)
in_thousands = unit_text.str.contains("k", regex=False)
unit_value = pd.to_numeric(
    unit_text.str.replace("k", "", regex=False), errors="coerce"
).fillna(0)
df["unitSold"] = (unit_value * np.where(in_thousands, 1000, 1)).round().astype(int)

# The provider text looks like "<university> via <platform>". Split on the
# standalone word "via" (surrounded by whitespace) so that names which merely
# contain those letters, e.g. "Latvia", are not cut in half.
provider_parts = df["courseProvider"].str.split(r"\s+via\s+")

# Collaborating university: the part before "via"
df["colaboratingUniv"] = provider_parts.str[0].str.strip()

# Course provider (platform) only: the part after the last "via"
df["courseProvider"] = provider_parts.str[-1].str.strip()

# Is the course free, paid or auditable?
# na=False keeps missing values out of every branch, so they fall through to
# the original (missing) value instead of being labelled "audit".
access_text = df["freeOrAudit"]
df["freeAuditOrPaid"] = np.where(
    access_text.str.contains("Audit", na=False), "audit",
    np.where(
        access_text.str.contains("Free", na=False), "free",
        np.where(access_text.str.contains("Paid", na=False), "paid", access_text),
    ),
)

# Is the certificate paid, priced, free or not available?
#   "Paid ..."                      -> "paid"
#   "$75 Certificate Available"     -> "$75" (price only)
#   "Certificate Available"         -> "free"
#   anything else, incl. missing    -> "na"
# "$" is matched literally (regex=False), which avoids the invalid "\$" escape.
certificate_text = df["certificatePrice"]
price_only = certificate_text.str.split("Certificate Available").str[0].str.strip()
df["certificatePrice"] = np.where(
    certificate_text.str.contains("Paid", na=False), "paid",
    np.where(
        certificate_text.str.contains("$", regex=False, na=False), price_only,
        np.where(
            certificate_text.str.contains("Certificate Available", na=False),
            "free",
            "na",
        ),
    ),
)

# Drop the raw category column and order the courses by unit sales, best
# sellers first (rebinding `df` instead of mutating it in place).
df = df.drop(columns="cat").sort_values("unitSold", ascending=False)

#### Read in Skill Up's data:

In [6]:
# Load the Skill Up export into `skill` and preview the first rows
skillup_file = "skillUp.xlsx"
skill = pd.read_excel(skillup_file)
skill.head()

Unnamed: 0,date,courseId,courseTitle,keyword,subTitle,courseLink,courseProvider,soldOrEnq,category,broadCategory1,...,cpdAccreditedBy,othersAsCpd,awrBodyName,awrBodyQualName,courseLevel,savings,newOfferPrice,unitSold,offerPrice,savingsPercent
0,18_Aug,277314,Animal Care and Pet First Aid - 5 Courses Bundle,animal care,Special Bundle Offer | Accredited by CPD | 13 ...,https://www.reed.co.uk/courses/animal-care-and...,Skill Up,3 students purchased this course,"Animal care, Veterinary, Animal care, Dog care",Animal care,...,,,,,,,,3,39,96
1,18_Aug,273817,Dog Trainer - 8 Courses Complete Bundle,dog trainer|dog training|dog|dog walking|raw d...,Special Bundle Offer | Accredited by CPD | 40 ...,https://www.reed.co.uk/courses/dog-trainer-8-c...,Skill Up,11 students purchased this course,"Animal care, Dog training, Animal care, Dog tr...",Animal care,...,,,,,,,,11,49,94
2,18_Aug,273884,Photography Bundle for Professional Photographer,photography|wedding photography,Special Bundle Offer | Accredited by CPD | 45 ...,https://www.reed.co.uk/courses/photography-bun...,Skill Up,5 students purchased this course,"Media and art, Photography, Recreational, Phot...",Media and art,...,,,,,,,,5,49,94
3,18_Aug,277102,BARF - Feed Your Dog A Raw Diet,dog trainer|dog training|dog|dog walking|raw d...,Accredited by CPD | 3 CPD Points | Video train...,https://www.reed.co.uk/courses/barf-feed-your-...,Skill Up,5 students purchased this course,"Animal care, Dog agility, Animal care, Dog wal...",Animal care,...,,,,,,,,5,10,95
4,18_Aug,277218,Reiki Diploma - Level 1 to Master Level Certif...,reiki,Accredited by CPD | 4 CPD Points | Video train...,https://www.reed.co.uk/courses/reiki-diploma-l...,Skill Up,10 students purchased this course,"Health & care, Alternative medicine, Reiki",Health & care,...,,,,,,,,10,10,95


In [7]:
# This function matches a Skill Up keyword against the Class Central course titles
def matchByKeyword(keyword, courses=None, skills=None, top_n=10):
    """Pair the Skill Up rows for a keyword with the Class Central titles it matches.

    Parameters
    ----------
    keyword : str
        Value from the Skill Up ``keyword`` column. It may hold several
        alternatives separated by ``|`` (e.g. ``"dog trainer|dog training"``);
        a title matches when it contains any alternative as a whole word or
        phrase, ignoring case.
    courses : pandas.DataFrame, optional
        Frame with a ``courseTitle`` column to search. Defaults to the
        notebook-level ``df``.
    skills : pandas.DataFrame, optional
        Frame with a ``keyword`` column. Defaults to the notebook-level
        ``skill``.
    top_n : int, default 10
        Maximum number of matching ``courses`` rows to keep; the first
        ``top_n`` rows in the order of ``courses`` are used.

    Returns
    -------
    pandas.DataFrame or None
        The ``skills`` rows whose keyword equals ``keyword`` followed by the
        matching ``courses`` rows (with a ``keyword`` column inserted first),
        or ``None`` when the keyword is missing/empty or either side has no
        rows.
    """
    # Fall back to the notebook-level frames so existing one-argument calls work
    if courses is None:
        courses = df
    if skills is None:
        skills = skill

    # A missing keyword (NaN) would otherwise be searched as the text "nan"
    if not isinstance(keyword, str):
        return None

    # Escape each alternative so characters such as "+" or "(" are matched
    # literally instead of being read as regex syntax
    terms = [re.escape(term.strip()) for term in keyword.split("|") if term.strip()]
    if not terms:
        return None

    # The lookarounds behave like \b for ordinary words but also work for terms
    # that start or end with punctuation; the group is non-capturing so pandas
    # does not warn about match groups
    pattern = r"(?<!\w)(?:" + "|".join(terms) + r")(?!\w)"

    # Search the course titles; na=False treats missing titles as "no match"
    title_hits = courses["courseTitle"].str.contains(
        pattern, case=False, regex=True, na=False
    )

    # Copy the slice before adding a column to it
    course_rows = courses[title_hits].head(top_n).copy()
    course_rows.insert(loc=0, column="keyword", value=keyword)

    # Skill Up rows that carry exactly this keyword
    skill_rows = skills[skills["keyword"] == keyword]

    # Only report keywords that are present on both sides
    if course_rows.empty or skill_rows.empty:
        return None
    return pd.concat([skill_rows, course_rows])

In [8]:
# Run the matcher once per distinct, non-missing Skill Up keyword
keywords = skill["keyword"].dropna().unique()

# Keywords without a match on both sides return None; keep only real frames so
# the concatenation cannot fail when nothing matched at all
matched_frames = [
    frame for frame in map(matchByKeyword, keywords) if frame is not None
]
matched = (
    pd.concat(matched_frames).reset_index(drop=True)
    if matched_frames
    else pd.DataFrame()
)

# Preview up to 10 random rows; the fixed seed makes the preview reproducible
# and the size cap avoids an error when fewer than 10 rows were matched
matched.sample(min(10, len(matched)), random_state=42)

Unnamed: 0,date,courseId,courseTitle,keyword,subTitle,courseLink,courseProvider,soldOrEnq,category,broadCategory1,...,unitSold,offerPrice,savingsPercent,certificatePrice,freeOrAudit,review,subCat,broadCat,colaboratingUniv,freeAuditOrPaid
277,,,Stress Management,cbt|stress,,https://www.classcentral.com/course/swayam-str...,Swayam,,,,...,196,,,paid,Free Online Course,0.0,Self Improvement,Personal Development,"Indian Institute of Technology, Kharagpur and ...",free
146,,,Character Setup and Animation,animation,,https://www.classcentral.com/course/character-...,Coursera,,,,...,122,,,paid,Free Online Course (Audit),0.0,Unity,Programming,Unity,audit
253,18_Aug,276951.0,American Sign Language (ASL) Course,American Sign Language|asl,Accredited by CPD | 10 CPD Points | Exam inclu...,https://www.reed.co.uk/courses/american-sign-l...,Skill Up,Tutor is available to students,"Language, Sign language, Health & care, Care, ...",Language,...,0,10.0,95.0,,,,,,,
188,,,Project Management: The Basics for Success,leadership|management,,https://www.classcentral.com/course/basicprojm...,Coursera,,,,...,16800,,,paid,Free Online Course (Audit),25.0,Project Management,Business,"University of California, Irvine",audit
161,,,Course 3: Strategic Self-Marketing and Persona...,branding,,https://www.classcentral.com/course/strategic-...,Coursera,,,,...,157,,,paid,Free Online Course (Audit),0.0,Branding,Business,State University of New York,audit
411,,,Front-End Web Development with React,App Development|react|redux,,https://www.classcentral.com/course/front-end-...,Coursera,,,,...,621,,,paid,Free Online Course (Audit),1.0,React,Programming,The Hong Kong University of Science and Techno...,audit
282,,,Software Processes and Agile Practices,agile,,https://www.classcentral.com/course/software-p...,Coursera,,,,...,3200,,,paid,Free Online Course (Audit),8.0,Agile,Programming,University of Alberta,audit
267,,,Social Media Marketing,Social Media Marketing,,https://www.classcentral.com/course/social-med...,Coursera Specialization,,,,...,337,,,free,Paid Course,1.0,Social Media Marketing,Business,Northwestern University,paid
286,,,Agile Meets Design Thinking,agile,,https://www.classcentral.com/course/uva-darden...,Coursera,,,,...,2100,,,paid,Free Online Course (Audit),1.0,Agile,Programming,University of Virginia,audit
252,,,Construct Stock Market Indices,Stock Market,,https://www.classcentral.com/course/construct-...,Coursera,,,,...,21,,,paid,Paid Course,0.0,Trading,Business,Coursera Project Network,paid
