In [1]:
import json
import pandas as pd
import numpy as np
import random

In [2]:
def load_json(provider_name):
    data = []
    with open(f'./raw_data/provider.{provider_name}.json') as json_file:
        for line in json_file:
            data.append(json.loads(line))
    return data

# credential_engine

In [3]:
cg_json = load_json('credential_engine')

In [4]:
cg_json[5]

{'_id': {'$oid': '5ccc9f44edc27e33d16cdc30'},
 'ce_id': 'ce-ed18f860-0603-4e5b-9992-60649a509001',
 'description': 'The Energy Industry Fundamentals course, for which the credential is based, takes a comprehensive look at the energy industry, including nuclear, natural gas, and renewable. Instructor guides and student materials are provided as part of the program, focusing on experiential learning techniques with a comprehensive online assessment at the end of the course. Students will be required to pass the assessment to receive the certificate.\nSince CEWD is a coalition made up of energy companies, this credential is being created by the industry for the industry. This ensures that students are learning the right material that will help them succeed at their jobs.\nEnergy Industry Fundamentals is divided into modules. For those who want to offer the opportunity for their students to earn the credentials, all modules must be completed. This may not be the case in all situations, so 

In [5]:
cg = pd.DataFrame({'title': [course['title'] for course in cg_json], 
              'description': [course['description'] for course in cg_json],
              'skill': [course['skill'] for course in cg_json]})

In [6]:
cg.shape

(363, 3)

In [7]:
cg.head(10)

Unnamed: 0,title,description,skill
0,Certified Safety Professional (CSP),The purpose of the recertification program is ...,[]
1,"HOSP 215, Front Office",This course presents a systematic approach to ...,[]
2,Computer Science Program at Texas Luthern Univ...,"We believe the analysis of problems, the desig...",[]
3,Bachelor of Science in Information Technology ...,Brandman University's Bachelor of Science in I...,[]
4,Pharmacy Technician Certification Board Certif...,Certificants must complete a minimum of twenty...,[]
5,Energy Industry Fundamentals Certificate,"The Energy Industry Fundamentals course, for w...",[Energy Industry]
6,Insurance Studies Certificate,The Certificate of Completion (CCL) in Insuran...,[Insurance Industry Knowledge]
7,Bachelor of Science in Information Technology ...,This program consists of the following courses...,[]
8,Respiratory Care Associate of Science,Students are admitted to the Respiratory Care ...,[]
9,BOC Level II Training Certificate of Completion,To earn a Level II Training Certificate of Com...,[]


In [8]:
cg['skill'] = cg['skill'].map(lambda x: x[0])

In [9]:
def empty_to_null(x):
    if x == '':
        return np.nan
    else:
        return x

In [10]:
cg['skill'] = cg['skill'].apply(empty_to_null)

In [11]:
cg['skill'].isnull().sum()

273

In [12]:
cg.shape

(363, 3)

In [13]:
cg_labeled = cg[cg['skill'].notna()]

In [14]:
cg_labeled

Unnamed: 0,title,description,skill
5,Energy Industry Fundamentals Certificate,"The Energy Industry Fundamentals course, for w...",Energy Industry
6,Insurance Studies Certificate,The Certificate of Completion (CCL) in Insuran...,Insurance Industry Knowledge
10,BOC Level I Training Certificate of Completion,Level I classes support the BOC operator skill...,HVAC
11,Official (ISC)² CISSP Training,Our training helps you fully prepare for this ...,Cybersecurity Strategy
12,NSF Certificate in Food Safety Programs,"NSF International, NSF Training & Education Se...",Food Safety
13,Certified Food Safety Manager Training,Certified Managers have a responsibility to sh...,Food Safety
42,Professional Development Certificate in SQL Se...,An 11-week in-depth program designed to provid...,Microsoft SQL
44,.Net Web Developer Program,A 10-week comprehensive program covering web d...,.NET
47,Graduate Certificate in Cybersecurity - Critic...,The Cybersecurity area of study blends both in...,Infrastructure Security Fundamentals
49,Welding & Fabrication Industry Certification P...,The Monroe Community College Career Welding & ...,MIG (PULSE) Welding


# edx-courses

In [15]:
ec_json = load_json('edx-courses')

In [16]:
ec_json[14]

{'_id': {'$oid': '5c638595edc27e865ed5380f'},
 'title': 'Calculus 1B: Integration',
 'description_short': 'Discover the integral - what it is and how to compute it. See how to use calculus to model real world phenomena. Part 2 of 3.',
 'description_full': '<p>How long should the handle of your spoon be so that your fingers do not burn while mixing chocolate fondue? Can you find a shape that has finite volume, but infinite surface area? How does the weight of the rider change the trajectory of a zip line ride? These and many other questions can be answered by harnessing the power of the integral.</p>\r\n<p>But what is an integral? You will learn to interpret it geometrically as an area under a graph, and discover its connection to the derivative. You will encounter functions that you cannot integrate without a computer and develop a big bag of tricks to attack the functions that you can integrate by hand. The integral is vital in engineering design, scientific analysis, probability and 

In [17]:
ec = pd.DataFrame({'title': [course['title'] for course in ec_json], 
              'description': [course['description_full'] for course in ec_json],
              'skill': [course['skills'] for course in ec_json]})

In [18]:
ec.shape

(1406, 3)

In [19]:
ec.head(10)

Unnamed: 0,title,description,skill
0,Evolution of the Human Sociality: A Quest for ...,"<p>Through the process of evolution, animals h...",
1,The Chemistry of Life,<p>Chemistry and biology are traditionally tau...,
2,The Extremes of Life: Microbes and Their Diver...,<p>Life on our planet is diverse. While we can...,
3,应对气候变化的中国视角 | China’s Perspective on Climate C...,<p>This course will provide China’s perspectiv...,
4,Introduction to Statistical Methods for Gene M...,<p>This data course is a primer to statistical...,
5,Chinese History From Warring States to the Tan...,<p><em>Zizhi Tongjian _is one of the earliest ...,
6,文物精品与文化中国：农业与制造业 | Relics in Chinese History -...,<p>Chinese archaeology is one of the fastest g...,
7,文物精品与文化中国：天文与医药 | Relics of Chinese History - ...,<p>Chinese archaeology is one of the fastest-g...,
8,文物精品与文化中国：文字与乐礼 | Relics of Chinese History - ...,<p>Chinese archaeology is one of the fastest-g...,
9,Introduction to Mao Zedong Thought | 毛泽东思想概论,<p>With the world’s biggest population and sec...,


In [20]:
ec['skill'].isnull().sum()

0

In [21]:
ec['skill'] = ec['skill'].apply(empty_to_null)

In [22]:
ec.head()

Unnamed: 0,title,description,skill
0,Evolution of the Human Sociality: A Quest for ...,"<p>Through the process of evolution, animals h...",
1,The Chemistry of Life,<p>Chemistry and biology are traditionally tau...,
2,The Extremes of Life: Microbes and Their Diver...,<p>Life on our planet is diverse. While we can...,
3,应对气候变化的中国视角 | China’s Perspective on Climate C...,<p>This course will provide China’s perspectiv...,
4,Introduction to Statistical Methods for Gene M...,<p>This data course is a primer to statistical...,


In [23]:
ec['skill'].isnull().sum()

1062

In [24]:
ec.shape

(1406, 3)

In [25]:
# take the skill out of the list for labeled entries
ec.loc[ec['skill'].notna(),['skill']] = ec[ec['skill'].notna()]['skill'].map(lambda x: x[0])

In [26]:
ec

Unnamed: 0,title,description,skill
0,Evolution of the Human Sociality: A Quest for ...,"<p>Through the process of evolution, animals h...",
1,The Chemistry of Life,<p>Chemistry and biology are traditionally tau...,
2,The Extremes of Life: Microbes and Their Diver...,<p>Life on our planet is diverse. While we can...,
3,应对气候变化的中国视角 | China’s Perspective on Climate C...,<p>This course will provide China’s perspectiv...,
4,Introduction to Statistical Methods for Gene M...,<p>This data course is a primer to statistical...,
5,Chinese History From Warring States to the Tan...,<p><em>Zizhi Tongjian _is one of the earliest ...,
6,文物精品与文化中国：农业与制造业 | Relics in Chinese History -...,<p>Chinese archaeology is one of the fastest g...,
7,文物精品与文化中国：天文与医药 | Relics of Chinese History - ...,<p>Chinese archaeology is one of the fastest-g...,
8,文物精品与文化中国：文字与乐礼 | Relics of Chinese History - ...,<p>Chinese archaeology is one of the fastest-g...,
9,Introduction to Mao Zedong Thought | 毛泽东思想概论,<p>With the world’s biggest population and sec...,


In [27]:
# replace None in description with empty str
ec.loc[ec['description'].isnull(), ['description']] = ''

In [28]:
ec['description'].isnull().sum()

0

In [29]:
ec['description'] = ec['description'].map(lambda x: x.replace('<p>', ''))
ec['description'] = ec['description'].map(lambda x: x.replace('<em>', ''))
ec['description'] = ec['description'].map(lambda x: x.replace('<strong>', ''))

In [30]:
ec_labeled = ec[ec['skill'].notna()]

In [31]:
ec_labeled.shape

(344, 3)

In [32]:
ec_labeled

Unnamed: 0,title,description,skill
14,Calculus 1B: Integration,How long should the handle of your spoon be so...,Calculus
15,Calculus 1C: Coordinate Systems & Infinite Series,How did Newton describe the orbits of the plan...,Calculus
54,Behavioural Economics in Action,"How can we get people to save more money, eat ...",Economics
68,Marketing Analytics: Marketing Measurement Str...,Begin your journey in a new career in marketin...,Marketing Analytics
69,Marketing Analytics: Price and Promotion Analy...,"In this marketing course, you will learn how t...",Marketing Analytics
70,Marketing Analytics: Competitive Analysis and ...,"In this marketing course, you will learn about...",Marketing Analytics
71,"Marketing Analytics: Products, Distribution an...","In this marketing course, you will learn how t...",Marketing Analytics
90,Supply Chain Analytics,Supply chains are complex systems involving mu...,Supply Chain Data Analysis
91,Supply Chain Fundamentals,This Supply Chain Fundamentals course is part ...,Supply Chain Knowledge
92,Supply Chain Design,This Supply Chain Design course is part of the...,Supply Chain Model Design


# futurelearn-courses

In [33]:
fl_json = load_json('futurelearn-courses')

In [34]:
fl_json[5]

{'_id': {'$oid': '5c8c4bb0edc27e807c685286'},
 'uuid': '924b1510-c2f9-47a5-802e-7f3bfc8e9fdd',
 'url': 'https://www.futurelearn.com/courses/dental-photography-in-practice?utm_campaign=Courses+feed&utm_medium=courses-feed&utm_source=courses-feed',
 'image_url': 'https://ugc.futurelearn.com/uploads/images/ab/4f/regular_ab4fe1b5-2203-48de-85fa-ad0b5275d473.jpg',
 'introduction': 'Learn how to achieve consistent and excellent results in all aspects of dental photography with this free online course.',
 'description': 'This free course will cover all aspects of the use of digital photography in dental practice: intra-oral, extra-oral, and portraits. Participants will be taken through photography, from the basics of choosing correct equipment and setting it up, to optimum settings, techniques for consistent imaging and the safe storage of images.The course is taught by the team that delivers the online masters degree course in Advanced General Dental Practice from the School of Dentistry at 

In [35]:
fl = pd.DataFrame({'title': [course['title'] for course in fl_json], 
              'description': [course['description'] for course in fl_json],
              'skill': [course['skill'] for course in fl_json]})

In [36]:
fl.shape

(931, 3)

In [37]:
fl['skill'].isnull().sum()

0

In [38]:
fl

Unnamed: 0,title,description,skill
0,"Good Brain, Bad Brain: Basics",Understand your own amazing brainWe will look ...,[]
1,Forensic Science and Criminal Justice,"Over the past two decades, the criminal justic...",[]
2,Exploring Our Ocean,Discover how you can play a part in the safe k...,[]
3,Physical Theatre: Meyerhold and Biomechanics,Discover Meyerhold’s technique of theatrical b...,[]
4,Begin Programming: Build Your First Mobile Game,Take your first steps into the world of comput...,"[Java, Mobile Development]"
5,Improving Your Image: Dental Photography in Pr...,This free course will cover all aspects of the...,[Dental Technology]
6,Medicines Adherence: Supporting Patients with ...,It is estimated that 30-50% of patients do not...,[]
7,"Corpus Linguistics: Method, Analysis, Interpre...",Learn how to build and query corpora in this p...,[Computational Linguistics]
8,Community Journalism: Digital and Social Media,Community journalism is a growing area of inte...,"[Journalism, Digital Media, Social Media]"
9,Caring for Vulnerable Children,"In times of shrinking public services, the tas...",[]


In [39]:
# because there are values that is empty string (not list)
fl['skill'] = fl['skill'].map(lambda x: x[0] if isinstance(x, list) else None)

In [40]:
fl['skill'] = fl['skill'].apply(empty_to_null)

In [41]:
fl_labeled = fl[fl['skill'].notna()]

In [42]:
fl_labeled.shape

(123, 3)

# skillsoft-courses

In [43]:
ss_json = load_json('skillsoft-courses')

In [44]:
ss_json[5]

{'_id': {'$oid': '5ca04e69edc27eb97ce53f8a'},
 'asset_id': 'oper_38_a01_bs_enus',
 'collection': 'BUSINESS STRATEGY and OPERATIONS',
 'curriculum': 'Operations Curriculum',
 'series': 'Six Sigma Black Belt (2015 BOK): Team Management',
 'link': 'https://www.skillsoft.com/courses/5411931-six-sigma-team-dynamics-roles-and-success-factors/',
 'description': "Forming an effective Six Sigma team for driving improvement projects throughout an organization is essential to Six Sigma success. Teams are vital to Six Sigma and Lean projects that have goals of improving an organization's existing quality, enhancing bottom-line performance, and reducing costs. The methods used to form and develop a Six Sigma team will have a dramatic effect on the team's overall performance. Black Belts need to proactively contribute to the effectiveness of Six Sigma teams to promote positive organizational change.  This course explores the variety of team types, roles, and composition, revealing strategies for sel

In [45]:
ss = pd.DataFrame({'title': [course['title'] for course in ss_json], 
              'description': [course['description'] for course in ss_json],
              'skill': [course['skills'] for course in ss_json]})

In [46]:
ss.shape

(464, 3)

In [47]:
ss['skill'].isnull().sum()

0

In [48]:
ss

Unnamed: 0,title,description,skill
0,Fundamentals of Lean and Six Sigma and their A...,Six Sigma is a data-driven improvement strateg...,"[Six Sigma, Six Sigma Black Belt]"
1,"Six Sigma Project Selection, Roles, and Respon...","Deployment of Six Sigma, Lean, or another cont...","[Six Sigma, Six Sigma Black Belt]"
2,Six Sigma Strategic Planning and Deployment,Strategic planning of Six Sigma projects and L...,"[Six Sigma, Six Sigma Black Belt]"
3,Impact on Stakeholders and Benchmarking for Si...,The success of Six Sigma deployment in an orga...,"[Six Sigma, Six Sigma Black Belt]"
4,Using Business and Financial Measures in Six S...,Six Sigma improvement begins with assessing th...,"[Six Sigma, Six Sigma Black Belt]"
5,"Six Sigma Team Dynamics, Roles, and Success Fa...",Forming an effective Six Sigma team for drivin...,"[Six Sigma, Six Sigma Black Belt]"
6,Six Sigma Team Facilitation and Leadership,Six Sigma Black Belts must possess specific qu...,"[Six Sigma, Six Sigma Black Belt]"
7,Six Sigma Team Dynamics and Training,Black Belts have the challenging task of manag...,"[Six Sigma, Six Sigma Black Belt]"
8,Determining Requirements by Listening to the V...,Customers are at the heart of all Six Sigma in...,"[Six Sigma, Six Sigma Black Belt]"
9,"Six Sigma Business Case, Project Charter, and ...",A project charter is the most important docume...,"[Six Sigma, Six Sigma Black Belt]"


In [49]:
ss['skill'] = ss['skill'].map(lambda x: x[0])

In [50]:
ss

Unnamed: 0,title,description,skill
0,Fundamentals of Lean and Six Sigma and their A...,Six Sigma is a data-driven improvement strateg...,Six Sigma
1,"Six Sigma Project Selection, Roles, and Respon...","Deployment of Six Sigma, Lean, or another cont...",Six Sigma
2,Six Sigma Strategic Planning and Deployment,Strategic planning of Six Sigma projects and L...,Six Sigma
3,Impact on Stakeholders and Benchmarking for Si...,The success of Six Sigma deployment in an orga...,Six Sigma
4,Using Business and Financial Measures in Six S...,Six Sigma improvement begins with assessing th...,Six Sigma
5,"Six Sigma Team Dynamics, Roles, and Success Fa...",Forming an effective Six Sigma team for drivin...,Six Sigma
6,Six Sigma Team Facilitation and Leadership,Six Sigma Black Belts must possess specific qu...,Six Sigma
7,Six Sigma Team Dynamics and Training,Black Belts have the challenging task of manag...,Six Sigma
8,Determining Requirements by Listening to the V...,Customers are at the heart of all Six Sigma in...,Six Sigma
9,"Six Sigma Business Case, Project Charter, and ...",A project charter is the most important docume...,Six Sigma


In [51]:
ss['skill'] = ss['skill'].apply(empty_to_null)

In [52]:
ss['skill'].isnull().sum()

168

In [53]:
ss_labeled = ss[ss['skill'].notna()]

In [54]:
ss_labeled.shape

(296, 3)

# combine all courses

In [55]:
courses_labeled = pd.concat([cg_labeled, ec_labeled, fl_labeled, ss_labeled], 
                   keys=['cg','ec','fl','ss'])

In [56]:
courses_labeled.shape

(853, 3)

In [57]:
courses_labeled

Unnamed: 0,Unnamed: 1,title,description,skill
cg,5,Energy Industry Fundamentals Certificate,"The Energy Industry Fundamentals course, for w...",Energy Industry
cg,6,Insurance Studies Certificate,The Certificate of Completion (CCL) in Insuran...,Insurance Industry Knowledge
cg,10,BOC Level I Training Certificate of Completion,Level I classes support the BOC operator skill...,HVAC
cg,11,Official (ISC)² CISSP Training,Our training helps you fully prepare for this ...,Cybersecurity Strategy
cg,12,NSF Certificate in Food Safety Programs,"NSF International, NSF Training & Education Se...",Food Safety
cg,13,Certified Food Safety Manager Training,Certified Managers have a responsibility to sh...,Food Safety
cg,42,Professional Development Certificate in SQL Se...,An 11-week in-depth program designed to provid...,Microsoft SQL
cg,44,.Net Web Developer Program,A 10-week comprehensive program covering web d...,.NET
cg,47,Graduate Certificate in Cybersecurity - Critic...,The Cybersecurity area of study blends both in...,Infrastructure Security Fundamentals
cg,49,Welding & Fabrication Industry Certification P...,The Monroe Community College Career Welding & ...,MIG (PULSE) Welding


In [58]:
#courses_labeled.to_csv('courses_labeled.csv')

In [59]:
courses = pd.concat([cg, ec, fl, ss], 
                   keys=['cg','ec','fl','ss'])

In [60]:
courses

Unnamed: 0,Unnamed: 1,title,description,skill
cg,0,Certified Safety Professional (CSP),The purpose of the recertification program is ...,
cg,1,"HOSP 215, Front Office",This course presents a systematic approach to ...,
cg,2,Computer Science Program at Texas Luthern Univ...,"We believe the analysis of problems, the desig...",
cg,3,Bachelor of Science in Information Technology ...,Brandman University's Bachelor of Science in I...,
cg,4,Pharmacy Technician Certification Board Certif...,Certificants must complete a minimum of twenty...,
cg,5,Energy Industry Fundamentals Certificate,"The Energy Industry Fundamentals course, for w...",Energy Industry
cg,6,Insurance Studies Certificate,The Certificate of Completion (CCL) in Insuran...,Insurance Industry Knowledge
cg,7,Bachelor of Science in Information Technology ...,This program consists of the following courses...,
cg,8,Respiratory Care Associate of Science,Students are admitted to the Respiratory Care ...,
cg,9,BOC Level II Training Certificate of Completion,To earn a Level II Training Certificate of Com...,


In [61]:
courses.to_csv('courses_all.csv')

# train-test split

In [24]:
courses = pd.read_csv('courses_labeled.csv')

In [17]:
courses.shape

(853, 5)

In [25]:
courses

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,title,description,skill
0,cg,5,Energy Industry Fundamentals Certificate,"The Energy Industry Fundamentals course, for w...",Energy Industry
1,cg,6,Insurance Studies Certificate,The Certificate of Completion (CCL) in Insuran...,Insurance Industry Knowledge
2,cg,10,BOC Level I Training Certificate of Completion,Level I classes support the BOC operator skill...,HVAC
3,cg,11,Official (ISC)² CISSP Training,Our training helps you fully prepare for this ...,Cybersecurity Strategy
4,cg,12,NSF Certificate in Food Safety Programs,"NSF International, NSF Training & Education Se...",Food Safety
5,cg,13,Certified Food Safety Manager Training,Certified Managers have a responsibility to sh...,Food Safety
6,cg,42,Professional Development Certificate in SQL Se...,An 11-week in-depth program designed to provid...,Microsoft SQL
7,cg,44,.Net Web Developer Program,A 10-week comprehensive program covering web d...,.NET
8,cg,47,Graduate Certificate in Cybersecurity - Critic...,The Cybersecurity area of study blends both in...,Infrastructure Security Fundamentals
9,cg,49,Welding & Fabrication Industry Certification P...,The Monroe Community College Career Welding & ...,MIG (PULSE) Welding


In [26]:
random.seed(0)

In [27]:
courses_shuffled = courses.sample(frac=1)

In [28]:
courses_shuffled

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,title,description,skill
513,fl,594,Environmental Challenges: Justice in Natural R...,When looking at different societies and cultur...,Natural Resource Management
592,ss,35,Using Lean to Reduce Waste and Streamline Valu...,One of the most effective ways to increase the...,Lean Methods
381,ec,1141,Applying Leadership and Strategy Fundamentals ...,This course is part of the Certified Lifestyle...,Leadership
144,ec,249,International Law,International law can be considered as the law...,Legal Industry Knowledge
835,ss,401,Prospecting: Panning for Sales Gold,The art of sales prospecting is an essential s...,Sales
639,ss,84,Identifying Risks in Your Organization,"As a manager, it's important that you understa...",Risk Management
848,ss,429,Providing Effective Internal Customer Service,When you do things to help other people within...,Customer Service
404,ec,1243,"Nuclear Energy: Science, Systems and Society","Nuclear Energy: Science, Systems and Society</...",Nuclear Energy
788,ss,339,Define and Sequence Activities (PMBOK® Guide S...,This course covers the Plan Schedule Managemen...,Project Planning and Development Skills
195,ec,464,Azure Security and Compliance,Security is a critical requirement for placing...,Microsoft Azure


In [33]:
test_size = courses.shape[0]//5

In [34]:
test_size

170

In [37]:
test = courses_shuffled[:test_size]

In [38]:
train = courses_shuffled[test_size:]

In [39]:
train

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,title,description,skill
263,ec,684,Marketing Analytics,Marketers want to understand and forecast how ...,Marketing Analytics
158,ec,316,Introduction to Economics: Microeconomics,A country’s economy consists of three major ec...,Economics
469,fl,259,Italian for Beginners 4: Likes And Dislikes,This free online course is the fourth of six i...,Italian
71,cg,254,Professional Certificate in Geographic Informa...,GIS system management requires good leadership...,Geographic Information Systems (GIS)
598,ss,41,Six Sigma: Identifying Projects,This course discusses how to capture informati...,Six Sigma
293,ec,823,Introduction to Marketing,Marketing is a crucial function in all busines...,Marketing
296,ec,846,Teamwork & Collaboration,"In today’s workplace, professionals don’t work...",Teamwork / Collaboration
450,fl,109,Psychology and Mental Health: Beyond Nature an...,Psychological therapies and the work of clinic...,Psychology
359,ec,1066,C Programming: Advanced Data Types,"In this course, part of the C Programming with...",C (Programming Language)
485,fl,386,Big Data and the Environment,Use current examples to explore the potential ...,Big Data


In [43]:
train.shape

(683, 5)

In [44]:
test.shape

(170, 5)

In [41]:
train.to_csv('train.csv')

In [42]:
test.to_csv('test.csv')