In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import requests
import json
import os

# Import core data from Coursera Catalog API

In [2]:
# Import raw JSON data from the Coursera Catalog API.
# API documentation: https://build.coursera.org/app-platform/catalog/
catalog_includes = ["instructorIds", "partnerIds"]
catalog_fields = [
  "instructorIds", "partnerIds", "courseType", "primaryLanguages",
  "subtitleLanguages", "certificates", "description", "startDate",
  "specializations", "domainTypes", "s12nIds",
]
# Assembled by hand (not via `params=`) so the commas are sent unescaped,
# exactly as in the original single-string URL.
catalog_url = (
  "https://api.coursera.org/api/courses.v1"
  "?limit=100000"
  "&includes=" + ",".join(catalog_includes) +
  "&fields=" + ",".join(catalog_fields)
)

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as catalog data.
catalog_reply = requests.get(catalog_url, timeout=60)
catalog_reply.raise_for_status()

response = catalog_reply.text          # raw JSON text
response_info = json.loads(response)   # a dictionary
# NOTE(review): assumes limit=100000 returns the whole catalog in one page - confirm.
response_info["elements"][0:5]

[{'courseType': 'v2.ondemand',
  's12nIds': [],
  'description': 'In this two-hour, project-based course, we introduce Unity\'s powerful animation tool. You\'ll learn about different animations and effects that you can apply to pick-up objects to make them more noticeable and give your game a more polished look. This project covers recording and playing simple animations, basic customization of particle effects, and playing the particle effects when certain events occur in your game.\n\nThe guided project will introduce you to the following Unity concepts:\n- Animations\n- Particle Effects\n- Triggers\n- Coding techniques such as OnTrigger events and invoking methods\n\nThis is a stand-alone guided project, but is also Part 1 in a two-part series introducing Unity\'s animation features. Part 2 will cover using the Animation Window to create trigger transitions between multiple animations on a single GameObject.\n\nThis guided project and series also serve as an optional but recommended

In [11]:
# Turn the list of course records from the API response into a DataFrame,
# then report its row count and column dtypes before previewing it.
course_records = response_info["elements"]
df = pd.DataFrame(course_records)
print(df.shape[0])
print(df.dtypes)
df.head()

9507
courseType           object
s12nIds              object
description          object
domainTypes          object
id                   object
slug                 object
instructorIds        object
specializations      object
primaryLanguages     object
partnerIds           object
certificates         object
name                 object
subtitleLanguages    object
startDate             int64
dtype: object


Unnamed: 0,courseType,s12nIds,description,domainTypes,id,slug,instructorIds,specializations,primaryLanguages,partnerIds,certificates,name,subtitleLanguages,startDate
0,v2.ondemand,[],"In this two-hour, project-based course, we int...","[{'domainId': 'computer-science', 'subdomainId...",NJSdGN71Eeq4CApSN3OTvQ,make-pick-ups-look-cool-unity-introduction-ani...,[4730641],[],[en],[565],[VerifiedCert],Make Your Pick-Ups Look Cool in Unity (Intro t...,[],1598035773172
1,v2.ondemand,[],"À la fin de ce projet, vous aurez toutes les c...","[{'domainId': 'business', 'subdomainId': 'mark...",DMkcgX7LEeyRTg6FtAvfBw,integrer-applications-dashboard-hootsuite,[26958800],[],[fr],[565],[VerifiedCert],Intégrer des applications dans votre Dashboard...,[],1644614698077
2,v2.ondemand,[],ينشئ العديد من الأشخاص حسابات وسائط اجتماعية م...,"[{'domainId': 'business', 'subdomainId': 'mark...",YLO0oGSUEeyIUg4Qv2RsBQ,getting-started-with-hootsuite-ar,[76377122],[],[ar],[565],[VerifiedCert],كيفية استعمال التطبيق هووتسويت,[],1642193091376
3,v2.ondemand,[],Gamification is the application of game elemen...,"[{'subdomainId': 'design-and-product', 'domain...",69Bku0KoEeWZtA4u62x6lQ,gamification,[226710],[],[en],[6],[VerifiedCert],Gamification,"[ar, fr, uk, pt-PT, zh-CN, it, pt-BR, vi, de, ...",1447095621493
4,v2.ondemand,[eYmv6d_7EeWOdA7Ea609SQ],This course will cover the steps used in weigh...,"[{'subdomainId': 'data-analysis', 'domainId': ...",0HiU7Oe4EeWTAQ4yevf_oQ,missing-data,[8394050],[],[en],[32],"[VerifiedCert, Specialization]",Dealing With Missing Data,[es],1471901099812


# Clean domain variable

In [12]:
# Check whether any course has more than 2 domain entries or none at all,
# report each such row, and drop it.
#   (a total of 1 observation was dropped on the recorded run)
# The counts are computed up front so the frame is never modified while it is
# being iterated. A cell that is not a list/tuple counts as 0 entries.
domain_counts = df["domainTypes"].map(
  lambda entries: len(entries) if isinstance(entries, (list, tuple)) else 0
)
invalid_domains = (domain_counts > 2) | (domain_counts < 1)

for index, row in df[invalid_domains].iterrows():
  print(index)
  print(domain_counts.loc[index])
  print(row)

# .copy() gives later cells an independent frame to add columns to.
df = df[~invalid_domains].copy()

4456
0
courseType                                                 v2.ondemand
s12nIds                                                             []
description          Learn about the inner workings of cryptographi...
domainTypes                                                         []
id                                              J2MPfuJcEeWLHw5Kc9wFbw
slug                                                           crypto2
instructorIds                                                    [774]
specializations                                                     []
primaryLanguages                                                  [en]
partnerIds                                                         [1]
certificates                                                        []
name                                                   Cryptography II
subtitleLanguages                                                   []
startDate                                                1494627509286

In [13]:
# Separate domain and subdomain.
# The first entry of `domainTypes` feeds domain1/subdomain1 and the last entry
# feeds domain2/subdomain2 (for a single-entry list these are the same entry).
# Columns are built in one pass each instead of writing cell-by-cell in iterrows.
first_entries = [entries[0] for entries in df["domainTypes"]]
last_entries = [entries[-1] for entries in df["domainTypes"]]

df["domain1"] = [entry["domainId"] for entry in first_entries]
df["subdomain1"] = [entry["subdomainId"] for entry in first_entries]
df["domain2"] = [entry["domainId"] for entry in last_entries]
df["subdomain2"] = [entry["subdomainId"] for entry in last_entries]

# Blank out the second pair when it only repeats the first:
#  - subdomain2 is cleared whenever it equals subdomain1;
#  - domain2 is cleared only when the subdomains match AND the domains match.
# Both masks are computed before any value is overwritten.
same_subdomain = df["subdomain1"] == df["subdomain2"]
same_domain = df["domain1"] == df["domain2"]
df.loc[same_subdomain & same_domain, "domain2"] = ""
df.loc[same_subdomain, "subdomain2"] = ""

# save all courses to csv
df.to_csv('courses_all.csv', index=False)
df.head()

Unnamed: 0,courseType,s12nIds,description,domainTypes,id,slug,instructorIds,specializations,primaryLanguages,partnerIds,certificates,name,subtitleLanguages,startDate,domain1,subdomain1,domain2,subdomain2
0,v2.ondemand,[],"In this two-hour, project-based course, we int...","[{'domainId': 'computer-science', 'subdomainId...",NJSdGN71Eeq4CApSN3OTvQ,make-pick-ups-look-cool-unity-introduction-ani...,[4730641],[],[en],[565],[VerifiedCert],Make Your Pick-Ups Look Cool in Unity (Intro t...,[],1598035773172,computer-science,software-development,,
1,v2.ondemand,[],"À la fin de ce projet, vous aurez toutes les c...","[{'domainId': 'business', 'subdomainId': 'mark...",DMkcgX7LEeyRTg6FtAvfBw,integrer-applications-dashboard-hootsuite,[26958800],[],[fr],[565],[VerifiedCert],Intégrer des applications dans votre Dashboard...,[],1644614698077,business,marketing,computer-science,design-and-product
2,v2.ondemand,[],ينشئ العديد من الأشخاص حسابات وسائط اجتماعية م...,"[{'domainId': 'business', 'subdomainId': 'mark...",YLO0oGSUEeyIUg4Qv2RsBQ,getting-started-with-hootsuite-ar,[76377122],[],[ar],[565],[VerifiedCert],كيفية استعمال التطبيق هووتسويت,[],1642193091376,business,marketing,business,business-strategy
3,v2.ondemand,[],Gamification is the application of game elemen...,"[{'subdomainId': 'design-and-product', 'domain...",69Bku0KoEeWZtA4u62x6lQ,gamification,[226710],[],[en],[6],[VerifiedCert],Gamification,"[ar, fr, uk, pt-PT, zh-CN, it, pt-BR, vi, de, ...",1447095621493,computer-science,design-and-product,business,marketing
4,v2.ondemand,[eYmv6d_7EeWOdA7Ea609SQ],This course will cover the steps used in weigh...,"[{'subdomainId': 'data-analysis', 'domainId': ...",0HiU7Oe4EeWTAQ4yevf_oQ,missing-data,[8394050],[],[en],[32],"[VerifiedCert, Specialization]",Dealing With Missing Data,[es],1471901099812,data-science,data-analysis,social-sciences,governance-and-society


# Filter domain to select DS courses

In [15]:
# Show how many courses fall under each primary domain.
domain_tally = df['domain1'].value_counts()
print(domain_tally)

# Keep a course when either of its domains is data science, then renumber rows.
is_ds_course = (df['domain1'] == "data-science") | (df['domain2'] == "data-science")
df = df[is_ds_course].reset_index(drop=True)

# Write the data-science subset to disk and preview it.
df.to_csv('courses_raw.csv', index=False)
df.head()

Unnamed: 0,courseType,s12nIds,description,domainTypes,id,slug,instructorIds,specializations,primaryLanguages,partnerIds,certificates,name,subtitleLanguages,startDate,domain1,subdomain1,domain2,subdomain2
0,v2.ondemand,[eYmv6d_7EeWOdA7Ea609SQ],This course will cover the steps used in weigh...,"[{'subdomainId': 'data-analysis', 'domainId': ...",0HiU7Oe4EeWTAQ4yevf_oQ,missing-data,[8394050],[],[en],[32],"[VerifiedCert, Specialization]",Dealing With Missing Data,[es],1471901099812,data-science,data-analysis,social-sciences,governance-and-society
1,v2.ondemand,[],"In this 1-hour long project-based course, we w...","[{'subdomainId': 'data-analysis', 'domainId': ...",zjIfbv3pEeqRMRJTGi3b6Q,cluster-analysis-rcmdr,[70333041],[],[en],[565],[VerifiedCert],Cluster Analysis using RCmdr,[],1601055478420,data-science,data-analysis,,
2,v2.ondemand,[],"In this 2-hour long project-based course, you ...","[{'subdomainId': 'data-analysis', 'domainId': ...",LuXbfNnCEeq3nQoreMGNOQ,performing-data-aggregation-using-sql-aggregat...,[40353561],[],[en],[565],[VerifiedCert],Performing Data Aggregation using SQL Aggregat...,[],1597188408088,data-science,data-analysis,information-technology,data-management
3,v2.ondemand,[],Machine learning (ML) is one of the fastest gr...,"[{'subdomainId': 'machine-learning', 'domainId...",GHxiT9MGEemYpxKNJIfVmA,aws-machine-learning,[37525971],[],[en],[338],[VerifiedCert],Getting Started with AWS Machine Learning,"[ar, fr, pt-PT, it, vi, de, ru, es]",1568841469010,data-science,machine-learning,computer-science,software-development
4,v2.ondemand,[SIs8zE87EemPhBLbT0zqZA],This course is designed to quite literally ‘ma...,"[{'subdomainId': 'data-analysis', 'domainId': ...",hwvdsk86Eem0lhKc4vCNig,social-network-analysis,[1503466],[],[en],[83],"[VerifiedCert, Specialization]",Social Network Analysis,"[ar, fr, pt-PT, it, vi, de, ru, es]",1576005639598,data-science,data-analysis,social-sciences,education


# Instructor and partner data

In [24]:
# Instructor records linked in the catalog response, with clearer column names.
# NOTE(review): these come straight from response_info and are not filtered to
# the data-science courses, despite the "_DS" file name - confirm this is intended.
instructor_renames = {"fullName": "instructorName", "id": "instructorId"}
instructors = pd.DataFrame(response_info['linked']['instructors.v1']).rename(
  columns=instructor_renames
)

# save instructors
instructors.to_csv('instructors_DS.csv', index=False)
print(instructors.head())

# Partner (institution) records linked in the catalog response, renamed likewise.
partner_renames = {
  "name": "partnerName",
  "id": "partnerId",
  "shortName": "partnerShortName",
}
partners = pd.DataFrame(response_info['linked']['partners.v1']).rename(
  columns=partner_renames
)

# save partners
partners.to_csv('partners_DS.csv', index=False)
print(partners.head())

                 instructorName instructorId
0                   Brian Walsh     77620729
1                   Emily Ellis      3259066
2  Harold P. Lehmann, MD, Ph.D.       678654
3                   Brian Casey     29515160
4      Vicente Hinojosa Alarcón      2636979
                                         partnerName partnerId  \
0  Korea Advanced Institute of Science and Techno...       178   
1             Check Point Software Technologies Ltd.       636   
2                                 Capitals Coalition       994   
3                                    DeepLearning.AI       475   
4    The University of North Carolina at Chapel Hill        77   

    partnerShortName  
0              kaist  
1         checkpoint  
2  capitalscoalition  
3    deeplearning-ai  
4                unc  


# Scrape advanced data from course pages

In [17]:
from bs4 import BeautifulSoup
import re

# Columns filled in by scraping each course page; every one starts out as None.
SCRAPED_COLUMNS = [
  "stars", "ratings_count", "num_enrolled", "difficulty", "length",
  "ins_stars", "ins_ratings_count", "ins_learners_count", "ins_courses_count",
  "content_rating", "content_ratings_count",
]
# Seconds to wait for a course page before giving up on it.
REQUEST_TIMEOUT = 30


def _to_int(text):
  """Parse an integer that may contain thousands separators (e.g. '1,234').

  Returns None when `text` is not a valid integer after removing commas.
  """
  try:
    return int(text.replace(',', ''))
  except ValueError:
    return None


def _to_float(text):
  """Parse a float; return None when `text` is not a valid number."""
  try:
    return float(text)
  except ValueError:
    return None


def _strong_int(container):
  """Return the integer in the first <strong> inside `container`, else None."""
  if container is None:
    return None
  strong = container.find("strong")
  if strong is None:
    return None
  return _to_int(strong.text)


def _parse_course_page(doc):
  """Extract the scraped fields from a parsed course page.

  Parameters:
    doc: BeautifulSoup document of a course page.

  Returns:
    dict keyed by SCRAPED_COLUMNS. A field is None when its element is not
    found on the page or its text cannot be parsed as a number.

  The fixed slice offsets below strip label text around each number and are
  tied to the page markup at scrape time (e.g. the last 5 characters of the
  star rating are presumably the word 'stars').
  """
  fields = dict.fromkeys(SCRAPED_COLUMNS)

  # course star rating and number of ratings
  stars_tag = doc.find("span", attrs={"data-test": "number-star-rating"})
  if stars_tag is not None:
    fields["stars"] = _to_float(stars_tag.text[0:-5])

  ratings_text = doc.find(string=re.compile('ratings$'))
  if ratings_text is not None:
    fields["ratings_count"] = _to_int(ratings_text[0:-8])

  # number of enrolled students (first <span> inside the enrolment box)
  enrolled_box = doc.find("div", attrs={"class": "_1fpiay2"})
  enrolled_span = enrolled_box.find("span") if enrolled_box is not None else None
  if enrolled_span is not None:
    fields["num_enrolled"] = _to_int(enrolled_span.text[0:-17])

  # difficulty level (kept as the raw label text)
  level_tag = doc.find(class_="_16ni8zai", string=re.compile(' Level$'))
  if level_tag is not None:
    fields["difficulty"] = level_tag.text

  # course length: first run of digits in the "... hours to complete" text
  hours_text = doc.find(string=re.compile('hours to complete$'))
  if hours_text is not None:
    first_number = re.search(r'\d+', hours_text)
    if first_number is not None:
      fields["length"] = int(first_number.group())

  # instructor rating and number of ratings
  ins_stars_tag = doc.find("span", class_="avg-instructor-rating__total")
  if ins_stars_tag is not None:
    fields["ins_stars"] = _to_float(ins_stars_tag.text[0:-2])

  ins_ratings_tag = doc.find("span", class_="avg-instructor-rating__ratings-count")
  if ins_ratings_tag is not None:
    fields["ins_ratings_count"] = _to_int(ins_ratings_tag.text[1:-8])

  # instructor learners and courses count
  fields["ins_learners_count"] = _strong_int(doc.find("div", class_="learners-count"))
  fields["ins_courses_count"] = _strong_int(doc.find("div", class_="courses-count"))

  # content rating (kept as the raw text shown on the page)
  content_tag = doc.find("span", class_="expertise-rating__average-rating")
  if content_tag is not None:
    fields["content_rating"] = content_tag.text

  content_box = doc.find("span", class_="expertise-rating p-l-1")
  if content_box is not None:
    count_span = content_box.find("span", string=re.compile(r'ratings\)$'))
    if count_span is not None:
      fields["content_ratings_count"] = _to_int(count_span.text[1:-9])

  return fields


# initialize new columns
for column in SCRAPED_COLUMNS:
  df[column] = None

# iterate through all courses
for index, row in df.iterrows():

  # set up html for scraping
  slug = row['slug']
  print(index, slug)
  url = "https://www.coursera.org/learn/" + slug
  try:
    page = requests.get(url, timeout=REQUEST_TIMEOUT)
    page.raise_for_status()
  except requests.RequestException as error:
    # One unreachable page should not abort the whole crawl; its scraped
    # columns simply stay None.
    print("  could not fetch page:", error)
    continue
  doc = BeautifulSoup(page.text, "html.parser")

  for column, value in _parse_course_page(doc).items():
    df.loc[index, column] = value


df.head(10)

0 missing-data
1 cluster-analysis-rcmdr
2 performing-data-aggregation-using-sql-aggregate-functions
3 aws-machine-learning
4 social-network-analysis
5 computer-vision-basics
6 analyze-data-seaborn-python
7 code-free-data-science
8 accounting-analytics
9 extract-transform-and-load-data
10 advanced-ai-techniques-for-the-supply-chain
11 sequence-models-tensorflow-gcp
12 seeking-investment-alpha-ru
13 googlecloud-bracketology-with-google-machine-learning-5ytsd
14 data-analysis-python-pandas-ar
15 introduction-to-virtual-networks-in-microsoft-azure
16 introduction-to-eda-in-r
17 object-detection-facebook-detectron2
18 visualization-for-statistical-analysis
19 r-programming-ar
20 introduction-microsoft-excel
21 business-valuation-approaches
22 text-classification-in-r
23 googlecloud-build-an-end-to-end-data-capture-pipeline-using-document-ai-wiisg
24 data-collection-analytics-project-es
25 app-machine-learning-spark-synapse-analytics
26 autoencoders-image-denoising
27 build-an-anomaly-detect

215 launching-machine-learning
216 support-vector-machines-in-python
217 ibm-exploratory-data-analysis-for-machine-learning
218 anova-and-experimental-design
219 smart-analytics-machine-learning-ai-gcp
220 curso-completo-data-science
221 python-social-network-analysis-ko
222 exploratory-data-analysis-in-r
223 automl-computer-vision-microsoft-custom-vision
224 hypothesis-testing-in-r
225 tidy-messy-data-using-tidyr-in-r
226 calculus-through-data-and-modelling-applying-differentiation
227 deep-neural-network
228 nlp-sequence-models-fr
229 sql-data-science
230 compstatsintro
231 excel-intermediate-1-ar
232 career-guide-and-interview-prep-for-data-analyst-pc
233 excel-intermediate-1-fr
234 python-data-ru
235 data-analysis-with-python
236 excel-business-forecasting-time-series
237 ai2
238 mcmc
239 openvino-building-a-crossroad-ai-camera
240 getting-started-tensorflowjs-tensorflow-web-applications-machine-learning-python
241 algoritmos-de-negociacion-basados-en-machine-learning
242 googleclo

425 data-manipulation-with-dplyr-in-r
426 regular-expressions-in-python
427 mlops-fundamentals
428 managing-data-analysis
429 gcp-big-data-ml-fundamentals
430 machine-learning-asset-management-alternative-data
431 diabetes-prediction-with-pyspark-mllib
432 communicating-business-analytics-results
433 analytics-information-capstone
434 introduction-al-analisis-de-datos
435 google-data-studio-rapport
436 transiao-energtica-e-ecologica-em-paises-do-sul
437 managing-describing-analyzing-data
438 ethical-frameworks-action
439 design-thinking-predictive-analytics-data-products
440 ml-classification
441 genome-analysis-hierarchical-clustering
442 healthcare-data-models
443 python-pandas-merge-sort-filter
444 genomic-tools
445 como-usar-funciones-mtodos-y-bucles-en-python-desde-cero
446 ml-safety-stock
447 custom-models-layers-loss-functions-with-tensorflow
448 the-total-data-quality-framework
449 open-source-tools-for-data-science
450 linear-algebra-machine-learning
451 data-analyze-visualize

627 modelos-sagemaker
628 unsupervised-algorithms-in-machine-learning
629 get-started-with-r-markdown
630 dna-sequencing
631 data-driven-process-improvement
632 improving-statistical-questions
633 clinical-natural-language-processing
634 logistic-regression-numpy-python
635 extraire-des-donnes-de-texte-avec-python-et-regex
636 beginners-guide-to-aws-identity-and-access-management
637 contabilidad-como-herramienta-gerencial
638 data-analysis-with-python-ru
639 creacion-chatbot-azure
640 measurement-turning-concepts-data
641 smartpls-advanced-regression
642 getting-started-data-analytics-aws
643 python-for-applied-data-science-ai-ru
644 serverless-data-processing-with-dataflow-foundations-es
645 titanic-survival-prediction-using-machine-learning
646 reverse-and-complement-nucleic-acid-sequences-using-python
647 applied-data-science-capstone-ru
648 data-collection-framework
649 chatbot-composer
650 geospatial-covid19-python
651 chances-probability-uncertainty-statistics
652 excel-data-ana

836 applied-calculus-with-python
837 styles-html-css
838 bot-luis
839 image-segmentation-python-unsupervised-learning
840 sql-date-time-functions
841 working-big-data
842 ai-for-drug-discovery
843 create-geovisualizations-tableau
844 googlecloud-data-studio-qwik-start-bc935
845 using-tensorflow-image-style-transfer
846 ibm-ai-workflow-feature-engineering-bias-detection
847 join-tables-sql-query-libreoffice-base
848 curso-completo-spark-databricks
849 investment-risk-management
850 deep-learning-with-pytorch-build-an-autoencoder
851 where-why-and-how-of-lambda-functions-in-python
852 ferramentas-para-ciencia-de-dados-introducao-ao-r
853 fundamentals-of-data-analysis
854 aprenda-ia-con-ibm-watson
855 regresion-logistica-con-numpy-python
856 grab-data-fast-with-vertical-and-horizontal-lookup-in-google-sheets
857 analiza--tu-mercado-con-python
858 build-decision-trees-svms-neural-networks
859 data-analytics-business
860 decision-tree-random-forest-classification-julia
861 instagram-fake-pr

1046 sistemas-difusos
1047 ml-powerbi-ventas-retail
1048 primeros-pasos-sql
1049 crash-course-on-interactive-data-visualization-with-plotly
1050 analisis-data-dengan-pemrograman-r
1051 cdss3
1052 serverless-machine-learning-gcp-de
1053 tirer-parti-des-listes-pour-un-code-python-plus-simple
1054 googlecloud-cloud-operations-for-gke-hhhly
1055 neural-network-tensorflow
1056 convolutional-neural-networks
1057 nazaruh-eamatan-ean-albayanat-almusawarat-biastikhdam-tableau
1058 scatter-plot-data-scientists-big-data-analysts-visuals
1059 analyze-text-data-yellowbrick
1060 introduction-to-embedded-machine-learning
1061 precalculus-relations-functions
1062 time-series-analysis-arima-with-r
1063 automated-reasoning-sat
1064 excel-intermediate-1
1065 python-per-la-data-science
1066 r-programming
1067 introduction-to-data-analytics
1068 data-analytics-introduction
1069 responsible-data-analysis
1070 datavisualization
1071 tensorflow-beginner-predicting-house-prices-regression
1072 data-analytics-p

1253 python-for-data-analysis-numpy
1254 python-for-data-visualization-pt
1255 supply-chain-analytics-essentials
1256 superhero-tensorflow
1257 googlecloud-running-a-spark-application-with-opencv-on-cloud-dataproc-bt9fu
1258 creacion-de-aplicaciones-de-ia-con-las-api-de-watson
1259 fundamentals-machine-learning-in-finance
1260 business-data-ru
1261 generating-new-recipes-python
1262 tables-and-illustrations-in-ms-excel
1263 estadistica-aplicada-fundamentos
1264 basic-data-processing-visualization-python
1265 mining-medical-data
1266 clustering-geolocation-data-intelligently-python
1267 gcp-big-data-ml-fundamentals-br
1268 googlecloud-answering-complex-questions-using-native-derived-tables-with-l-msb9l
1269 neural-networks-deep-learning-ru
1270 excel-intermediate-1-es
1271 googlecloud-apis-explorer-cloud-storage-vyl1r
1272 machine-learning-h2o
1273 analisis-exploratorio-de-datos-con-python-y-pandas
1274 googlecloud-getting-started-with-splunk-cloud-gdi-on-google-cloud-6x1wa
1275 linear-

1457 python-for-applied-data-science-ai
1458 data-science-as-a-field
1459 microsoft-azure-databricks-for-data-engineering
1460 analyze-visitors-google-analytics-segments
1461 introduccion-algoritmos-regresion
1462 ml-basics-kaggle-competition
1463 finding-bibliography-metrics-using-crossref-api
1464 ciencia-computacao-python-conceitos
1465 shiny-to-plot-differential-gene-expression
1466 draw-insights-with-crosstabs-reports-google-sheets
1467 create-interactive-graphs-tableau
1468 ml-models-human-in-the-loop-pipelines
1469 googlecloud-build-a-project-tracking-app-with-appsheet-2pgwd
1470 probability-intro-ko
1471 statistical-analysis-hypothesis-testing-sas
1472 simulation-covid-testing-rsimmer
1473 create-relational-database-table-sqlitestudio
1474 deploying-machine-learning-models
1475 creating-plots-using-matplotlib-python-ar
1476 ai-deep-learning-capstone
1477 bahs-tarteb-tasfeya-el-baynat-fi-microsoft-excel
1478 excel-para-negocios
1479 big-data-introduction-ar
1480 googlecloud-usin

1663 machine-learning-projects-ar
1664 finalize-a-data-science-project
1665 big-data-integration-processing
1666 trier-et-filtrer-les-donnes-en-sql
1667 introduction-to-pymc3
1668 storytelling-with-data
1669 simulation-models-for-decision-making
1670 information-visualization-applied-perception
1671 data-science-course
1672 trabajando-dask
1673 googlecloud-use-google-maps-api-to-visualize-bigquery-geospatial-data-rav9g
1674 preparing-sas-programming-certification
1675 query-client-data-libreoffice-base
1676 r-programming-ko
1677 getting-started-with-git-and-github
1678 machine-learning-interpretable-shap-p
1679 analyzing-and-visualizing-data-in-looker
1680 administracion-tecnologias-informacion
1681 ibm-data-topology
1682 data-analysis-python
1683 data-management
1684 analise-de-dados-com-programacao-em-r
1685 leveraging-real-time-analytics-in-slack-local-and-surveymonkey-integrations
1686 spreadsheets-beginner-google-sheets
1687 ibm-data-analyst-capstone-project
1688 classify-radio-si

Unnamed: 0,courseType,s12nIds,description,domainTypes,id,slug,instructorIds,specializations,primaryLanguages,partnerIds,...,ratings_count,num_enrolled,difficulty,length,ins_stars,ins_ratings_count,ins_learners_count,ins_courses_count,content_rating,content_ratings_count
0,v2.ondemand,[eYmv6d_7EeWOdA7Ea609SQ],This course will cover the steps used in weigh...,"[{'subdomainId': 'data-analysis', 'domainId': ...",0HiU7Oe4EeWTAQ4yevf_oQ,missing-data,[8394050],[],[en],[32],...,119.0,10404.0,,18.0,3.57,7.0,13999,5.0,,
1,v2.ondemand,[],"In this 1-hour long project-based course, we w...","[{'subdomainId': 'data-analysis', 'domainId': ...",zjIfbv3pEeqRMRJTGi3b6Q,cluster-analysis-rcmdr,[70333041],[],[en],[565],...,,,,,,,3340,,,
2,v2.ondemand,[],"In this 2-hour long project-based course, you ...","[{'subdomainId': 'data-analysis', 'domainId': ...",LuXbfNnCEeq3nQoreMGNOQ,performing-data-aggregation-using-sql-aggregat...,[40353561],[],[en],[565],...,58.0,2657.0,,,,,20863,,,
3,v2.ondemand,[],Machine learning (ML) is one of the fastest gr...,"[{'subdomainId': 'machine-learning', 'domainId...",GHxiT9MGEemYpxKNJIfVmA,aws-machine-learning,[37525971],[],[en],[338],...,6218.0,191477.0,Intermediate Level,9.0,4.53,1809.0,453721,5.0,94%,9896.0
4,v2.ondemand,[SIs8zE87EemPhBLbT0zqZA],This course is designed to quite literally ‘ma...,"[{'subdomainId': 'data-analysis', 'domainId': ...",hwvdsk86Eem0lhKc4vCNig,social-network-analysis,[1503466],[],[en],[83],...,179.0,11250.0,Beginner Level,10.0,4.78,49.0,36006,5.0,,
5,v2.ondemand,[],"By the end of this course, learners will under...","[{'subdomainId': 'algorithms', 'domainId': 'co...",5YCz7-zMEeeMzQrhp6Bs1g,computer-vision-basics,"[30951525, 33716827]",[],[en],"[458, 117]",...,1738.0,71144.0,Intermediate Level,13.0,4.28,400.0,74032,4.0,96%,8116.0
6,v2.ondemand,[],Welcome to this project-based course on Analyz...,"[{'subdomainId': 'machine-learning', 'domainId...",fd3ZQRthEeqQJw6Wlm3hnw,analyze-data-seaborn-python,[15528766],[],[en],[565],...,171.0,4122.0,,,5.0,7.0,139004,,,
7,v2.ondemand,[],The Code Free Data Science class is designed f...,"[{'subdomainId': 'machine-learning', 'domainId...",GJT3MD6xEemL3g4W6JiGWA,code-free-data-science,[4972118],[],[en],[53],...,165.0,20062.0,Beginner Level,14.0,4.28,88.0,195893,4.0,92%,1187.0
8,v2.ondemand,[utjeMEgmEeWJSQ6Y2aPmQQ],Accounting Analytics explores how financial st...,"[{'subdomainId': 'finance', 'domainId': 'busin...",rc5KG0aUEeWG1w6arGoEIQ,accounting-analytics,"[1937011, 14757138]",[],[en],[6],...,2811.0,96638.0,,10.0,4.67,174.0,363139,5.0,92%,5673.0
9,v2.ondemand,[3UP06qzSEeupbwow02JDpQ],This course is designed for business and data ...,"[{'subdomainId': 'data-analysis', 'domainId': ...",AW_qutKPEeux6ApRYvYN1w,extract-transform-and-load-data,"[73411398, 19542214, 50069859]",[],[en],[639],...,,,Intermediate Level,15.0,,,2506,6.0,,


# Reorder columns

In [19]:
# Current column names, in order, and how many there are.
cols = list(df.columns)
print(cols)
print(len(cols))

['courseType', 's12nIds', 'description', 'domainTypes', 'id', 'slug', 'instructorIds', 'specializations', 'primaryLanguages', 'partnerIds', 'certificates', 'name', 'subtitleLanguages', 'startDate', 'domain1', 'subdomain1', 'domain2', 'subdomain2', 'stars', 'ratings_count', 'num_enrolled', 'difficulty', 'length', 'ins_stars', 'ins_ratings_count', 'ins_learners_count', 'ins_courses_count', 'content_rating', 'content_ratings_count']
29


In [20]:
# Desired column order for the final table. The nested domainTypes column is
# intentionally left out, since domain1/subdomain1/domain2/subdomain2 replace it.
catalog_cols = [
  'name', 'slug',
  'domain1', 'subdomain1', 'domain2', 'subdomain2',
  'description', 'startDate',
  'partnerIds', 'instructorIds',
  'primaryLanguages', 'subtitleLanguages',
  'specializations', 'certificates',
  'courseType', 'id', 's12nIds',
]
scraped_cols = [
  'stars', 'ratings_count', 'num_enrolled',
  'difficulty', 'length',
  'ins_stars', 'ins_ratings_count', 'ins_learners_count', 'ins_courses_count',
  'content_rating', 'content_ratings_count',
]
cols = catalog_cols + scraped_cols
print(len(cols))

28


In [22]:
# Select the columns in the order given by `cols`, save the result, and preview it.
df = df.loc[:, cols]
df.to_csv('courses_DS.csv', index=False)
df.head()

Unnamed: 0,name,slug,domain1,subdomain1,domain2,subdomain2,description,startDate,partnerIds,instructorIds,...,ratings_count,num_enrolled,difficulty,length,ins_stars,ins_ratings_count,ins_learners_count,ins_courses_count,content_rating,content_ratings_count
0,Dealing With Missing Data,missing-data,data-science,data-analysis,social-sciences,governance-and-society,This course will cover the steps used in weigh...,1471901099812,[32],[8394050],...,119.0,10404.0,,18.0,3.57,7.0,13999,5.0,,
1,Cluster Analysis using RCmdr,cluster-analysis-rcmdr,data-science,data-analysis,,,"In this 1-hour long project-based course, we w...",1601055478420,[565],[70333041],...,,,,,,,3340,,,
2,Performing Data Aggregation using SQL Aggregat...,performing-data-aggregation-using-sql-aggregat...,data-science,data-analysis,information-technology,data-management,"In this 2-hour long project-based course, you ...",1597188408088,[565],[40353561],...,58.0,2657.0,,,,,20863,,,
3,Getting Started with AWS Machine Learning,aws-machine-learning,data-science,machine-learning,computer-science,software-development,Machine learning (ML) is one of the fastest gr...,1568841469010,[338],[37525971],...,6218.0,191477.0,Intermediate Level,9.0,4.53,1809.0,453721,5.0,94%,9896.0
4,Social Network Analysis,social-network-analysis,data-science,data-analysis,social-sciences,education,This course is designed to quite literally ‘ma...,1576005639598,[83],[1503466],...,179.0,11250.0,Beginner Level,10.0,4.78,49.0,36006,5.0,,
