# Install required packages

In [None]:
!pip install xlsxwriter -q
!pip install keybert -q
!pip install wordwise -q

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
from pprint import pprint
import xlsxwriter
from keybert import KeyBERT
from wordwise import Extractor

# Load the Data

**Use** the [requests](https://pypi.org/project/requests/) library to send an HTTP `GET` request to the server. Save the response to a variable and check its status code: a status of 200 indicates that the request was successful. The body of the server's response is available as `page.text`.

In [None]:
# Alternative pages are kept commented out; exactly one `url` assignment should be active.
# url = "https://www.cloudskillsboost.google/course_templates/72?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346376"
# url = "https://www.cloudskillsboost.google/quests/132?catalog_rank=%7B%22rank%22%3A2%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346300"
url = "https://www.cloudskillsboost.google/course_templates/53?catalog_rank=%7B%22rank%22%3A1%2C%22num_filters%22%3A0%2C%22has_search%22%3Atrue%7D&search_id=25346338"
# requests has no default timeout; without one this cell can hang forever if the
# server never answers.
page = requests.get(url, timeout=30)

In [None]:
print(page.status_code)
print('\n')
# Show only the beginning of the document: the full HTML is very long and would
# swamp the notebook output.
print(page.text[:2000])

200










<!DOCTYPE html>
<html lang='en'>
<head>
<title>Building Batch Data Pipelines on Google Cloud | Google Cloud Skills Boost</title>
<meta name="action-cable-url" content="/cable" />
<script>
//<![CDATA[
window.gon={};gon.deployment="google-run";
//]]>
</script>
<script>
  (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':
  new Date().getTime(),event:'gtm.js'});var f=d.getElementsByTagName(s)[0],
  j=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=
  'https://www.googletagmanager.com/gtm.js?id='+i+dl;f.parentNode.insertBefore(j,f);
  })(window,document,'script','dataLayer',"GTM-5XSKHDX");
</script>
<script src="https://www.googletagmanager.com/gtag/js?id=G-2X30ZRBDSG" async="async"></script>
<script>
  window.dataLayer = window.dataLayer || [];
  function gtag(){dataLayer.push(arguments);}
  gtag('js', new Date());
  gtag('config', "G-2X30ZRBDSG", {
    user_id: ""
  });
</script>
<script src="https://cdn.qwiklabs.com/assets/hallofmirrors/polyfills

Looking at the server's response, we see that it returns an HTML document. Therefore we will use [`BeautifulSoup`](https://pypi.org/project/beautifulsoup4/) to pull the data out of the HTML. It sits atop an HTML or XML parser, providing Pythonic idioms for iterating, searching, and modifying the parse tree.

In [None]:
# Parse the HTML and locate the <ql-course> element, whose `modules` attribute
# holds the course outline as a string.
soup = BeautifulSoup(page.text, "html.parser")
ql_course = soup.find("ql-course")
# find() returns None when no such tag exists; fail with a clear message instead
# of an opaque "'NoneType' object has no attribute 'attrs'" on the next line.
if ql_course is None:
  raise ValueError("No <ql-course> element found in the page; check the URL and the HTTP status code.")
modules = ql_course.attrs['modules']
print(type(modules))
print("\n\n")
print(modules)

<class 'str'>



[{"id":"59338","title":"Introduction","description":"\u003cp\u003eIn this module, we introduce the course and agenda\u003c/p\u003e","steps":[{"id":"386567","prompt":null,"isOptional":true,"activities":[{"id":"379215","href":null,"isLocked":false,"duration":55000,"title":"Course Introduction","type":"video","isComplete":false,"inProgress":false,"score":null,"disabled":false}],"isComplete":false,"allActivitiesRequired":false}],"expanded":false},{"id":"59339","title":"Introduction to Building Batch Data Pipelines","description":"\u003cp\u003eThis module reviews different methods of data loading: EL, ELT and ETL and when to use what\u003c/p\u003e","steps":[{"id":"386568","prompt":null,"isOptional":true,"activities":[{"id":"379216","href":null,"isLocked":false,"duration":69000,"title":"Module introduction","type":"video","isComplete":false,"inProgress":false,"score":null,"disabled":false}],"isComplete":false,"allActivitiesRequired":false},{"id":"386569","prompt":null,"isOpt

Convert the data from a JSON string into Python objects with `json.loads`. Then print the first entry in the resulting list to see the structure of the data.

In [None]:
# Parse the JSON text held in `modules` into Python objects.
json_modules = json.loads(modules)
# Confirm the parsed type, then pretty-print the first entry to inspect its structure.
print(type(json_modules))
print("\n\n")
pprint(json_modules[0])

<class 'list'>



{'description': '<p>In this module, we introduce the course and agenda</p>',
 'expanded': False,
 'id': '59338',
 'steps': [{'activities': [{'disabled': False,
                            'duration': 55000,
                            'href': None,
                            'id': '379215',
                            'inProgress': False,
                            'isComplete': False,
                            'isLocked': False,
                            'score': None,
                            'title': 'Course Introduction',
                            'type': 'video'}],
            'allActivitiesRequired': False,
            'id': '386567',
            'isComplete': False,
            'isOptional': True,
            'prompt': None}],
 'title': 'Introduction'}


Create a DataFrame to see how the data is represented in table format.

In [None]:
# One row per entry of `json_modules`; nested values such as `steps` stay as
# Python objects inside a single cell.
df = pd.DataFrame(json_modules)
df

Unnamed: 0,id,title,description,steps,expanded
0,59338,Introduction,"<p>In this module, we introduce the course and...","[{'id': '386567', 'prompt': None, 'isOptional'...",False
1,59339,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"[{'id': '386568', 'prompt': None, 'isOptional'...",False
2,59340,Executing Spark on Dataproc,<p>This module shows how to run Hadoop on Data...,"[{'id': '386575', 'prompt': None, 'isOptional'...",False
3,59341,Serverless Data Processing with Dataflow,<p>This module covers using Dataflow to build ...,"[{'id': '386587', 'prompt': None, 'isOptional'...",False
4,59342,Manage Data Pipelines with Cloud Data Fusion a...,<p>This module shows how to manage data pipeli...,"[{'id': '386604', 'prompt': None, 'isOptional'...",False
5,59343,Course Summary,<p>Course Summary</p>,"[{'id': '386620', 'prompt': None, 'isOptional'...",False
6,59344,Course Resources,<p>PDF links to all modules</p>,"[{'id': '386621', 'prompt': None, 'isOptional'...",False


Flatten the JSON output using the pandas function `json_normalize()`. This unpacks the nested lists in `steps` and `activities`: each activity becomes a separate row in our DataFrame, with its own fields in `record-` columns and the fields of its parent module repeated in `meta-` columns.

In [None]:
# Walk module -> steps -> activities and emit one row per activity.
# `record_path` is given as the documented flat list of keys (the original
# nested ['steps', ['activities']] form resolves to the same path).
# Module-level fields listed in `meta` are repeated on every activity row.
flatten_df = pd.json_normalize(
    json_modules,
    record_path=['steps', 'activities'],
    meta=['id', 'title', 'description'],
    meta_prefix='meta-',
    record_prefix='record-',
)
flatten_df.head(3)

Unnamed: 0,record-id,record-href,record-isLocked,record-duration,record-title,record-type,record-isComplete,record-inProgress,record-score,record-disabled,meta-id,meta-title,meta-description
0,379215,,False,55000,Course Introduction,video,False,False,,False,59338,Introduction,"<p>In this module, we introduce the course and..."
1,379216,,False,69000,Module introduction,video,False,False,,False,59339,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...
2,379217,,False,220000,"EL, ELT, ETL",video,False,False,,False,59339,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...


Drop unwanted columns to reduce the dimension of our dataframe.

In [None]:
# Remove the activity/module fields that are not needed for the analysis.
flatten_df.drop(
    columns=[
        'record-href',
        'record-isLocked',
        'record-isComplete',
        'record-inProgress',
        'record-score',
        'meta-id',
        'record-id',
        'record-disabled',
    ],
    inplace=True,
)

In [None]:
# List the columns that remain in `flatten_df`.
print(f'column names ----> {flatten_df.columns}')

column names ----> Index(['record-duration', 'record-title', 'record-type', 'meta-title',
       'meta-description'],
      dtype='object')


In [None]:
# Replace the prefixed json_normalize names with short, readable ones.
# errors='raise' makes a missing source column fail loudly instead of being skipped.
flatten_df.rename(
    columns={
        'meta-title': 'module',
        'meta-description': 'description',
        'record-title': 'activities',
        'record-duration': 'duration',
        'record-type': 'type',
    },
    inplace=True,
    errors='raise',
)

In [None]:
print(flatten_df.shape)
print("\n\n")
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
flatten_df.info()

(58, 5)



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58 entries, 0 to 57
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   duration     58 non-null     int64 
 1   activities   58 non-null     object
 2   type         58 non-null     object
 3   module       58 non-null     object
 4   description  58 non-null     object
dtypes: int64(1), object(4)
memory usage: 2.4+ KB
None


# Aggregating and Grouping the Data

## Using pd.groupby

In [None]:
def _join_activity_titles(titles):
  """Concatenate the given activity titles into one string separated by '. '."""
  return '. '.join(titles)

# transform() broadcasts the joined string back onto every row of its
# (module, description) group, so the frame keeps one row per activity.
module_groups = flatten_df.groupby(['module', 'description'])
flatten_df['text'] = module_groups['activities'].transform(_join_activity_titles)
flatten_df.head(10)

Unnamed: 0,duration,activities,type,module,description,text
0,55000,Course Introduction,video,Introduction,"<p>In this module, we introduce the course and...",Course Introduction
1,69000,Module introduction,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
2,220000,"EL, ELT, ETL",video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
3,168000,Quality considerations,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
4,180000,How to carry out operations in BigQuery,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
5,208000,Shortcomings,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
6,428000,ETL to solve data quality issues,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
7,0,Introduction to Building Batch Data Pipelines,quiz,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con..."
8,27000,Module introduction,video,Executing Spark on Dataproc,<p>This module shows how to run Hadoop on Data...,Module introduction. The Hadoop ecosystem. Run...
9,286000,The Hadoop ecosystem,video,Executing Spark on Dataproc,<p>This module shows how to run Hadoop on Data...,Module introduction. The Hadoop ecosystem. Run...


### initializing the model

[KeyBERT](https://pypi.org/project/keybert/) uses BERT embeddings and simple cosine similarity to find the sub-phrases in a document that are the most similar to the document itself.

In [None]:
# Create the KeyBERT extractor with its default settings (no arguments are passed).
keywords_model = KeyBERT()

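First, document embeddings are extracted with BERT to get a document-level representation. Then, word embeddings are extracted for N-gram words/phrases. An n-gram is a contiguous sequence of n words; the `keyphrase_ngram_range=(1, 1)` setting used below restricts the candidates to single words. Finally, the candidates whose embeddings are most similar to the document embedding are returned as keywords.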

In [None]:
def get_keywords_keybert(text, model):
  """Return the keywords that `model` extracts from `text`, without their scores.

  Args:
    text: The document passed to `model.extract_keywords`.
    model: Any object exposing `extract_keywords(text, keyphrase_ngram_range=...,
      stop_words=...)`; the notebook passes a KeyBERT instance.

  Returns:
    A list holding the first element of every entry returned by the model
    (presumably (keyword, score) pairs -- only index 0 is used here).
  """
  scored_keywords = model.extract_keywords(
      text,
      keyphrase_ngram_range=(1, 1),  # single-word candidates only
      stop_words=None,               # no stop-word filtering
  )
  keywords = []
  for entry in scored_keywords:
    keywords.append(entry[0])
  return keywords

In [None]:
# Every row of a module carries the same `text`, so run the (slow) model once
# per distinct text and reuse the result for all rows that share it, instead of
# once per row.
keywords_by_text = {
  text: get_keywords_keybert(text, keywords_model)
  for text in flatten_df['text'].unique()
}
flatten_df['keywords'] = flatten_df['text'].map(keywords_by_text.get)
# Preview only the first rows rather than dumping the whole frame.
flatten_df.head(10)

Unnamed: 0,duration,activities,type,module,description,text,keywords
0,55000,Course Introduction,video,Introduction,"<p>In this module, we introduce the course and...",Course Introduction,"[introduction, course]"
1,69000,Module introduction,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
2,220000,"EL, ELT, ETL",video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
3,168000,Quality considerations,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
4,180000,How to carry out operations in BigQuery,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
5,208000,Shortcomings,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
6,428000,ETL to solve data quality issues,video,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
7,0,Introduction to Building Batch Data Pipelines,quiz,Introduction to Building Batch Data Pipelines,<p>This module reviews different methods of da...,"Module introduction. EL, ELT, ETL. Quality con...","[bigquery, pipelines, batch, etl, data]"
8,27000,Module introduction,video,Executing Spark on Dataproc,<p>This module shows how to run Hadoop on Data...,Module introduction. The Hadoop ecosystem. Run...,"[hadoop, dataproc, hdfs, spark, cloud]"
9,286000,The Hadoop ecosystem,video,Executing Spark on Dataproc,<p>This module shows how to run Hadoop on Data...,Module introduction. The Hadoop ecosystem. Run...,"[hadoop, dataproc, hdfs, spark, cloud]"


# Save output to Excel workbook

In [None]:
# Write the final frame to sheet "CLMG004" of demo_GCBS.xlsx using the
# xlsxwriter engine; the `with` block closes the writer when it exits.
with pd.ExcelWriter('demo_GCBS.xlsx', engine='xlsxwriter') as writer:
  flatten_df.to_excel(writer, sheet_name="CLMG004")