In [1]:
import requests
import bs4
from toolz.itertoolz import mapcat, unique
import logging
from dask.distributed import Client
import pandas as pd

In [2]:
# basicConfig() attaches a default handler to the root logger when none is
# configured, so the debug messages emitted below have somewhere to go.
logging.basicConfig()
logger = logging.getLogger('get_bas_videos')
# Keep setLevel() as its own statement: it returns None, so chaining it onto
# getLogger(...) would bind `logger` to None instead of the logger object.
logger.setLevel(logging.DEBUG)

In [3]:
# Dask distributed client used by the client.map / client.gather cells below.
# Called with no arguments, Client() sets up a local cluster (see Dask docs).
client = Client()

In [4]:
# Site root; the scraping functions prepend it to the hrefs found on gallery pages.
base_url = 'https://powerusers.microsoft.com'

In [5]:
def get_pagefull_PA_MBAS_Gallery(page, board="PA_MBAS_Gallery"):
    """Return the session URLs listed on one page of an MBAS Gallery board.

    Parameters
    ----------
    page : int
        Page number of the board listing to fetch (this notebook requests
        pages 1 through 12).
    board : str, optional
        Board identifier inserted into the listing URL. Defaults to
        ``"PA_MBAS_Gallery"``, the board this function was written for.

    Returns
    -------
    list of str
        ``base_url`` + href for every message card on the page that carries a
        link, in page order. Empty when the server answers with a non-OK
        status or the page contains no message cards.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the transport level or exceeds the timeout.
    """
    page_url = f"{base_url}/t5/MBAS-Gallery/bd-p/{board}/page/{page}"

    logger.debug(f"page_url: {page_url}")

    # A timeout keeps a stalled connection from blocking a worker indefinitely.
    r = requests.get(page_url, timeout=30)
    if not r.ok:
        # Surface HTTP errors instead of silently parsing an error page.
        logger.warning(f"page_url: {page_url} returned HTTP {r.status_code}")
        return []

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    session_urls = []

    for card in soup.find_all("div", {"class": "lia-messages-message-card"}):
        link = card.select_one("div > a")
        href = link.get("href") if link is not None else None
        if not href:
            # One malformed card must not fail the whole page (and with it
            # the gather of every other page).
            logger.warning(f"page_url: {page_url} has a message card without a link")
            continue
        session_urls.append(base_url + href)

    return session_urls
    

In [6]:
# Submit one fetch task per listing page; range(1,13) covers pages 1..12
# (the end of the range is exclusive).
futures = client.map(get_pagefull_PA_MBAS_Gallery, range(1,13))

In [7]:
# Wait for the page tasks and collect their results: one list of URLs per page.
session_urls_PA_MBAS_Gallery = client.gather(futures)

In [8]:
# Flatten the per-page lists into a single flat list of URLs, keeping page order.
session_urls_PA_MBAS_Gallery = [
    url
    for page_urls in session_urls_PA_MBAS_Gallery
    for url in page_urls
]

In [9]:
session_urls_PA_MBAS_Gallery

['https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Dynamics-365-for-Finance-and-Operations-What-s-new-in/m-p/299878',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Power-BI-The-future-of-modern-BI-roadmap-and-vision/m-p/298820',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Dynamics-365-for-Finance-and-Operations-How-Dual-Write/m-p/299884',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Power-BI-BI-power-hour/m-p/299883',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Dynamics-365-for-Finance-and-Operations-Updated/m-p/299814',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Dynamics-365-Business-Central-roadmap-and-overview/m-p/299140',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-Flow-Vision-and-feature-roadmap/m-p/299125',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-PowerApps-Vision-and-roadmap/m-p/299119',
 'https://powerusers.microsoft.com/t5/MBAS-Gallery/Real-World-St

In [10]:
# Drop repeated URLs; toolz' unique yields each value once, in first-seen order.
session_urls_PA_MBAS_Gallery = list(unique(session_urls_PA_MBAS_Gallery))

In [11]:
len(session_urls_PA_MBAS_Gallery)

223

### Flow sessions

In [12]:
def get_pagefull_FL_MBAS_Gallery(page, board="FL_MBAS_Gallery"):
    """Return the session URLs listed on one page of an MBAS Gallery board.

    Parameters
    ----------
    page : int
        Page number of the board listing to fetch (this notebook requests
        pages 1 through 12).
    board : str, optional
        Board identifier inserted into the listing URL. Defaults to
        ``"FL_MBAS_Gallery"``, the board this function was written for.

    Returns
    -------
    list of str
        ``base_url`` + href for every message card on the page that carries a
        link, in page order. Empty when the server answers with a non-OK
        status or the page contains no message cards.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails at the transport level or exceeds the timeout.
    """
    page_url = f"{base_url}/t5/MBAS-Gallery/bd-p/{board}/page/{page}"

    logger.debug(f"page_url: {page_url}")

    # A timeout keeps a stalled connection from blocking a worker indefinitely.
    r = requests.get(page_url, timeout=30)
    if not r.ok:
        # Surface HTTP errors instead of silently parsing an error page.
        logger.warning(f"page_url: {page_url} returned HTTP {r.status_code}")
        return []

    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed (and warns), so results could differ
    # between machines.
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    session_urls = []

    for card in soup.find_all("div", {"class": "lia-messages-message-card"}):
        link = card.select_one("div > a")
        href = link.get("href") if link is not None else None
        if not href:
            # One malformed card must not fail the whole page (and with it
            # the gather of every other page).
            logger.warning(f"page_url: {page_url} has a message card without a link")
            continue
        session_urls.append(base_url + href)

    return session_urls
    

In [13]:
# Submit one fetch task per listing page; range(1,13) covers pages 1..12
# (the end of the range is exclusive). Note this rebinds `futures`.
futures = client.map(get_pagefull_FL_MBAS_Gallery, range(1,13))

In [14]:
# Wait for the page tasks and collect their results: one list of URLs per page.
session_urls_FL_MBAS_Gallery = client.gather(futures)

In [15]:
# Flatten the per-page lists into a single flat list of URLs, keeping page order.
session_urls_FL_MBAS_Gallery = [
    url
    for page_urls in session_urls_FL_MBAS_Gallery
    for url in page_urls
]

In [16]:
# Drop repeated URLs; toolz' unique yields each value once, in first-seen order.
session_urls_FL_MBAS_Gallery = list(unique(session_urls_FL_MBAS_Gallery))

In [17]:
len(session_urls_FL_MBAS_Gallery)

222

In [18]:
# Merge both boards and drop URLs that appear in both. unique() keeps
# first-seen order, so the DataFrame built from this list has the same row
# order on every run. list(set(...)) did not: str hashing is randomised per
# interpreter session, which also changed which duplicate the later
# keep='first' de-duplication retained.
session_urls = list(unique(session_urls_PA_MBAS_Gallery + session_urls_FL_MBAS_Gallery))

In [19]:
# One row per session URL, in a single column named 'session_urls'.
df = pd.DataFrame(dict(session_urls=session_urls))

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 445 entries, 0 to 444
Data columns (total 1 columns):
session_urls    445 non-null object
dtypes: object(1)
memory usage: 3.6+ KB


In [21]:
# Copy the URL table to the system clipboard as tab-separated text without the
# index column (presumably for pasting into a spreadsheet; needs a clipboard backend).
df.to_clipboard(sep='\t', index=False)

In [22]:
# Split each URL at its LAST '/': column 0 holds everything before it,
# column 1 the separator itself, column 2 the trailing segment.
split_url_df = df['session_urls'].str.rpartition('/', expand=True)

In [23]:
# Append the part of the URL before the last '/' as column 'urlpart1'.
# Plain assignment adds the column at the end on the first run (same as
# insert at len(df.columns)) but, unlike DataFrame.insert, does not raise
# ValueError when the cell is re-run and the column already exists.
df['urlpart1'] = split_url_df[0]

In [24]:
# Append the URL segment after the last '/' as column 'urlpart2'.
# Plain assignment adds the column at the end on the first run (same as
# insert at len(df.columns)) but, unlike DataFrame.insert, does not raise
# ValueError when the cell is re-run and the column already exists.
df['urlpart2'] = split_url_df[2]

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 218 entries, 0 to 405
Data columns (total 3 columns):
session_urls    218 non-null object
urlpart1        218 non-null object
urlpart2        218 non-null object
dtypes: object(3)
memory usage: 6.8+ KB


In [26]:
# Keep only the first row for each distinct 'urlpart1' value, i.e. URLs that
# differ only in their last path segment collapse to one row. Mutates df in place.
df.drop_duplicates(
    subset=['urlpart1'],
    keep='first',
    inplace=True,
)

In [27]:
# Copy the current state of df (including the urlpart columns) to the system
# clipboard as tab-separated text without the index column.
df.to_clipboard(sep='\t', index=False)

In [None]:
# Scratch cell: fetch a single session page and look for the YouTube link in it.
# (Plain string literal: there are no placeholders, so no f-prefix is needed.)
page_url = "https://powerusers.microsoft.com/t5/MBAS-Gallery/Microsoft-PowerApps-Accelerate-your-journey-for-building-and/m-p/299118"

# The timeout prevents an indefinite hang; fail loudly on an HTTP error rather
# than parsing an error page.
r = requests.get(page_url, timeout=30)
r.raise_for_status()

# Explicit parser so the result does not depend on which parsers are installed.
soup = bs4.BeautifulSoup(r.content, "html.parser")

# A multi-class string passed to find_all matches the class attribute as an
# exact string, in this order.
# NOTE(review): these ytp-* classes look like YouTube player markup; confirm
# they are present in the HTML returned by requests before relying on `tags`.
tags = soup.find_all("a", {"class": "ytp-youtube-button ytp-button yt-uix-sessionlink"})

# TODO: extract the video URL from `tags` once the selector is confirmed.


In [51]:
# Load session metadata from a local JSON file. The absolute drive-letter path
# is hard-coded and no cell visible in this notebook writes the file.
# NOTE(review): confirm how it is produced.
session_data_df = pd.read_json( 'c://mmm/session_data1.json')

In [52]:
session_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 221 entries, 0 to 99
Data columns (total 12 columns):
session_url           221 non-null object
Title:                221 non-null object
Date:                 221 non-null object
Video URL:            221 non-null object
Description:          221 non-null object
Other resources:      221 non-null object
Session Type:         221 non-null object
Session Code:         221 non-null object
Product Category:     221 non-null object
Product:              221 non-null object
Level:                221 non-null object
Presenter:            221 non-null object
dtypes: object(12)
memory usage: 22.4+ KB


In [53]:
# Copy the loaded session metadata to the system clipboard as tab-separated
# text without the index column.
session_data_df.to_clipboard(sep='\t', index=False)

In [45]:
# Load a second session-metadata JSON file. The absolute drive-letter path is
# hard-coded and no cell visible in this notebook writes the file.
# NOTE(review): confirm how it is produced.
session_data2_df = pd.read_json( 'c://mmm/session_data2.json')

In [46]:
session_data2_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 0 to 99
Data columns (total 11 columns):
Title:                117 non-null object
Date:                 117 non-null object
Video URL:            117 non-null object
Description:          117 non-null object
Other resources:      117 non-null object
Session Type:         117 non-null object
Session Code:         117 non-null object
Product Category:     117 non-null object
Product:              117 non-null object
Level:                117 non-null object
Presenter:            117 non-null object
dtypes: object(11)
memory usage: 11.0+ KB


In [47]:
# Copy the second metadata table to the system clipboard as tab-separated
# text without the index column.
session_data2_df.to_clipboard(sep='\t', index=False)

In [35]:
# Load a table of session URLs from a local JSON file. The absolute
# drive-letter path is hard-coded and no cell visible in this notebook writes
# the file. NOTE(review): confirm how it is produced.
session_urls_df = pd.read_json( 'c://mmm/session_urls.json')

In [None]:
session_urls_df

In [38]:
# Copy the loaded URL table to the system clipboard as tab-separated text
# without the index column.
session_urls_df.to_clipboard(sep='\t', index=False)

In [48]:
# Load the file named session_urls_dups.json (presumably the duplicate URLs;
# verify). The absolute drive-letter path is hard-coded and no cell visible in
# this notebook writes the file.
session_urls_dups_df = pd.read_json( 'c://mmm/session_urls_dups.json')

In [None]:
session_urls_dups_df

In [50]:
# Copy the loaded table to the system clipboard as tab-separated text without
# the index column.
session_urls_dups_df.to_clipboard(sep='\t', index=False)