In [None]:
import logging
from urllib.parse import urlencode

import bs4
import pandas as pd
import requests
from dask.distributed import Client
from toolz.dicttoolz import merge
from toolz.itertoolz import mapcat, unique

In [None]:
# Named logger used by the scraping functions in this notebook.
logger = logging.getLogger('get_bas_videos')
# basicConfig() gives the root logger a default handler so the records are actually shown.
logging.basicConfig()
# Keep this as its own statement: setLevel() returns None, so chaining it onto
# getLogger() would bind `logger` to None instead of the logger object.
logger.setLevel(logging.DEBUG)

In [None]:
# Dask distributed client used below to fan the gallery-page scraping out via client.map().
# No scheduler address is passed, so by default this starts a local cluster.
client = Client()

In [None]:
# Site root; it is prepended to the href of every session link scraped from the gallery pages.
base_url = 'https://powerusers.microsoft.com'

### Dynamics, PowerApps and Power BI sessions

In [None]:
def get_pagefull_PA_MBAS_Gallery(page):
    """Return the session URLs listed on one page of the PA_MBAS_Gallery board.

    Parameters
    ----------
    page : int
        Page number, interpolated into the board URL (``.../page/{page}``).

    Returns
    -------
    list of str
        ``base_url`` + href for every message card on the page that has a link.
        Empty if the page contains no message cards.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (connection error or timeout).
    """
    page_url = f"{base_url}/t5/MBAS-Gallery/bd-p/PA_MBAS_Gallery/page/{page}"

    logger.debug(f"page_url: {page_url}")

    # Without a timeout a stalled connection would block this task indefinitely.
    r = requests.get(page_url, timeout=30)
    if not r.ok:
        # Still parse whatever came back, but make the failure visible in the log.
        logger.warning(f"page_url: {page_url} returned HTTP {r.status_code}")

    # Name the parser explicitly so the result does not depend on which optional
    # parsers (lxml, html5lib) happen to be installed on the machine running this.
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    session_urls = []

    for card in soup.find_all("div", {"class": "lia-messages-message-card"}):
        link = card.select_one("div > a")
        # Skip cards without a usable link instead of aborting the whole page.
        if link is None or not link.get('href'):
            continue
        session_urls.append(base_url + link['href'])

    return session_urls
    

In [None]:
futures = client.map(get_pagefull_PA_MBAS_Gallery, range(1,13))

In [None]:
session_urls_PA_MBAS_Gallery = client.gather(futures)

In [None]:
session_urls_PA_MBAS_Gallery = list(mapcat(lambda x: x, session_urls_PA_MBAS_Gallery))

In [None]:
session_urls_PA_MBAS_Gallery

In [None]:
session_urls_PA_MBAS_Gallery = [x for x in unique(session_urls_PA_MBAS_Gallery)]

In [None]:
len(session_urls_PA_MBAS_Gallery)

### Flow sessions

In [None]:
def get_pagefull_FL_MBAS_Gallery(page):
    """Return the session URLs listed on one page of the FL_MBAS_Gallery board.

    Parameters
    ----------
    page : int
        Page number, interpolated into the board URL (``.../page/{page}``).

    Returns
    -------
    list of str
        ``base_url`` + href for every message card on the page that has a link.
        Empty if the page contains no message cards.

    Raises
    ------
    requests.RequestException
        If the page cannot be fetched (connection error or timeout).
    """
    page_url = f"{base_url}/t5/MBAS-Gallery/bd-p/FL_MBAS_Gallery/page/{page}"

    logger.debug(f"page_url: {page_url}")

    # Without a timeout a stalled connection would block this task indefinitely.
    r = requests.get(page_url, timeout=30)
    if not r.ok:
        # Still parse whatever came back, but make the failure visible in the log.
        logger.warning(f"page_url: {page_url} returned HTTP {r.status_code}")

    # Name the parser explicitly so the result does not depend on which optional
    # parsers (lxml, html5lib) happen to be installed on the machine running this.
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    session_urls = []

    for card in soup.find_all("div", {"class": "lia-messages-message-card"}):
        link = card.select_one("div > a")
        # Skip cards without a usable link instead of aborting the whole page.
        if link is None or not link.get('href'):
            continue
        session_urls.append(base_url + link['href'])

    return session_urls
    

In [None]:
futures = client.map(get_pagefull_FL_MBAS_Gallery, range(1,13))

In [None]:
session_urls_FL_MBAS_Gallery = client.gather(futures)

In [None]:
session_urls_FL_MBAS_Gallery = list(mapcat(lambda x: x, session_urls_FL_MBAS_Gallery))

In [None]:
session_urls_FL_MBAS_Gallery = [x for x in unique(session_urls_FL_MBAS_Gallery)]

In [None]:
len(session_urls_FL_MBAS_Gallery)

### Consolidate session URLs

In [None]:
session_urls = list(set(session_urls_PA_MBAS_Gallery + session_urls_FL_MBAS_Gallery)) 

In [None]:
df = pd.DataFrame({'session_urls': session_urls})

In [None]:
df.info()

### Deduplicate sessions

In [None]:
split_url_df = df['session_urls'].str.rpartition('/')

In [None]:
df.insert( len(df.columns), 'urlpart1', split_url_df[0])

In [None]:
df.insert( len(df.columns), 'urlpart2', split_url_df[2])

In [None]:
df.info()

In [None]:
df.to_json('session_urls_dups.json')

In [None]:
df.drop_duplicates(subset=['urlpart1'], keep='first', inplace=True)

### Store session URLs for future reference

In [None]:
# Persist the de-duplicated URL table; the next section reloads it from this file.
df.to_json('session_urls.json')

### Read from storage to prevent multiple, time-consuming runs to collect session URLs

In [None]:
df = pd.read_json( 'session_urls.json')

In [None]:
df = pd.read_csv( 'session_urls.csv')

### Get individual session data

In [None]:
def _child_or_blank(tag, index):
    """Return ``tag.contents[index]``, or '' when ``tag`` is None or has no children."""
    if tag is None or not tag.contents:
        return ''
    return tag.contents[index]


def get_session_data(session_url):
    """Scrape one session page, rendered through a local Splash instance, into a dict.

    Parameters
    ----------
    session_url : str
        URL of the session page; it is passed to the Splash ``render.html``
        endpoint on ``localhost:8050``.

    Returns
    -------
    dict
        Always has the keys 'session_url', 'Title: ', 'Date: ', 'Video URL: ',
        'Description: ' and 'Other resources: ' (value '' when the element is
        not found on the page), followed by one entry per label/value pair
        found in the page's ``custom-mbas-labels`` block, if there is one.

    Raises
    ------
    requests.RequestException
        If Splash cannot be reached or does not answer within the client timeout.
    """
    # URL-encode the target so a '?', '&' or '#' inside it is not read as part
    # of Splash's own query string.
    query = urlencode({'url': session_url, 'timeout': 10, 'wait': 0.5})
    splash_url = f"http://localhost:8050/render.html?{query}"

    logger.debug(f"splash_url: {splash_url}")

    # Splash itself is asked to give up after 10 s; the client-side timeout is
    # longer so Splash's own error response can still arrive, yet a dead
    # Splash process cannot hang the loop forever.
    r = requests.get(splash_url, timeout=30)
    if not r.ok:
        # Parse whatever came back, but make the failure visible in the log.
        logger.warning(f"splash_url: {splash_url} returned HTTP {r.status_code}")

    # Name the parser explicitly so the result does not depend on which optional
    # parsers (lxml, html5lib) happen to be installed.
    soup = bs4.BeautifulSoup(r.content, "html.parser")

    sess_details = {'session_url': session_url}

    # --- subject block: title and date --------------------------------------
    subject = soup.find("div", {"class": "lia-message-subject"})
    heading = subject.select_one("div > div > h1") if subject is not None else None
    # The title is taken from the LAST child node of the <h1>.
    sess_details['Title: '] = _child_or_blank(heading, -1)

    date_span = subject.find("span", {"class": "local-friendly-date"}) if subject is not None else None
    sess_details['Date: '] = _child_or_blank(date_span, 0).strip()

    # --- embedded video ------------------------------------------------------
    video_frame = soup.find("iframe", {"class": "videoIframe"})
    sess_details['Video URL: '] = video_frame.get('src', '') if video_frame is not None else ''

    # --- message body: description and attached resource ---------------------
    body = soup.find("div", {"class": "lia-message-body-content"})

    first_paragraph = body.select_one("p") if body is not None else None
    sess_details['Description: '] = _child_or_blank(first_paragraph, 0)

    file_span = body.find("span", {"class": "fileLink"}) if body is not None else None
    file_link = file_span.select_one("a") if file_span is not None else None
    sess_details['Other resources: '] = file_link.get('href', '') if file_link is not None else ''

    # --- custom label/value pairs --------------------------------------------
    # Only merge when the block exists; pages without it previously hit an
    # unbound `labels`/`values` here.
    label_box = soup.find("div", {"class": "custom-mbas-labels"})
    if label_box is not None:
        labels = [
            _child_or_blank(span, 0)
            for span in label_box.select("div.profileDetails > p > span.profilePageItemLabel")
        ]
        values = [
            _child_or_blank(span, 0)
            for span in label_box.select("div.profileDetails > p > span.profilePageItemValue")
        ]
        # Labels and values are paired by position.
        sess_details = merge(sess_details, dict(zip(labels, values)))

    return sess_details

In [None]:
# Create a list of dictionaries (each for a single session page)
LD = [get_session_data(x) for x in df.iloc[0:-1, 0]]

In [None]:
# Convert list of dictionaries to dictionary of lists
v = {k: [dic.get(k, '')  for dic in LD] for k in LD[0]}

In [None]:
# Convert dictionary of lists to pandas dataframe
df_final = pd.DataFrame(v)

In [None]:
df_final.head()

In [None]:
df_final.to_json('session_data.json')

In [None]:
len(df_final.index)