In [1]:
from urllib.parse import urljoin

import gender_guesser.detector as gender
import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm

## SCRAPE SPEAKERS

In [14]:
def _field_texts(content, wrapper_class, field_class, default):
    """Return the texts of the value <div>s of one session field, or ``default``.

    Looks up the first ``div.<wrapper_class>`` inside ``content``, then the first
    ``div.<field_class>`` inside that, and collects the text of every nested
    ``<div>`` except the first one.
    """
    try:
        field = content.find_all('div', {'class': wrapper_class})[0].find_all(
            'div', {'class': field_class})[0]
        # The first nested <div> is skipped (presumably the field label -- confirm against the page markup).
        return [div.text for div in field.find_all('div')[1:]]
    except IndexError:
        # Either the wrapper or the field element is absent from this session.
        return default


def content_parser(content):
    """Parse one session element into ``{title: details}``.

    Args:
        content: a BeautifulSoup element for a single session; it must contain an
            ``h3.accordion__title`` and a ``div.content-type`` holding a
            ``span.accordion__value``.

    Returns:
        A one-entry dict keyed by the stripped session title. The value is a dict
        with the keys ``'speakers'``, ``'session-type'``, ``'theme'`` and
        ``'categories'``. ``'speakers'``, ``'theme'`` and ``'categories'`` are lists
        of strings, or the placeholder strings ``'No speakers'``, ``'No theme'`` and
        ``'No categories'`` when the corresponding element is missing.

    Raises:
        IndexError: if the title or the session-type element is missing.
    """
    key = content.find_all('h3', {'class': 'accordion__title'})[0].text.strip()
    session_type = content.find_all('div', {'class': 'content-type'})[0].find_all(
        'span', {'class': 'accordion__value'})[0].text

    theme = _field_texts(content, 'theme', 'field--session-theme', 'No theme')
    categories = _field_texts(content, 'category', 'field--session-category', 'No categories')
    speakers = _field_texts(content, 'speakers', 'field--session-speakers', 'No speakers')

    return {key: {'speakers': speakers, 'session-type': session_type, 'theme': theme, 'categories': categories}}

def scrape_sessions(page='https://tc19.tableau.com/learn/sessions'):
    """Scrape every page of a session listing and return the sessions keyed by title.

    Starting at ``page``, each page is downloaded, every ``div.accordion__item`` is
    parsed with ``content_parser``, and the "next" pager link is followed until a
    page has none.

    Args:
        page: URL of the first listing page.

    Returns:
        dict mapping session title to its details dict (as produced by
        ``content_parser``). If a title is parsed more than once, the last
        occurrence wins.
    """
    sessions = {}

    while True:
        r = requests.get(page)
        soup = BeautifulSoup(r.content, 'html5lib')
        for item in soup.find_all('div', {'class': 'accordion__item'}):
            sessions.update(content_parser(item))

        try:
            next_href = soup.find_all(
                'li', {'class': 'pager__item pager__item--next'})[0].find_all('a')[0]['href']
        except (IndexError, KeyError):
            # No "next" pager item, no anchor inside it, or no href on it: last page reached.
            break
        # Resolve against the current page so relative links and other hosts work too.
        page = urljoin(page, next_href)

    return sessions

def try_scrape_sessions():
    """Run ``scrape_sessions`` once, returning ``None`` instead of raising on failure.

    Only ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate and a long batch of scrapes can be stopped.

    Returns:
        The dict returned by ``scrape_sessions``, or ``None`` if it raised.
    """
    try:
        return scrape_sessions()
    except Exception:
        return None
    

In [15]:
# Scrape the listing 1000 times: every run returns partially different results,
# so all runs are collected here (failed runs are stored as None).
sessions_multi = []
for attempt in tqdm(range(1000)):
    sessions_multi.append(try_scrape_sessions())

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [16]:
# summarize: merge all runs into one dict of unique sessions keyed by title
# (when a title appears in several runs, the last run's details are kept)
sum_sessions = {}
for session in tqdm(sessions_multi):
    # try_scrape_sessions() returns None for a failed run; there is nothing to merge then
    if session is None:
        continue
    sum_sessions.update(session)

HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [20]:
# number of unique session titles collected
len(sum_sessions)

375

In [22]:
# join multiple values from list to str
def list_str_joiner(val, join_str=' & '):
    """Collapse a scraped field value into a single string.

    Args:
        val: either a string (returned unchanged) or a list/tuple of strings.
        join_str: separator placed between the items of a list/tuple.

    Returns:
        ``val`` itself when it is a string, the joined string when it is a list or
        tuple, and ``None`` for any other type.
    """
    # isinstance (rather than an exact type comparison) also accepts str/list subclasses.
    if isinstance(val, str):
        return val
    if isinstance(val, (list, tuple)):
        return join_str.join(val)
    return None
    

In [23]:
# create a dictionary of speakers: speaker name -> [title, session-type, themes, categories]
# NOTE: keyed by speaker name, so a speaker listed on several sessions keeps only the last one seen.
speaker_dict = {}
for k, v in sum_sessions.items():
    speakers = v['speakers']
    # A session without speakers carries the placeholder string 'No speakers';
    # iterating over it would register every single character as a speaker.
    if isinstance(speakers, str):
        continue
    for speaker in speakers:
        speaker_dict[speaker] = [k, v['session-type'], list_str_joiner(v['theme']), list_str_joiner(v['categories'])]

In [24]:
# build the speaker table: one row per speaker (the index), one column per session attribute
session_columns = ['title', 'session-type', 'themes', 'categories']
session_data = pd.DataFrame.from_dict(speaker_dict, orient='index', columns=session_columns)

## GUESS THE GENDER

In [25]:
# instantiate the gender_guesser detector; its get_gender() is called on each first name
d = gender.Detector()

In [26]:
# the detector works on first names: keep the text before the first space of each speaker name
first_names = [full_name.partition(' ')[0] for full_name in session_data.index]

In [27]:
# run the detector on every first name and attach the result as a new 'gender' column
genders = list(map(d.get_gender, first_names))
session_data['gender'] = genders

In [29]:
# preview the first 10 rows of the speaker table
session_data.head(10)

Unnamed: 0,title,session-type,themes,categories,gender
Sarah Battersby,All Over the Map! Level Up your Spatial Vizzes,Session,Data and Analytics Skills,No categories,female
Ashwin Kumar,A Beginner's Guide to Maps,Session,Data and Analytics Skills,No categories,unknown
Eric Freeman,Just a Dash of Inspiration: Effective Visual D...,Hands-on Training,Dashboard and Design,No categories,male
Maddie Rawding,Just a Dash of Inspiration: Effective Visual D...,Hands-on Training,Dashboard and Design,No categories,female
Zach Ahrens,Agility and Deployment Best Practices,Session,Data Culture,No categories,male
Sanjeev Verma,Centralized Row-Level Security,Session,Data Management,No categories,male
Miranda Osterheld,Tableau Hacks: Use Tableau Server to Fit Your ...,Session,Data Culture,No categories,female
Ann Jackson,Zen Master: Tableau Speed Tipping,Session,Data and Analytics Skills,No categories,female
Lorna Eden,Zen Master: Tableau Speed Tipping,Session,Data and Analytics Skills,No categories,female
Priyatham Pamu,"Understanding Tableau Queries, Techniques and ...",Session,Data Management,No categories,unknown


In [30]:
# write the speaker table to a CSV file (relative path, so it is resolved against the working directory)
session_data.to_csv('tableau-speakers-gender-scrape.csv')