## Import Libraries

In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from pprint import pprint

## Extract Information from GUE.com

In [2]:
# Base address of the instructor listing on GUE.com.
GUE_INSTRUCTORS_URL = "https://www.gue.com/diver-training/gue-instructors"
# Resume endpoint underneath the listing; it ends in "?id=" so an id can be appended.
GUE_INSTRUCTOR_RESUME_URL = f"{GUE_INSTRUCTORS_URL}/instructor-resume?id="

In [3]:
# Create Soup
# Fail fast instead of silently scraping garbage: the timeout keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() stops
# an HTTP error page from being parsed as if it were the instructor listing.
response = requests.get(GUE_INSTRUCTORS_URL, timeout=30)
response.raise_for_status()
page = response.content
soup = bs(page, 'html.parser')

In [4]:
# Group the <option> elements found inside the "Content" div into
# {filter name: [(option value, title-cased option label), ...]}.
filter_results = {}

instructor_container = soup.find(name='div', attrs={"id": "Content"})
options = instructor_container.find_all(name='option')
prev_option = None
option_values = []

for option in options:
    if "--" in option.text:
        # An option whose label contains "--" is treated as the header of a
        # filter; the second word of that label (lower-cased) is the key.
        option_type = option.text.strip("- ").split(" ")[1].lower()
        if prev_option and prev_option != option_type:
            # A different filter starts here, so store the one just finished.
            filter_results[prev_option] = option_values
            option_values = []
        prev_option = option_type
    else:
        option_values.append((option.get("value"), option.text.strip("\xa0").title()))

# Groups are only stored when the NEXT header is reached, so the last group
# would otherwise be dropped. setdefault() adds it without ever replacing a
# group that was already stored under the same key.
if prev_option:
    filter_results.setdefault(prev_option, option_values)

In [5]:
# Keep only the location entries whose option value is at most two
# characters long; longer values are the regions being removed here.
country_entries = [
    entry for entry in filter_results.get('location') if len(entry[0]) <= 2
]
filter_results['location'] = country_entries
pprint(filter_results)

{'course': [('100', 'Recreational'),
            ('53', 'Discover Diving'),
            ('54', 'Recreational Supervised Diver'),
            ('21', 'Recreational Diver 1 - Nitrox'),
            ('44', 'Recreational Diver 2 - Triox'),
            ('35', 'Recreational Diver 3 - Trimix'),
            ('200', 'Foundational'),
            ('1', 'Gue Fundamentals'),
            ('34', 'Doubles Diver'),
            ('33', 'Drysuit Diver'),
            ('31', 'Dpv Diver 1'),
            ('45', 'Dpv Cave'),
            ('41', 'Documentation Diver'),
            ('57', 'Photogrammetry Diver'),
            ('51', 'Scientific Diver'),
            ('43', 'Gas Blender'),
            ('9', 'Triox Primer'),
            ('62', 'Navigation Primer'),
            ('61', 'Rescue Primer'),
            ('300', 'Cave'),
            ('2', 'Cave Diver 1'),
            ('3', 'Cave Diver 2'),
            ('46', 'Underwater Cave Survey'),
            ('59', 'Cave Sidemount'),
            ('65', 'Ccr Cave'),
      

In [6]:
# Retrieve Instructors by Countries
REQUEST_TIMEOUT_SECONDS = 30


def fetch_country_instructors(country_code):
    """Return the title-cased instructor names listed for one country code.

    Requests the instructor listing with ``?country=<country_code>`` and
    collects the text of every ``<b><a href="/diver-training/...">`` link
    found inside the ``Content`` div.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        ValueError: if the page has no ``Content`` div to search.
    """
    response = requests.get(
        GUE_INSTRUCTORS_URL,
        params={"country": country_code},  # let requests build/encode the query
        timeout=REQUEST_TIMEOUT_SECONDS,   # never hang on a stalled connection
    )
    response.raise_for_status()

    # Local names on purpose: the notebook-level `page`, `soup` and
    # `instructor_container` from the earlier cells are left untouched.
    country_soup = bs(response.content, 'html.parser')
    container = country_soup.find(name='div', attrs={"id": "Content"})
    if container is None:
        raise ValueError(f"No 'Content' div found for country {country_code!r}")

    names = []
    for bold_element in container.find_all('b'):
        link = bold_element.find("a")
        # An <a> without an href yields None; skip it instead of crashing.
        href = link.get("href") if link is not None else None
        if href and href.startswith("/diver-training/"):
            names.append(link.text.strip("\xa0").title())
    return names


countries = filter_results.get("location")
country_instructor_dict = {}

for code, country_name in countries:
    country_instructor_dict[code] = fetch_country_instructors(code)
pprint(country_instructor_dict)

{'AE': ['Dimitris Fifis', 'Dorota Czerny', 'Jesper Kjøller', 'Mark Devoldere'],
 'AN': ['German Arango'],
 'AO': [],
 'AR': ['Belen Andres-Garcia', 'Guillermo Riesco', 'Raul Alvarez Garcia'],
 'AT': ['Christina Müller',
        'Oliver Reimer',
        'Osama Gobara',
        'Sebastian Von Koss'],
 'AU': ['Duncan Paterson',
        'Kieran Hussey',
        'Liam Allen',
        'Rhys Toone',
        'Ryan Booker'],
 'BE': ['Jérôme Descamps',
        'Mark Devoldere',
        'Martijn Blommaert',
        'Richard Van De Logt',
        'Sander Evering'],
 'BR': ['Ricardo Constantino', 'Sergio R Schirato'],
 'CA': ['Alain Eid',
        'Dany Dulac',
        'Guy Shockey',
        'Heison Chak',
        'Imad Farhat',
        'Jason Cook',
        'Martin Lessard',
        'Michael Pinault',
        'Steve Blanchard'],
 'CH': ['Caterina De Seta',
        'Irene Homberger',
        'Ivan Wagner',
        'Maximilian Fahr',
        'Oliver Ober',
        'Richard Spikings',
        'Romano 

## Build Datasets

In [7]:
# Each "instructor" entry is a (value, label) pair; keep just the label.
instructor_names = [label for _value, label in filter_results.get("instructor")]
instructor_names

['Abell Yu Zhang',
 'Ahmad Yaqoub Alhusaini',
 'Ahmed Hagi',
 'Alain Eid',
 'Alberto Nava',
 'Alessandro Vezzani',
 'Andrea Marassich',
 'Andrea Milani',
 'Andrei Dmitriev',
 'Andrew Couch',
 'Anne-Marie Bresser',
 'Annika Persson',
 'Arthur Nguyen-Kim',
 'Bartek Trzcinski',
 'Belen Andres-Garcia',
 'Benjamin Ott',
 'Bob Sherwood',
 'Bruno Borelli',
 'Bruno Espinosa',
 'Cai Keke',
 'Caterina De Seta',
 'Charles Han',
 'Chee Hoon Ong',
 'Christian Höing',
 'Christina Müller',
 'Christof Müller',
 'Christophe Le Maillot',
 'Claudia Haltern',
 'Claudio Provenzani',
 'Daniel Riordan',
 'Daniel Schulte',
 'Dany Dulac',
 'David Dusek',
 'David Rhea',
 'David Sinwoo Lee',
 'David Watson',
 'Derk Remmers',
 'Dimitris Fifis',
 'Diogo Paulo',
 'Dorota Czerny',
 'Douglas Mudry',
 'Duncan Paterson',
 'Edward Hayes',
 'Emőke Wagner',
 'Erik Wurz',
 'Errol Kalayci',
 'Esther Trösch',
 'Fabio Portella',
 'Ferry Schram',
 'Francesco Cameli',
 'Fred Devos',
 'Gemma Thomas',
 'German Arango',
 'Gideon L

In [8]:
# One-column frame holding every instructor name from the filter list.
df_instructors = pd.DataFrame({'instructor': instructor_names})
df_instructors

Unnamed: 0,instructor
0,Abell Yu Zhang
1,Ahmad Yaqoub Alhusaini
2,Ahmed Hagi
3,Alain Eid
4,Alberto Nava
...,...
172,Xuan Chen
173,Yanbin Wu
174,Youngwoo Seo
175,Yousuke Ogiwara


In [9]:
# Invert {country code: [instructor, ...]} into {instructor: "CC, CC, "}:
# every country an instructor appears under is appended to their string.
rev_country_instructor_dict = {}
for country_code, instructors_in_country in country_instructor_dict.items():
    suffix = f"{country_code}, "
    for instructor in instructors_in_country:
        seen_so_far = rev_country_instructor_dict.get(instructor, "")
        rev_country_instructor_dict[instructor] = seen_so_far + suffix

df_instructor_countries = pd.DataFrame(
    rev_country_instructor_dict.items(),
    columns=['instructor', 'countries'],
)
# Drop the trailing ", " left behind by the accumulation above.
trimmed_countries = df_instructor_countries['countries'].str.rstrip(', ')
df_instructor_countries['countries'] = trimmed_countries
df_instructor_countries.sample(3)

Unnamed: 0,instructor,countries
41,Charles Han,US
23,Alessandro Vezzani,"MX, IT"
151,Anne-Marie Bresser,"NL, SE"


In [10]:
# Instructors present in the filter list but not found under any country.
set(df_instructors['instructor']).difference(df_instructor_countries['instructor'])

{'Gue Hq', 'Lereyce Josephs'}

Upon further checking, neither `Gue Hq` nor `Lereyce Josephs` has a resume on GUE.com as of 6 Nov 2023.

In [11]:
# Map Country codes to Country Names
country_code_enum = dict(filter_results.get('location'))


def _codes_to_names(codes):
    """Translate a ", "-separated string of country codes into country names.

    Tokens that are not keys of ``country_code_enum`` are kept as they are.
    This matters because the cell overwrites the ``countries`` column in
    place: on a second run the column already holds names, and a plain
    ``.get(token)`` would return ``None`` and make ``", ".join`` raise
    ``TypeError``. With the fallback, re-running the cell is a no-op.
    """
    return ", ".join(country_code_enum.get(token, token) for token in codes.split(", "))


df_instructor_countries['countries'] = df_instructor_countries['countries'].apply(_codes_to_names)
df_instructor_countries.sample(3)

Unnamed: 0,instructor,countries
93,Youngwoo Seo,"Korea (South), Philippines"
40,Bob Sherwood,United States
105,Oliver Reimer,"Austria, Germany"


In [12]:
# Write the instructor/countries table to CSV, leaving out the row index.
df_instructor_countries.to_csv(
    path_or_buf='../data/instructors_countries.csv',
    index=False,
)