**Purpose**: Extract data from website into CSV files

In [1]:
# Highest cohort number to process: the cells below iterate over cohorts 1..actual_cohort (inclusive)
actual_cohort = 6

In [2]:
import base64
import datetime
import os
from pathlib import Path

import pandas as pd
import yaml
from github import Github

Usage:
- Generate a personal access token on GitHub (Settings → Developer settings → Personal access tokens)
- Add it to `../config.yml` after `github:`



In [3]:
# Get a GitHub token: prefer the value stored under `github:` in ../config.yml,
# otherwise fall back to the GITHUB_TOKEN environment variable.
with open("../config.yml", "r") as stream:
    # An empty config file parses to None, so normalise it to an empty mapping.
    # NOTE(review): yaml.safe_load would be sufficient if the config only holds plain values.
    config = yaml.load(stream, Loader=yaml.FullLoader) or {}
# .get() tolerates a config without a `github` entry, and `or` treats an empty
# value as "not set" so the environment variable is consulted in both cases.
# token stays None when neither source provides a value.
token = config.get('github') or os.environ.get('GITHUB_TOKEN')

In [4]:
# connect to GitHub
g = Github(token)
# to get limit of API request: g.get_rate_limit()
# retrieve the hub repository directly by its full "owner/name";
# this avoids the extra API request made by looking up the owner first
repo = g.get_repo("open-life-science/open-life-science.github.io")

Extraction date

In [5]:
# timestamp of this extraction run (naive local time: now() is called without a timezone)
str(datetime.datetime.now())

'2022-09-19 17:02:21.386925'

# Get people information

In [6]:
def read_yaml_file(fp, ref):
    '''
    Fetch a YAML file from the repository at a given git reference and parse it

    Uses the notebook-level ``repo`` object to reach GitHub.

    :param fp: path to file on GitHub
    :param ref: name of the commit/branch/tag

    :return: the parsed YAML document, as built by ``yaml.load``
    '''
    remote_file = repo.get_contents(fp, ref=ref)
    # the payload is base64-decoded before being handed to the YAML parser
    raw_yaml = base64.b64decode(remote_file.content)
    # NOTE(review): FullLoader is applied to remotely fetched content;
    # yaml.safe_load would be enough if the files only hold plain data
    return yaml.load(raw_yaml, Loader=yaml.FullLoader)

In [7]:
people = read_yaml_file("_data/people.yaml", "main")

In [8]:
# drop the attributes that are not exported, then give every person
# one empty list per cohort to collect their statuses
dropped_keys = ('affiliation', 'bio', 'orcid', 'twitter', 'website')
for person in people.values():
    for dropped in dropped_keys:
        # default of None: people lacking the attribute are left untouched
        person.pop(dropped, None)
    person.update({f'ols-{n}': [] for n in range(1, actual_cohort + 1)})

# Get cohort information

In [9]:
def update_people_info(p_list, p_dict, status, cohort_id):
    '''
    Update people attribute for a cohort

    Appends ``status`` to the ``ols-<cohort_id>`` list of every listed person.
    People are processed in order, so entries preceding an unknown id have
    already been updated when the error is raised.

    :param p_list: list of people id to update (``None`` is treated as empty)
    :param p_dict: dictionary with people information, modified in place
    :param status: status to add
    :param cohort_id: concerned cohort

    :raises ValueError: if an id in ``p_list`` is not a key of ``p_dict``
    '''
    # an empty YAML entry is parsed as None rather than as an empty list
    for p in p_list or []:
        if p not in p_dict:
            # f-string so that the message names the id that is actually missing
            raise ValueError(f"{p} not found in people")
        p_dict[p][f'ols-{cohort_id}'].append(status)

In [11]:
# (metadata key, status recorded for each person, whether the key must be present)
role_specs = (
    ('experts', 'expert', True),
    ('facilitators', 'facilitator', False),
    ('organizers', 'organizer', True),
)
for cohort in range(1, actual_cohort + 1):
    print(f"OLS {cohort}")
    # experts, facilitators and organizers are listed in the cohort metadata
    metadata = read_yaml_file(f"_data/ols-{cohort}-metadata.yaml", "main")
    for role_key, role_status, required in role_specs:
        # a required key is indexed unconditionally (KeyError if absent);
        # an optional one is skipped when the metadata does not define it
        if required or role_key in metadata:
            update_people_info(metadata[role_key], people, role_status, cohort)
    # projects (participants, mentors) are loaded but not processed yet
    projects = read_yaml_file(f"_data/ols-{cohort}-projects.yaml", "main")
    # schedule (speakers) is loaded but not processed yet
    schedule = read_yaml_file(f"_data/ols-{cohort}-schedule.yaml", "main")

OLS 1
OLS 2
OLS 3
OLS 4
OLS 5
OLS 6


ValueError: {o} not found in people

# Export

In [13]:
# destination of the export, relative to the notebook location
people_fp = Path('..', 'data', 'people.csv')
# orient='index': each key of `people` becomes one row of the table
people_df = pd.DataFrame.from_dict(people, orient='index')
people_df.to_csv(people_fp)