**Purpose**: Extract people and project data from the Open Life Science website repository on GitHub into CSV files

In [1]:
# Number of the latest OLS cohort: cohorts 1..actual_cohort (inclusive) are processed below
actual_cohort = 6

In [2]:
import base64
import copy
import datetime
import os
from pathlib import Path

import pandas as pd
import yaml
from github import Github

Usage:
- Generate a personal access token on GitHub (Settings → Developer settings → Personal access tokens)
- Add it to `../config.yml` after `github:`, or leave that entry empty and provide the token through the `GITHUB_TOKEN` environment variable



In [3]:
# Get a GitHub token: prefer the `github` entry of ../config.yml and fall back
# to the GITHUB_TOKEN environment variable when that entry is missing or empty.
with open("../config.yml", "r") as stream:
    config = yaml.load(stream, Loader=yaml.FullLoader)
# An empty config file is parsed as None and the `github` key may be absent,
# so look the value up defensively instead of indexing.
token = (config or {}).get('github')
if not token:
    token = os.environ.get('GITHUB_TOKEN', None)

In [4]:
# Open a GitHub API session with the token resolved above
# (the remaining API quota can be inspected with g.get_rate_limit())
g = Github(token)
# Handle on the repository that hosts the website sources
repo = (
    g.get_user("open-life-science")
    .get_repo("open-life-science.github.io")
)

Extraction date

In [5]:
# timestamp of this extraction run (now() without a tz argument: naive local time)
str(datetime.datetime.now())

'2023-01-23 17:22:14.176786'

# Get people information

In [6]:
def read_yaml_file(fp, ref):
    '''
    Fetch a YAML file from the website repository at a given git reference
    and parse it

    Relies on the notebook-level `repo` object.

    :param fp: path to the file inside the GitHub repository
    :param ref: name of the commit/branch/tag to read the file from

    :return: parsed YAML content (whatever Python structure the document
        describes, e.g. a dictionary or a list)
    '''
    # the API hands the file body back base64-encoded
    encoded = repo.get_contents(fp, ref=ref).content
    raw_yaml = base64.b64decode(encoded)
    # NOTE(review): FullLoader is applied to content fetched from GitHub;
    # yaml.safe_load would be the stricter choice if only plain data is expected
    parsed = yaml.load(raw_yaml, Loader=yaml.FullLoader)
    return parsed

In [7]:
# load the people registry from the `main` branch (used below as a dict keyed by person id)
people = read_yaml_file("_data/people.yaml", "main")

In [8]:
# Keep only the fields wanted in the export and prepare one empty role list
# per cohort for every person.
dropped_fields = (
    'affiliation',
    'bio',
    'orcid',
    'twitter',
    'website',
    'github',
    'title',
    'expertise',
)
for person in people.values():
    for field in dropped_fields:
        # pop with a default: not every entry carries every field
        person.pop(field, None)
    for cohort in range(1, actual_cohort+1):
        person[f'ols-{cohort}'] = []

# Get cohort and project information

In [9]:
def update_people_info(p_list, p_dict, status, cohort_id):
    '''
    Record a status (role) for a list of people in a given cohort

    The status is appended to the `ols-<cohort_id>` list of every known
    person; `p_dict` is modified in place. Ids that are None are skipped
    silently, ids missing from `p_dict` are reported on stdout and skipped.

    :param p_list: list of people id to update (None is treated as an
        empty list)
    :param p_dict: dictionary with people information, keyed by people id
    :param status: status to add
    :param cohort_id: concerned cohort
    '''
    # a key left without a value in the source YAML is loaded as None, not []
    for p in p_list or []:
        if p is None:
            continue
        if p not in p_dict:
            print(f"{p} not found in people")
            continue
        # create the cohort slot on the fly if it was not prepared beforehand
        p_dict[p].setdefault(f'ols-{cohort_id}', []).append(status)

In [10]:
def get_people_names(p_list, p_dict):
    '''
    Get the full names of people from their ids

    The output keeps the order and length of the input: ids that are None
    or missing from `p_dict` give None at the same position (missing ids
    are also reported on stdout).

    :param p_list: list of people id (None is treated as an empty list)
    :param p_dict: dictionary with people information, keyed by people id;
        the 'first-name' and 'last-name' entries of each person are used

    :return: list with one "<first-name> <last-name>" string (or None)
        per id
    '''
    names = []
    # a key left without a value in the source YAML is loaded as None, not []
    for p in p_list or []:
        if p is None:
            names.append(None)
        elif p not in p_dict:
            print(f"{p} not found in people")
            names.append(None)
        else:
            person = p_dict[p]
            names.append(f"{person['first-name']} {person['last-name']}")
    return names

In [11]:
# one record per project, accumulated over all cohorts by the extraction loop
projects = []

In [12]:
# Walk through every cohort and collect (a) the roles of each person per
# cohort, stored in `people`, and (b) one flattened record per project,
# stored in `projects`.
# NOTE: this cell only appends; re-run the cells that initialise `people`
# and `projects` before re-running it, otherwise entries get duplicated.
for i in range(1, actual_cohort+1):
    print(f"OLS {i}")
    # extract experts, facilitators, organizers from metadata
    metadata = read_yaml_file(f"_data/ols-{i}-metadata.yaml", "main")
    update_people_info(metadata['experts'], people, 'expert', i)
    # the facilitators entry may be absent (or empty) in a cohort's metadata
    update_people_info(metadata.get('facilitators') or [], people, 'facilitator', i)
    update_people_info(metadata['organizers'], people, 'organizer', i)
    # extract participants, mentors and project details from projects
    cohort_projects = read_yaml_file(f"_data/ols-{i}-projects.yaml", "main")
    for p in cohort_projects:
        # a key that is absent or left without a value in the YAML (loaded
        # as None) must behave like an empty list
        participants = p.get('participants') or []
        mentors = p.get('mentors') or []
        # update participant and mentor information
        update_people_info(participants, people, 'participant', i)
        update_people_info(mentors, people, 'mentor', i)
        # get project details; shallow copy so that replacing the fields
        # below leaves the loaded project entry untouched
        pr = copy.copy(p)
        pr['participants'] = get_people_names(participants, people)
        pr['mentors'] = get_people_names(mentors, people)
        pr['cohort'] = i
        # always a list (possibly empty) so the export can join it safely
        pr['keywords'] = p.get('keywords') or []
        projects.append(pr)
    # extract speakers from schedule
    schedule = read_yaml_file(f"_data/ols-{i}-schedule.yaml", "main")
    for week in schedule['weeks'].values():
        for c in week['calls']:
            # only cohort calls are scanned for speakers
            if c['type'] != 'Cohort':
                continue
            for r in c.get('resources') or []:
                # a speaker is whoever is attached to a slides resource
                if r['type'] == 'slides' and r.get('speaker') is not None:
                    update_people_info([r['speaker']], people, 'speaker', i)

OLS 1
Demellina not found in people
JasonJWilliamsNY not found in people
OLS 2
natasha_wood not found in people
ekinbolukbasi not found in people
OLS 3
margaret-wanjiku not found in people
alexandra-holinski not found in people
OLS 4
OLS 5
lisanne-walma not found in people
lisanne-walma not found in people
OLS 6


# Export

People information to CSV file

In [13]:
# One row per person; the people id becomes the index
people_df = pd.DataFrame.from_dict(people, orient='index')
# 'expertise' is removed from every entry earlier in the notebook, so only
# the per-cohort role lists have to be flattened into comma-separated text
for cohort in range(1, actual_cohort+1):
    column = f'ols-{cohort}'
    people_df[column] = people_df[column].apply(
        lambda roles: ', '.join(str(role) for role in roles)
    )

In [14]:
# preview the flattened people table
people_df

Unnamed: 0,city,country,first-name,last-name,pronouns,ols-1,ols-2,ols-3,ols-4,ols-5,ols-6
0sahene,Tamale,Ghana,Sitsofe,Morgah,He,,,,,participant,
0x174,Boston,United States,William,Jackson,He/Him,,,participant,,,
abdulelahsm,Dammam,Saudi Arabia,Abdulelah,Al Mesfer,he/him,,,participant,,,
abraham-dabengwa,Johannesburg,South Africa,Abraham,Dabengwa,he/ him,,,,,participant,
abretaud,Rennes,France,Anthony,Bretaudeau,He/him,,,,mentor,,
...,...,...,...,...,...,...,...,...,...,...,...
rhoné-roux,,,Rhoné,Roux,,,,,,,participant
sgsfak,,,Stelios,Sfakianakis,,,,participant,,,
tallmar,,,Marta,Lloret Llinares,,,,,,,expert
umar-farouk-ahmad,,,Umar Farouk,Ahmad,,,,,,,participant


In [15]:
# Write the people table (index = people id) to ../data/people.csv
people_fp = Path('..', 'data', 'people.csv')
people_df.to_csv(people_fp)

Project information to CSV file

In [17]:
# One row per project; list-valued columns are flattened into
# comma-separated text so that they fit in a single CSV cell
project_df = pd.DataFrame(projects)
for list_column in ('participants', 'mentors', 'keywords'):
    project_df[list_column] = project_df[list_column].apply(
        lambda items: ', '.join(str(item) for item in items)
    )
project_df

Unnamed: 0,name,participants,mentors,description,cohort,keywords,status
0,Open Science Community Barcelona (OSCBa),Elisenda Bonet-Carne,Harry Smith,This project aims to create a community of loc...,1,,
1,Investigate feasible solutions to help create ...,Deborah Akuoko,Vicky Nembaware,Most sub Saharan African countries barely have...,1,,
2,Infusing a culture of open science within the ...,Chiara Bertipaglia,Mateusz Kuzak,"The community of <a href=""https://zuckermanins...",1,,graduated
3,Bioinformatics Hub of Kenya (BHK),"Festus Nyasimi, Margaret Wanjiku, David Kiragu...","Toby Hodges, Malvika Sharan",The Bioinformatics Hub of Kenya is an entity t...,1,,graduated
4,Expanding plenoptic Python Package,Billy (William) Broderick,Rodrigo Oliveira Campos,"<a href=""https://pypi.org/project/plenpy/"">ple...",1,,graduated
...,...,...,...,...,...,...,...
171,An extensible notebook for open specimens,Nicky Nicolson,"Andrea Sánchez-Tapia, Batool Almarzouq",This project is developing a prototype “extens...,6,"biodiversity informatics, species description,...",
172,Multiomics profiling and analysis of cardiovas...,Rushda Patel,Hans-Rudolf Hotz,Cardiovascular diseases are the leading cause ...,6,"ethics, neuroscience, open-research, computer ...",
173,The Undergraduates Guide To Research Software ...,Aman Goel,Mariana Meireles,**The Undergraduate's Guide To Research Softwa...,6,"research software engineering, open science, o...",
174,Developing policy briefs on mental well-being ...,Mayya Sundukova,Natalie Banner,,6,,


In [18]:
# Write the project table to ../data/projects.csv
project_fp = Path('..', 'data', 'projects.csv')
project_df.to_csv(project_fp)