## This is a file to test out web scraping the 2023 OHW projects page and return a csv

Import necessary packages:

In [1]:
import requests # used to get the data from the url
from bs4 import BeautifulSoup # used to convert url request data to a readable and workable format
import re # used to create a regex function for splitting strings (for help with regex visit: https://regexr.com/)
import pandas as pd # used for many data science/analytical things but in this case, used to make a neat csv 

##### Get the content from the above url

In [2]:
def get_url_content(url: str):
    # Making a GET request
    r = requests.get(url)
    # use BeautifulSoup to make it pretty
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup

- `projects_html_split` is created from `projects_html` where we split the whole string using the regular expression (regex) `[<>]` which splits on all characters between the `[]`. The `filter` function with `None` as the first parameter filters out any list objects that are `None` or empty strings. Finally, we convert the filter result to a list.
- Next create an empty list called `projects_list`
- Now a `for` loop is used to loop through each `string` in `projects_html_split`
    - If the first character of the string (`string[0]`) is a digit (in this case all project names start with #.), proceed
    - Then strip out those numbers using a regex formula `[\d.]`. `\d` matches any digit character (0-9)
    - Now append the new string to the `projects_list` list

In [3]:
def get_projects_list(soup, tag):
    projects_html = soup.find_all(tag)
    projects_html_split = list(filter(None, re.split(r"[<>]", str(projects_html))))
    projects_list = []
    try:
        for string in projects_html_split:
            if string[0].isdigit():
                stripped_string = re.sub(r'[\d.]', '', str(string)).strip()
                projects_list.append(stripped_string)
            elif string[0:8] == 'Project:':
                projects_list.append(string)
        if projects_list == []:
            raise Exception('resulting list is empty')
    except Exception:
        regex = re.compile(r'\b[A-Z]')
        for string in projects_html_split:
            if regex.match(string):
                projects_list.append(string)
    return projects_list

- `github_urls_html_split` is created from `github_urls_html` where we split the whole string using the regular expression (regex) `["]` which splits on all characters between the `[]`. The `filter` function with `None` as the first parameter filters out any list objects that are `None` or empty strings. Finally, we convert the filter result to a list.
- Next create an empty list called `github_urls_list`
- Now a `for` loop is used to loop through each `string` in `github_urls_html_split`
    - If the string `https://github.com` is contained with the string we are looking at, proceed 
    - Now append the new string to the `github_urls_list` list

In [4]:
def get_github_url_list(soup, tag, flag=None, **kwargs):
    if len(kwargs.items()) != 0:
        for key, value in kwargs.items():
            if key == 'tag2':
                github_urls_html = soup.find_all(tag, value)
    else:
        github_urls_html = soup.find_all(tag)
    github_urls_html_split = list(filter(None, re.split(r'["<>]', str(github_urls_html))))
    github_urls_list = []
    for string in github_urls_html_split:
        if 'https://github.com' in string:
            if 'proj' in string or 'ohw18' in string:
                github_urls_list.append(string)
                # now remove any duplicates
                github_urls_list_final = []
                [github_urls_list_final.append(x) for x in github_urls_list if x not in github_urls_list_final]
    if github_urls_list_final == []:
        raise Exception('resulting list is empty')
    return github_urls_list_final

In [5]:
url_2023 = 'https://oceanhackweek.org/ohw23/projects/projects_thisyear'
url_2022 = 'https://oceanhackweek.org/ohw22/projects/projects_thisyear.html'
url_2021 = 'https://oceanhackweek.org/ohw-resources/projects/projectlist/'
url_2020 = 'https://oceanhackweek.org/ohw21/projects_2020.html'
url_2019 = 'https://oceanhackweek.org/ohw19/projects_2019.html'
url_2018 = 'https://oceanhackweek.org/ohw2018/projects.html'

In [6]:
soup_2023 = get_url_content(url_2023)
soup_2022 = get_url_content(url_2022)
soup_2021 = get_url_content(url_2021)
soup_2020 = get_url_content(url_2020)
soup_2019 = get_url_content(url_2019)
soup_2018 = get_url_content(url_2018)

In [7]:
projects_list_2023 = get_projects_list(soup_2023, tag = 'h3')
projects_list_2022 = get_projects_list(soup_2022, tag = 'h2')
projects_list_2021 = get_projects_list(soup_2021, tag = 'h2')
projects_list_2020 = get_projects_list(soup_2020, tag = 'h3')
projects_list_2019 = get_projects_list(soup_2019, tag = 'h3')
projects_list_2018 = get_projects_list(soup_2018, tag = 'p')
len(projects_list_2023 + projects_list_2022 + projects_list_2021 + projects_list_2020 + projects_list_2019 + projects_list_2018)

72

In [8]:
github_urls_list_2023 = get_github_url_list(soup_2023, tag="a", tag2="github reference external")
print(len(github_urls_list_2023))
print(len(projects_list_2023))

11
11


In [9]:
github_urls_list_2022 = get_github_url_list(soup_2022, tag="a", tag2="github reference external")
print(len(github_urls_list_2022))
print(len(projects_list_2022))
# Need to fill in a few gaps
# Firstly project #8 doesn't have a linked github repo
# Secondly github link for #20 https://github.com/oceanhackweek/ohw22-proj-video-data-processing
github_urls_list_2022.insert(7, 'None')
github_urls_list_2022.append('https://github.com/oceanhackweek/ohw22-proj-video-data-processing')
print(len(github_urls_list_2022))
print(len(projects_list_2022))

18
20
20
20


In [10]:
github_urls_list_2021 = get_github_url_list(soup_2021, tag="a")
print(len(github_urls_list_2021))
print(len(projects_list_2021))
# the first url isn't a project link
github_urls_list_2021.remove(github_urls_list_2021[0])
# Secondly github link for Pull/Hack all ocean data repositories into a global searchable resource is https://github.com/oceanhackweek/metadata-repository
github_urls_list_2021.insert(4, 'https://github.com/oceanhackweek/metadata-repository')
print(len(github_urls_list_2021))
print(len(projects_list_2021))

11
11
11
11


In [11]:
github_urls_list_2020 = get_github_url_list(soup_2020, tag="a")
print(len(github_urls_list_2020))
print(len(projects_list_2020))
# github link for Project: Co-locators expansion is https://github.com/ioos/colocate
github_urls_list_2020.append('https://github.com/ioos/colocate')
print(len(github_urls_list_2020))
print(len(projects_list_2020))

7
8
8
8


In [12]:
github_urls_list_2019 = get_github_url_list(soup_2019, tag="a")
print(len(github_urls_list_2019))
print(len(projects_list_2019))
# github link for Project: Isopy is https://github.com/oceanhackweek/DataAccess/tree/master/isopy
# Project: Modeling Volcano Deformation at Axial Seamount doesn't have a linked github repo
# github link for Project: Working with Chlorophyll Data from the Cloud is https://github.com/oceanhackweek/DataAccess/tree/master/Chlorophyll
# Project: Amazon Fires doesn't have a linked github repo
github_urls_list_2019.insert(2, 'https://github.com/oceanhackweek/DataAccess/tree/master/isopy')
github_urls_list_2019.insert(6, 'None')
github_urls_list_2019.append('https://github.com/oceanhackweek/DataAccess/tree/master/Chlorophyll')
github_urls_list_2019.append('None')
print(len(github_urls_list_2019))
print(len(projects_list_2019))

7
11
11
11


In [13]:
github_urls_list_2018 = get_github_url_list(soup_2018, tag="a")
print(len(github_urls_list_2018))
print(len(projects_list_2018))
# github link for Project: Mussel Beach is https://github.com/oceanhackweek/ohw2018_musselbeach
# github link for Project: LTER Visualization is https://github.com/oceanhackweek/ohw_lter_vis
# github link for Project: OOI Data Validation is https://github.com/oceanhackweek/ohw2018_Data_Validation
github_urls_list_2018.insert(4, 'https://github.com/oceanhackweek/ohw2018_musselbeach')
github_urls_list_2018.insert(7, 'https://github.com/oceanhackweek/ohw_lter_vis')
github_urls_list_2018.insert(8, 'https://github.com/oceanhackweek/ohw2018_Data_Validation')
print(len(github_urls_list_2018))
print(len(projects_list_2018))

8
11
11
11


##### create a dataframe from the two lists

In [14]:
df_2023 = pd.DataFrame(
    {'project': projects_list_2023,
     'github_url': github_urls_list_2023,
     'year': 2023
    })

In [15]:
df_2022 = pd.DataFrame(
    {'project': projects_list_2022,
     'github_url': github_urls_list_2022,
     'year': 2022
    })

In [16]:
df_2021 = pd.DataFrame(
    {'project': projects_list_2021,
     'github_url': github_urls_list_2021,
     'year': 2021
    })

In [17]:
df_2020 = pd.DataFrame(
    {'project': projects_list_2020,
     'github_url': github_urls_list_2020,
     'year': 2020
    })

In [18]:
df_2019 = pd.DataFrame(
    {'project': projects_list_2019,
     'github_url': github_urls_list_2019,
     'year': 2019
    })

In [19]:
df_2018 = pd.DataFrame(
    {'project': projects_list_2018,
     'github_url': github_urls_list_2018,
     'year': 2018
    })

In [20]:
df_list = [df_2023, df_2022, df_2021, df_2020, df_2019, df_2018]

In [21]:
df = pd.concat(df_list)

In [22]:
df['md'] = '[' + df['project'] + ']' + '(' + df['github_url'] + ')'

In [23]:
df

Unnamed: 0,project,github_url,year,md
0,Oil spill monitoring: segmentation of satellit...,https://github.com/oceanhackweek/ohw23_proj_oil,2023,[Oil spill monitoring: segmentation of satelli...
1,Marine species distribution modeling tutorial:...,https://github.com/oceanhackweek/ohw23_proj_ma...,2023,[Marine species distribution modeling tutorial...
2,Inertial oscillations in the marginal ice zone,https://github.com/oceanhackweek/ohw23_proj_se...,2023,[Inertial oscillations in the marginal ice zon...
3,Machine learning for Argo Data QC,https://github.com/oceanhackweek/ohw23_proj_ar...,2023,[Machine learning for Argo Data QC](https://gi...
4,Benthic habitat mapping (image processing/seab...,https://github.com/oceanhackweek/ohw23-proj-ha...,2023,[Benthic habitat mapping (image processing/sea...
...,...,...,...,...
6,Project: Profiles,https://github.com/oceanhackweek/ohw18_profiles,2018,[Project: Profiles](https://github.com/oceanh...
7,Project: LTER Visualization,https://github.com/oceanhackweek/ohw_lter_vis,2018,[Project: LTER Visualization](https://github....
8,Project: OOI Data Validation,https://github.com/oceanhackweek/ohw2018_Data_...,2018,[Project: OOI Data Validation](https://github...
9,Project: Shallow Profiler Motion,https://github.com/oceanhackweek/ohw18_shallow...,2018,[Project: Shallow Profiler Motion](https://git...


##### write out a csv from that dataframe

In [24]:
df.to_csv('project_list_final.csv', index=False)