In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

In [2]:
# Article page that hosts the penguin tables scraped below.
url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5082682/#pone.0164025.ref029"
# A timeout keeps the notebook from hanging forever if the server never answers.
resp = requests.get(url, timeout=30)

In [3]:
resp.status_code

200

In [4]:
html = resp.text

In [5]:
# Parse straight from resp.text (not from `html` itself) so re-running this cell
# is idempotent, and name the parser explicitly so the result does not depend on
# which optional parsers happen to be installed.
html = BeautifulSoup(resp.text, "html.parser")

In [6]:
# Raw string: the backslashes are CSS escapes for the dots in the element id,
# not Python escapes ("\." in a normal string is an invalid escape sequence).
html.select(r"#pone\.0164025\.t002 > div.xtable thead tr th")

[<th align="left" colspan="1" rowspan="1">Season</th>,
 <th align="center" colspan="1" rowspan="1">Adélie breeding pairs</th>,
 <th align="center" colspan="1" rowspan="1">Date of pair count</th>,
 <th align="center" colspan="1" rowspan="1">Chinstrap breeding pairs</th>,
 <th align="center" colspan="1" rowspan="1">Date of pair count</th>,
 <th align="center" colspan="1" rowspan="1">Gentoo breeding pairs</th>,
 <th align="center" colspan="1" rowspan="1">Date of pair count</th>]

In [7]:
# Header cells of table 2; the raw string keeps the CSS-escaped dots (\.) intact
# without triggering Python's invalid-escape warning.
header_cells = html.select(r"#pone\.0164025\.t002 > div.xtable thead tr th")
column_names = [cell.get_text() for cell in header_cells]
column_names

['Season',
 'Adélie breeding pairs',
 'Date of pair count',
 'Chinstrap breeding pairs',
 'Date of pair count',
 'Gentoo breeding pairs',
 'Date of pair count']

In [8]:
#html.select("#pone\.0164025\.t002 > div.xtable tbody tr td")

In [9]:
# One list of whitespace-stripped cell strings per <tr> in table 2's body.
# Raw string: the backslashes are CSS escapes for the dots in the element id.
body_rows = html.select(r"#pone\.0164025\.t002 > div.xtable tbody tr")
data = [[cell.get_text(strip=True) for cell in row.select("td")] for row in body_rows]
data[0]

['1978/79',
 '1873',
 '5 Dec.1978',
 '2050',
 '14 Dec. 1978',
 '370',
 '26 Nov. 1978']

In [10]:
df = pd.DataFrame(data, columns=column_names)

In [11]:
df.head()

Unnamed: 0,Season,Adélie breeding pairs,Date of pair count,Chinstrap breeding pairs,Date of pair count.1,Gentoo breeding pairs,Date of pair count.2
0,1978/79,1873,5 Dec.1978,2050,14 Dec. 1978,370,26 Nov. 1978
1,1979/80,2269,16 Nov. 1979,2253,08 Dec. 1979,303,19 Nov. 1979
2,1980/81,1726,11 Nov. 1980,1809,29 Dec. 1980,330,11 Nov. 1980
3,1981/82,1831,22 Nov. 1981,2250,25 Dec. 1981,341,22 Nov. 1981
4,1982/83,2631,19 Nov. 1982,2334,04 Dec. 1982,299,21 Nov. 1982


In [12]:
df.dtypes

Season                      object
Adélie breeding pairs       object
Date of pair count          object
Chinstrap breeding pairs    object
Date of pair count          object
Gentoo breeding pairs       object
Date of pair count          object
dtype: object

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

def get_table(number, no_data_value=0.6):
    """Scrape table 2 or table 3 of the PMC5082682 article into a cleaned DataFrame.

    The page is downloaded on every call. The table's header cells become the
    column names, the columns at positions 2, 4 and 6 are discarded, ``Season``
    (e.g. ``"1978/79"``) is reduced to its starting year as an int, and the
    columns at positions 1, 3 and 5 are converted to float.

    Parameters
    ----------
    number : int
        Which table to fetch: 2 or 3.
    no_data_value : float, optional
        Value substituted for cells whose text is exactly ``"No data"``.
        Defaults to 0.6, the placeholder this notebook has always used; pass
        ``float("nan")`` to keep missing values visible as NaN instead.

    Returns
    -------
    pandas.DataFrame or str
        The cleaned table, or the message
        ``"Working with just two tables, 2 or 3"`` when ``number`` is not 2 or 3.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the table is missing from the page or has fewer than 7 header cells.
    ValueError
        If a numeric cell holds text that is neither a number nor ``"No data"``.
    """
    if number not in (2, 3):
        return "Working with just two tables, 2 or 3"

    # Request the article page; the timeout prevents an indefinite hang and
    # raise_for_status stops an error page from being parsed as if it were data.
    url = "https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5082682/#pone.0164025.ref029"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Explicit parser: otherwise bs4 picks whichever one is installed.
    html = BeautifulSoup(resp.text, "html.parser")

    # "#pone\.0164025\.t00N" is the per-table selector prefix. Raw f-string: the
    # backslashes are CSS escapes for the dots in the id, not Python escapes.
    # int() keeps a float such as 2.0 from producing "t002.0".
    table_link = rf"#pone\.0164025\.t00{int(number)}"

    # Column names come from the table's header cells.
    column_names = [
        cell.get_text()
        for cell in html.select(f"{table_link} > div.xtable thead tr th")
    ]
    if len(column_names) < 7:
        # Same exception type the positional indexing below would raise,
        # but with a message that says what actually went wrong.
        raise IndexError(
            f"table {number}: expected at least 7 header cells, "
            f"found {len(column_names)}"
        )

    # One list of stripped cell strings per body row.
    rows = [
        [cell.get_text(strip=True) for cell in row.select("td")]
        for row in html.select(f"{table_link} > div.xtable tbody tr")
    ]
    data = pd.DataFrame(rows, columns=column_names)

    def format_value(value):
        """Convert a cell string to float, mapping "No data" to no_data_value."""
        if value == "No data":
            return float(no_data_value)
        return float(value)

    # Discard columns 2, 4 and 6 by position: in table 2 they all share the
    # label "Date of pair count", so selecting by label would be ambiguous.
    discarded = (2, 4, 6)
    kept = [pos for pos in range(len(column_names)) if pos not in discarded]
    data = data.iloc[:, kept].copy()

    # "1978/79" -> 1978
    data["Season"] = data["Season"].apply(lambda x: int(x.split("/")[0]))
    for n in (1, 3, 5):
        data[column_names[n]] = data[column_names[n]].apply(format_value)

    return data

In [14]:
table2 = get_table(2)

In [15]:
table2.head()

Unnamed: 0,Season,Adélie breeding pairs,Chinstrap breeding pairs,Gentoo breeding pairs
0,1978,1873.0,2050.0,370.0
1,1979,2269.0,2253.0,303.0
2,1980,1726.0,1809.0,330.0
3,1981,1831.0,2250.0,341.0
4,1982,2631.0,2334.0,299.0


In [16]:
table2.dtypes

Season                        int64
Adélie breeding pairs       float64
Chinstrap breeding pairs    float64
Gentoo breeding pairs       float64
dtype: object

In [17]:
table2.isna().sum()

Season                      0
Adélie breeding pairs       0
Chinstrap breeding pairs    0
Gentoo breeding pairs       0
dtype: int64

In [19]:
table3 = get_table(3)

In [20]:
table3.head()

Unnamed: 0,Season,Adélie chicks fledged per pair,Chinstrap chicks fledged per pair,Gentoo chicks fledged per pair
0,1978,0.6,1.27,0.6
1,1979,0.5,0.24,0.63
2,1980,0.86,0.05,0.44
3,1981,1.05,0.72,1.51
4,1982,0.73,0.74,1.07


In [21]:
table3.dtypes

Season                                 int64
Adélie chicks fledged per pair       float64
Chinstrap chicks fledged per pair    float64
Gentoo chicks fledged per pair       float64
dtype: object

In [22]:
table3.isna().sum()

Season                               0
Adélie chicks fledged per pair       0
Chinstrap chicks fledged per pair    0
Gentoo chicks fledged per pair       0
dtype: int64

In [23]:
# Persist both cleaned tables as CSV files under ../data/.
# NOTE(review): assumes the ../data directory already exists relative to the
# notebook's working directory — confirm, or create it beforehand.
# NOTE(review): no `index=` argument is given, so pandas' default applies; the
# row index is presumably written as an unnamed first column — confirm that
# downstream readers expect it before changing.
table2.to_csv('../data/breeding_pairs.csv')
table3.to_csv('../data/chicks_pairs.csv')