In [None]:
import pandas as pd

# One CSV download URL per penguin species; each entry is handed to
# read_data() below, and the three results are concatenated in this order.
uris = [
    # Adelie
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/219/5/002f3893385f710df69eeebe893144ff",
    # Gentoo
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/220/7/e03b43c924f226486f2f0ab6709d2381",
    # Chinstrap
    "https://pasta.lternet.edu/package/data/eml/knb-lter-pal/221/8/fe853aa8f7a59aa84cdd3197619ef462",
]

def read_data(uri):
    """Load one penguin CSV and normalise its column names.

    Parameters
    ----------
    uri : str, path or file-like
        Source handed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Only the ten columns listed in the mapping below are kept. Cells
        containing "." are read as missing values, and the six columns whose
        source names contain spaces or units are renamed to underscore-style
        names; the remaining four keep their original names.
    """
    # Source column name -> output column name.
    column_names = {
        "Species": "Species",                        # penguin species
        "Island": "Island",                          # island where the data was collected
        "Individual ID": "Individual_ID",            # individual identification number
        "Date Egg": "Date_Egg",                      # survey date
        "Culmen Length (mm)": "Culmen_Length",       # bill length
        "Culmen Depth (mm)": "Culmen_Depth",         # bill depth
        "Flipper Length (mm)": "Flipper_Length",     # flipper length
        "Body Mass (g)": "Body_Mass",                # body mass
        "Sex": "Sex",                                # sex
        "Comments": "Comments",                      # free-text comments
    }
    raw = pd.read_csv(
        uri,
        sep=",",
        quotechar='"',
        usecols=list(column_names),
        na_values=".",
    )
    return raw.rename(columns=column_names)


# Fetch every species file in list order and stack them into one frame;
# ignore_index=True gives the combined rows a single continuous 0..n-1 index.
df = pd.concat(map(read_data, uris), ignore_index=True)
df.head()

Unnamed: 0,Species,Island,Individual_ID,Date_Egg,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments
0,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A1,2007-11-11,39.1,18.7,181.0,3750.0,MALE,Not enough blood for isotopes.
1,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A2,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,
2,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A1,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,
3,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A2,2007-11-16,,,,,,Adult not sampled.
4,Adelie Penguin (Pygoscelis adeliae),Torgersen,N3A1,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,


In [6]:
# Show the last rows; the Chinstrap file is last in `uris`, so its records
# should close out the concatenated frame.
df.tail()

Unnamed: 0,Species,Island,Individual_ID,Date_Egg,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments
339,Chinstrap penguin (Pygoscelis antarctica),Dream,N98A2,2009-11-19,55.8,19.8,207.0,4000.0,MALE,
340,Chinstrap penguin (Pygoscelis antarctica),Dream,N99A1,2009-11-21,43.5,18.1,202.0,3400.0,FEMALE,Nest never observed with full clutch.
341,Chinstrap penguin (Pygoscelis antarctica),Dream,N99A2,2009-11-21,49.6,18.2,193.0,3775.0,MALE,Nest never observed with full clutch.
342,Chinstrap penguin (Pygoscelis antarctica),Dream,N100A1,2009-11-21,50.8,19.0,210.0,4100.0,MALE,
343,Chinstrap penguin (Pygoscelis antarctica),Dream,N100A2,2009-11-21,50.2,18.7,198.0,3775.0,FEMALE,


In [4]:
# (rows, columns) of the concatenated frame.
df.shape

(344, 10)

In [5]:
# Column dtypes as inferred by read_csv, before any explicit conversion.
df.dtypes

Species            object
Island             object
Individual_ID      object
Date_Egg           object
Culmen_Length     float64
Culmen_Depth      float64
Flipper_Length    float64
Body_Mass         float64
Sex                object
Comments           object
dtype: object

In [8]:
# Turn the date strings (YYYY-MM-DD) into real timestamps.
df["Date_Egg"] = pd.to_datetime(df["Date_Egg"], format="%Y-%m-%d")

# Store the repeated label columns as categoricals.
df["Species"] = df["Species"].astype("category")
df["Island"] = df["Island"].astype("category")
df["Sex"] = df["Sex"].astype("category")

df.head()

Unnamed: 0,Species,Island,Individual_ID,Date_Egg,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments
0,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A1,2007-11-11,39.1,18.7,181.0,3750.0,MALE,Not enough blood for isotopes.
1,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A2,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,
2,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A1,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,
3,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A2,2007-11-16,,,,,,Adult not sampled.
4,Adelie Penguin (Pygoscelis adeliae),Torgersen,N3A1,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,


In [9]:
# Re-check dtypes after the datetime/category conversions above.
df.dtypes

Species                 category
Island                  category
Individual_ID             object
Date_Egg          datetime64[ns]
Culmen_Length            float64
Culmen_Depth             float64
Flipper_Length           float64
Body_Mass                float64
Sex                     category
Comments                  object
dtype: object

In [11]:
import re

# Derive the short common name: the leading run of word characters in the
# full species label (e.g. "Adelie"). The vectorised string accessor replaces
# a row-wise regex lambda, so a label with no leading word characters yields
# NaN instead of raising TypeError from indexing a failed match, and the new
# column is created as a categorical in a single step.
df["Species_Short"] = (
    df["Species"].str.extract(r"^(\w+)", expand=False).astype("category")
)
df.head()

Unnamed: 0,Species,Island,Individual_ID,Date_Egg,Culmen_Length,Culmen_Depth,Flipper_Length,Body_Mass,Sex,Comments,Species_Short
0,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A1,2007-11-11,39.1,18.7,181.0,3750.0,MALE,Not enough blood for isotopes.,Adelie
1,Adelie Penguin (Pygoscelis adeliae),Torgersen,N1A2,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,,Adelie
2,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A1,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,,Adelie
3,Adelie Penguin (Pygoscelis adeliae),Torgersen,N2A2,2007-11-16,,,,,,Adult not sampled.,Adelie
4,Adelie Penguin (Pygoscelis adeliae),Torgersen,N3A1,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,,Adelie


In [12]:
# Re-check dtypes now that Species_Short has been added.
df.dtypes

Species                 category
Island                  category
Individual_ID             object
Date_Egg          datetime64[ns]
Culmen_Length            float64
Culmen_Depth             float64
Flipper_Length           float64
Body_Mass                float64
Sex                     category
Comments                  object
Species_Short           category
dtype: object

In [14]:
# Persist the cleaned frame as a Parquet file.
# NOTE(review): the path is relative to the notebook's working directory and
# presumably requires ../result to exist already — confirm before running on
# a fresh checkout.
df.to_parquet("../result/penguins.parquet")