# PCA biplot analysis of life history traits

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read every input table used by this notebook, in a fixed order, then preview
# the two frames whose first rows are shown below.
csv_paths = [
    "../data/lifehistory_df.csv",
    "../data/basicecodf.csv",
    "../data/species.csv",
    "../data/fooddf.csv",
    "../data/habitatdf.csv",
]
df_life, df_basiceco, df_species, df_food, df_habitat = map(pd.read_csv, csv_paths)

for preview_frame in (df_life, df_basiceco):
    display(preview_frame.head())

Unnamed: 0,species,am,Wwi,Ri,Wwb,Li,ab
0,Symsagittifera_roscoffensis,-2.263097,-2.681943,0.018237,-1.610842,-2.223594,-0.870294
1,Aequipecten_opercularis,-0.059093,-0.275378,1.639591,-2.351822,-0.85488,-1.373181
2,Mimachlamys_varia,-0.400497,-0.622221,1.639591,-2.351822,-0.944853,-1.373181
3,Mytilus_edulis,0.778895,-0.231961,1.392184,-2.764869,-0.464663,-3.043736
4,Panopea_abbreviata,2.507146,0.675057,2.385215,-1.703486,-0.074446,-1.876068


Unnamed: 0,ID,ecozone,gender,reproduction,embryo
0,Symsagittifera_roscoffensis,M,H,O,M
1,Aequipecten_opercularis,M,H,O,M
2,Mimachlamys_varia,M,H,O,M
3,Mytilus_edulis,M,D,O,M
4,Panopea_abbreviata,M,D,O,M


In [3]:
# Two-component PCA on the six life-history trait columns of df_life.
trait_columns = ['am', 'Wwi', 'Ri', 'Wwb', 'Li', 'ab']
features = df_life[trait_columns]

# Rescale the traits with a default StandardScaler before fitting the PCA.
scaler = StandardScaler()
X = scaler.fit_transform(features)

pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)

# Scores as a frame, placed side by side with the species names.
# pd.concat(axis=1) lines rows up by index label.
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
species_column = df_life[['species']]
final_df = pd.concat([species_column, pca_df], axis=1)
final_df.head()

Unnamed: 0,species,PC1,PC2
0,Symsagittifera_roscoffensis,-4.332599,-1.319409
1,Aequipecten_opercularis,-2.418778,1.949067
2,Mimachlamys_varia,-2.773367,1.725362
3,Mytilus_edulis,-2.6023,2.953991
4,Panopea_abbreviata,-0.460463,3.460949


In [4]:
# Share of variance captured by each retained component, as percentages.
# Note: despite its name, `explained_variance` holds the *ratio* attribute.
explained_variance = pca.explained_variance_ratio_

pc1_var, pc2_var = (explained_variance[k] * 100 for k in (0, 1))

for label, pct in (("PC1", pc1_var), ("PC2", pc2_var)):
    print(f"{label}: {pct:.2f}%")

PC1: 47.39%
PC2: 37.59%


In [5]:
# Attach the raw trait values plus the ecological and taxonomic descriptors to
# the PCA scores. Column order below is the order they appear in final_df.
# NOTE(review): every Series is aligned to final_df by row index, not by a
# species key — this presumes all five CSVs list the species in the same
# order; confirm against the data files.
trait_order = ['Wwi', 'Ri', 'am', 'Wwb', 'Li', 'ab']
extra_columns = {trait: df_life[trait] for trait in trait_order}
extra_columns.update({
    'ecozone': df_basiceco['ecozone'],
    'habitat': df_habitat['adult'],
    'gender': df_basiceco['gender'],
    'reproduction': df_basiceco['reproduction'],
    'embryo': df_basiceco['embryo'],
    'food': df_food['adult'],
    'order': df_species['Order'],
    'class': df_species['Class'],
    'phylum': df_species['Phylum'],
})

# 'class' is a Python keyword, so the mapping is unpacked rather than spelled
# out as keyword arguments.
final_df = final_df.assign(**extra_columns)

final_df.head()

Unnamed: 0,species,PC1,PC2,Wwi,Ri,am,Wwb,Li,ab,ecozone,habitat,gender,reproduction,embryo,food,order,class,phylum
0,Symsagittifera_roscoffensis,-4.332599,-1.319409,-2.681943,0.018237,-2.263097,-1.610842,-2.223594,-0.870294,M,M,H,O,M,BA,Acoela,Acoelomorpha,Xenacoelomorpha
1,Aequipecten_opercularis,-2.418778,1.949067,-0.275378,1.639591,-0.059093,-2.351822,-0.85488,-1.373181,M,M,H,O,M,P,Pectinida,Bivalvia,Mollusca
2,Mimachlamys_varia,-2.773367,1.725362,-0.622221,1.639591,-0.400497,-2.351822,-0.944853,-1.373181,M,M,H,O,M,P,Pectinida,Bivalvia,Mollusca
3,Mytilus_edulis,-2.6023,2.953991,-0.231961,1.392184,0.778895,-2.764869,-0.464663,-3.043736,M,M,D,O,M,P,Mytiloida,Bivalvia,Mollusca
4,Panopea_abbreviata,-0.460463,3.460949,0.675057,2.385215,2.507146,-1.703486,-0.074446,-1.876068,M,M,D,O,M,H,Adapedonta,Bivalvia,Mollusca


In [6]:
# Loading vectors for the biplot arrows: one row per trait, one column per PC.
# NOTE(review): `explained_variance` is pca.explained_variance_ratio_ (set in
# the explained-variance cell), so these weights use the variance *ratio*; they
# presumably match the conventional sqrt(eigenvalue) loadings only up to a
# constant factor, which the rescaling below makes cosmetic — confirm.
component_weights = np.sqrt(explained_variance)
loadings = pca.components_.T * component_weights

# Extent (max - min) of the scores along PC1 and PC2.
x_range, y_range = np.ptp(principal_components[:, :2], axis=0)

# Stretch the arrows to 30% of the average score extent so they are readable
# on the same axes as the points.
scale = 0.3 * np.mean([x_range, y_range])
scaled_loadings = scale * loadings


In [None]:
# Biplot restricted to chordates: PC scores coloured by taxonomic class, with
# the trait loading vectors drawn as arrows from the origin.
chordata = final_df[final_df['phylum'] == 'Chordata']

# Trait values shifted so that each minimum becomes 0.1 (strictly positive),
# intended as marker sizes for the optional `size=` argument below.
# NOTE(review): these are computed on final_df (all phyla), not on `chordata`;
# confirm they line up with the plotted rows before enabling `size=`.
size_variable_wwi = final_df["Wwi"] - final_df["Wwi"].min() + 0.1
size_variable_ri = final_df["Ri"] - final_df["Ri"].min() + 0.1
size_variable_am = final_df["am"] - final_df["am"].min() + 0.1
size_variable_li = final_df["Li"] - final_df["Li"].min() + 0.1
size_variable_wwb = final_df["Wwb"] - final_df["Wwb"].min() + 0.1

fig = px.scatter(
    chordata,
    x="PC1",
    y="PC2",
    # Fields are referenced by column name (not as Series) so plotly express
    # resolves and labels them from `chordata` itself.
    color="class",
    # size=size_variable_wwb,
    color_discrete_sequence=px.colors.qualitative.Set2,
    hover_name="species",
)

# One arrow and one text label per trait. Row i of scaled_loadings corresponds
# to the i-th column of `features`, the frame the PCA was fitted on.
for i, feature in enumerate(features.columns):
    tip_x = scaled_loadings[i, 0]
    tip_y = scaled_loadings[i, 1]

    # Arrow: tail (ax, ay) at the origin, head (x, y) at the scaled loading,
    # both expressed in data coordinates.
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=tip_x,
        y=tip_y,
        showarrow=True,
        arrowsize=2,
        arrowhead=2,
        xanchor="right",
        yanchor="top",
        arrowcolor="black",
    )

    # Trait name placed 20% beyond the arrow head so it does not overlap it.
    fig.add_annotation(
        x=tip_x * 1.2,
        y=tip_y * 1.2,
        text=feature,
        showarrow=False,
        font=dict(size=12),
        xanchor="center",
        yanchor="middle",
    )

fig.update_layout(
    title="Biplot of life history traits colored by class",
    xaxis_title=f"PC1 ({pc1_var:.1f}%)",
    yaxis_title=f"PC2 ({pc2_var:.1f}%)",
    legend_title="Class",
)

fig.update_traces(marker=dict(size=7, opacity=0.8))
fig.show()
