In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the two input tables: life-history traits and basic ecology descriptors.
df_life, df_basiceco = (
    pd.read_csv(csv_path)
    for csv_path in ('data/lifehistory_df.csv', 'data/basicecodf.csv')
)

# Preview the first rows of each table, one rich display per frame.
display(df_life.head(), df_basiceco.head())

Unnamed: 0,species,am,Wwi,Ri,Wwb,Li,ab
0,Symsagittifera_roscoffensis,-2.263097,-2.681943,0.018237,-1.610842,-2.223594,-0.870294
1,Aequipecten_opercularis,-0.059093,-0.275378,1.639591,-2.351822,-0.85488,-1.373181
2,Mimachlamys_varia,-0.400497,-0.622221,1.639591,-2.351822,-0.944853,-1.373181
3,Mytilus_edulis,0.778895,-0.231961,1.392184,-2.764869,-0.464663,-3.043736
4,Panopea_abbreviata,2.507146,0.675057,2.385215,-1.703486,-0.074446,-1.876068


Unnamed: 0,ID,ecozone,gender,reproduction,embryo
0,Symsagittifera_roscoffensis,M,H,O,M
1,Aequipecten_opercularis,M,H,O,M
2,Mimachlamys_varia,M,H,O,M
3,Mytilus_edulis,M,D,O,M
4,Panopea_abbreviata,M,D,O,M


In [3]:
# Trait columns that feed the PCA.
trait_columns = ['am', 'Wwi', 'Ri', 'Wwb', 'Li', 'ab']
features = df_life[trait_columns]

# Standardise every trait so each one contributes on the same scale.
scaler = StandardScaler()
X = scaler.fit_transform(features)

# Keep the first two principal components and compute the species scores.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)

# Scores table, with the species name placed next to its PC1 / PC2 coordinates.
pca_df = pd.DataFrame(principal_components, columns=['PC1', 'PC2'])
species_column = df_life[['species']]
final_df = pd.concat([species_column, pca_df], axis=1)
final_df.head()

Unnamed: 0,species,PC1,PC2
0,Symsagittifera_roscoffensis,-4.332599,-1.319409
1,Aequipecten_opercularis,-2.418778,1.949067
2,Mimachlamys_varia,-2.773367,1.725362
3,Mytilus_edulis,-2.6023,2.953991
4,Panopea_abbreviata,-0.460463,3.460949


In [4]:
# Fraction of the total variance captured by each retained component.
explained_variance = pca.explained_variance_ratio_

# Same figures expressed as percentages (reused later for the axis titles).
pc1_var, pc2_var = (explained_variance[k] * 100 for k in (0, 1))

print(
    f"PC1: {pc1_var:.2f}%",
    f"PC2: {pc2_var:.2f}%",
    sep="\n",
)

PC1: 47.39%
PC2: 37.59%


In [5]:
# Attach the trait values and the ecological descriptors to the PCA scores.
# Rebuilding from the score columns each time keeps this cell safe to re-run.
eco_columns = ['ecozone', 'gender', 'reproduction', 'embryo']

final_df = (
    final_df[['species', 'PC1', 'PC2']]
    # Wwi / Ri come from df_life, the same frame (and row index) the scores were built from.
    .assign(Wwi=df_life['Wwi'], Ri=df_life['Ri'])
    # Ecology descriptors are matched on the species name (df_basiceco['ID']) rather than
    # on row position, so a different row order in the two CSV files cannot mislabel a
    # species. `validate` raises if an ID appears more than once in df_basiceco; species
    # missing from df_basiceco end up with NaN descriptors.
    .merge(
        df_basiceco[['ID', *eco_columns]],
        how='left',
        left_on='species',
        right_on='ID',
        validate='many_to_one',
    )
    .drop(columns='ID')
)

final_df.head()

Unnamed: 0,species,PC1,PC2,Wwi,Ri,ecozone,gender,reproduction,embryo
0,Symsagittifera_roscoffensis,-4.332599,-1.319409,-2.681943,0.018237,M,H,O,M
1,Aequipecten_opercularis,-2.418778,1.949067,-0.275378,1.639591,M,H,O,M
2,Mimachlamys_varia,-2.773367,1.725362,-0.622221,1.639591,M,H,O,M
3,Mytilus_edulis,-2.6023,2.953991,-0.231961,1.392184,M,D,O,M
4,Panopea_abbreviata,-0.460463,3.460949,0.675057,2.385215,M,D,O,M


In [6]:
# PCA loadings: each eigenvector (a column of `pca.components_.T`) scaled by the square
# root of its eigenvalue. The eigenvalues are `pca.explained_variance_`; the
# `explained_variance_ratio_` values are those eigenvalues divided by the total variance,
# so using them here would shrink every loading by a constant factor.
loadings = pca.components_.T * np.sqrt(pca.explained_variance_)

In [7]:
# --- Biplot: species scores (points) overlaid with trait loadings (arrows) ---

ARROW_FRACTION = 0.3   # arrow scale, as a fraction of the mean extent of the score cloud
LABEL_OFFSET = 1.2     # trait labels sit slightly beyond the arrow tips
MIN_MARKER_SIZE = 0.1  # offset that keeps the smallest marker size above zero

# Extent of the scores along each component, used to bring the loadings to a visible scale.
x_range = np.ptp(principal_components[:, 0])
y_range = np.ptp(principal_components[:, 1])

scale = ARROW_FRACTION * np.mean([x_range, y_range])
scaled_loadings = loadings * scale

# Marker size: Wwi shifted so that every value is strictly positive.
size_variable = final_df["Wwi"] - final_df["Wwi"].min() + MIN_MARKER_SIZE

# Every point attribute is read from final_df, so coordinates, colour and hover name
# all come from the same rows. `ecozone` is categorical, hence no continuous colour scale.
fig = px.scatter(
    final_df,
    x="PC1",
    y="PC2",
    color="ecozone",
    hover_name="species",
    size=size_variable,
)

# One arrow (origin -> scaled loading) and one text label per trait.
for trait, (tip_x, tip_y) in zip(features.columns, scaled_loadings):
    # Arrow: tail (ax, ay) and head (x, y) are both expressed in data coordinates.
    fig.add_annotation(
        ax=0, ay=0,
        axref="x", ayref="y",
        x=tip_x,
        y=tip_y,
        xref="x", yref="y",
        showarrow=True,
        arrowsize=2,
        arrowhead=2,
        arrowcolor="black",
    )

    # Trait name, placed a little past the arrow head so the two do not overlap.
    fig.add_annotation(
        x=tip_x * LABEL_OFFSET,
        y=tip_y * LABEL_OFFSET,
        text=trait,
        showarrow=False,
        font=dict(size=12),
        xanchor="center",
        yanchor="middle",
    )

fig.update_layout(
    title="Biplot of Life History traits colored by ecozone and size by Wwi",
    xaxis_title=f"PC1 ({pc1_var:.1f}%)",
    yaxis_title=f"PC2 ({pc2_var:.1f}%)",
    legend_title="Ecozone",
)

fig.show()
# Also export a standalone interactive copy of the figure.
fig.write_html("biplot2.html")
