In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

def compute_scatter_matrices(data, quantitative_vars, qualitative_var):
    """Decompose the total covariance into intra-class and inter-class parts.

    All matrices are normalized by the total number of rows ``n`` of
    ``data``, so that ``V = W + B`` holds with ``V`` being the (biased, 1/n)
    covariance matrix of the quantitative variables.

    Args:
        data: DataFrame holding the observations.
        quantitative_vars: Column labels of the numeric variables.
        qualitative_var: Column label of the grouping (class) variable.

    Returns:
        A tuple ``(W_k_dict, W, B, V)`` where

        * ``W_k_dict`` maps each group name to its own covariance matrix
          (normalized by the group size ``n_k``),
        * ``W`` is the intra-class covariance, ``sum_k n_k * W_k / n``,
        * ``B`` is the inter-class covariance,
          ``sum_k n_k * (g_k - g)(g_k - g)^T / n``,
        * ``V`` is ``W + B``.

    Raises:
        ValueError: If ``data`` has no rows.
    """
    n_total = data.shape[0]
    if n_total == 0:
        raise ValueError("data must contain at least one observation")

    n_vars = len(quantitative_vars)
    overall_mean = data[quantitative_vars].to_numpy(dtype=float).mean(axis=0)

    W_k_dict = {}
    W = np.zeros((n_vars, n_vars))
    B = np.zeros((n_vars, n_vars))

    for group_name, group in data.groupby(qualitative_var):
        group_values = group[quantitative_vars].to_numpy(dtype=float)
        n_k = group_values.shape[0]  # Number of samples in the group
        group_mean = group_values.mean(axis=0)

        # Sum of outer products of the centered rows of this group
        centered = group_values - group_mean
        scatter = centered.T @ centered
        W_k_dict[group_name] = scatter / n_k
        W += scatter  # equals n_k * W_k

        # Weighted outer product of the (group mean - overall mean) vector
        diff = group_mean - overall_mean
        B += n_k * np.outer(diff, diff)

    # Both W and B must be divided by n; otherwise W + B is not the total
    # covariance matrix.
    W /= n_total
    B /= n_total
    return W_k_dict, W, B, W + B


if __name__ == "__main__":
    # Load the CSV file
    # Replace 'chienloup.csv' with the exact path to your file
    data = pd.read_csv('chienloup.csv', sep=';')

    # Extract quantitative and qualitative variables names
    quantitative_vars = data.columns[1:-1]  # Quantitative variables
    qualitative_var = data.columns[-1]      # Qualitative variable (e.g., category)

    # Group the data by the qualitative variable
    groups = data.groupby(qualitative_var)

    # Overall mean vector of the quantitative variables
    overall_mean = np.mean(data[quantitative_vars].values, axis=0)

    # Per-group covariance matrices (W_k_dict), intra-class (W),
    # inter-class (B) and total (V) covariance matrices
    W_k_dict, W, B, V = compute_scatter_matrices(
        data, quantitative_vars, qualitative_var
    )

    ###(Bonus)############################################################################################################################
    # We compute and visualize the correlation matrix from the covariance matrix to interpret the values of the intra covariances
    #for group_name, W_k in W_k_dict.items():
    #    print(f"Intra-class correlation matrix W_k for group '{group_name}':")
    #    # Computation of the correlation matrix from the covariance matrix
    #    std_dev = np.sqrt(np.diag(W_k))  # Standard deviations
    #    correlation_matrix = W_k / np.outer(std_dev, std_dev)  # Normalize to get correlations
    #
    #    # Visualization of the correlation matrix
    #    plt.figure(figsize=(8, 6))
    #    plt.imshow(correlation_matrix, cmap="coolwarm", vmin=-1, vmax=1)
    #    plt.colorbar(label="Correlation")
    #    plt.xticks(ticks=range(len(quantitative_vars)), labels=quantitative_vars, rotation=45, ha="right")
    #    plt.yticks(ticks=range(len(quantitative_vars)), labels=quantitative_vars)
    #    plt.title("Correlation Matrix Heatmap")
    #    plt.tight_layout()
    #    plt.show()
    #
    ####################################################################################################################################

    # Displaying intra-class (W), inter-class (B), and total variance (V) matrices
    print("Total intra-class covariance matrix (W):")
    print(pd.DataFrame(W, index=quantitative_vars, columns=quantitative_vars))
    print("\n")
    print("\nTotal inter-class covariance matrix (B):")
    print(pd.DataFrame(B, index=quantitative_vars, columns=quantitative_vars))
    print("\nTotal variance matrix (V):")
    print(pd.DataFrame(V, index=quantitative_vars, columns=quantitative_vars))

Total intra-class covariance matrix W :
              LCB           LSM            LBM         LP           LM    \
LCB    70166.944444  36126.587302  7473.134921  2907.956349  2022.908730   
LSM    36126.587302  20150.396825  1862.301587  1526.706349  1034.325397   
 LBM    7473.134921   1862.301587  7990.753968   482.718254   260.146825   
LP      2907.956349   1526.706349   482.718254   204.831349   108.974206   
LM      2022.908730   1034.325397   260.146825   108.974206    93.399603   
LAM     2907.531746   1442.341270   513.103175   149.091270   115.103968   

               LAM  
LCB    2907.531746  
LSM    1442.341270  
 LBM    513.103175  
LP      149.091270  
LM      115.103968  
LAM     361.976984  



Total inter-class covariance matrix (B):
               LCB            LSM             LBM          LP            LM    \
LCB    188171.666667  135466.666667  51011.666667  60854.166667  26479.500000   
LSM    135466.666667   97523.809524  36723.809524  43809.523810  19062.8

Computation of the traces of V and B to compare intra-class and inter-class variances

Interpretation of V, W, B:
We observe 