# Analyzing results
Goal: Answer question - In combined queries, does correct/incorrect prediction of earlier nutrient(s) affect accuracy of later nutrient(s)?

1. correlation matrix of absolute error - combined vs. individual
2. correlation matrix of pure (signed) error - combined vs. individual

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [17]:
# Load the combined-query samples (JSON Lines: one record per line).
combined_df = pd.read_json(
    "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_combined_base_20250719_194516.jsonl",
    lines=True,
)

# Flatten the "result" records into columns and turn any "." in the
# flattened column names into "_" (giving names such as "carb_acc").
result_df = pd.json_normalize(combined_df["result"])
result_df.columns = result_df.columns.str.replace(".", "_", regex=False)

# Flatten the prediction and reference ("doc") records the same way.
pred_df = pd.json_normalize(combined_df["pred"])
doc_df = pd.json_normalize(combined_df["doc"])

nutrient_names = ["carb", "fat", "energy", "protein"]
metric_names = ["acc", "mae", "mse"]

# Suffix predictions with "_pred" so they do not collide with the
# same-named reference columns taken from doc_df.
pred_df = pred_df.rename(columns={name: f"{name}_pred" for name in nutrient_names})

# carb_acc, carb_mae, carb_mse, fat_acc, ... in nutrient-major order.
metric_columns = [f"{name}_{metric}" for name in nutrient_names for metric in metric_names]

# Column-wise concat: all pieces come from combined_df, one row per sample.
combined_results = pd.concat(
    [
        combined_df[["doc_id", "doc"]],
        doc_df[nutrient_names],
        pred_df,
        result_df[metric_columns],
    ],
    axis=1,
)

print(combined_results.columns)

Index(['doc_id', 'doc', 'carb', 'fat', 'energy', 'protein', 'carb_pred',
       'fat_pred', 'energy_pred', 'protein_pred', 'carb_acc', 'carb_mae',
       'carb_mse', 'fat_acc', 'fat_mae', 'fat_mse', 'energy_acc', 'energy_mae',
       'energy_mse', 'protein_acc', 'protein_mae', 'protein_mse'],
      dtype='object')


In [18]:
# Samples file for each single-nutrient run, keyed by nutrient name.
files = {
    "carb": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_carb_base_20250720_033502.jsonl",
    "fat": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_fat_base_20250719_184222.jsonl",
    "energy": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_energy_base_20250721_222318.jsonl",
    "protein": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_protein_base_20250719_192710.jsonl",
}

# Maps "<nutrient>_<metric>" to the list of that metric's values, one per row.
indiv_results = {}

for nutrient, filepath in files.items():
    df = pd.read_json(filepath, lines=True)
    # Pull acc / mae / mse out of every "result" record of this file.
    for metric in ("acc", "mae", "mse"):
        indiv_results[f"{nutrient}_{metric}"] = [record[metric] for record in df["result"]]

# The lists are combined by position, so row i of the frame takes the i-th
# record of every file.
# NOTE(review): this assumes all four files list the samples in the same
# order - confirm (e.g. by comparing an id column across files).
indiv_results_df = pd.DataFrame(indiv_results)
print(indiv_results_df.head())

   carb_acc  carb_mae  carb_mse  fat_acc  fat_mae  fat_mse  energy_acc  \
0      True      0.00    0.0000     True     0.00   0.0000        True   
1      True      5.01   25.1001    False     5.68  32.2624        True   
2      True      5.62   31.5844    False     3.08   9.4864        True   
3     False     10.11  102.2121    False     2.60   6.7600        True   
4      True      3.67   13.4689    False     4.04  16.3216       False   

   energy_mae  energy_mse  protein_acc  protein_mae  protein_mse  
0        0.00      0.0000         True         0.00       0.0000  
1       15.96    254.7216        False         5.46      29.8116  
2        1.75      3.0625         True         0.08       0.0064  
3       37.25   1387.5625        False         5.28      27.8784  
4       65.28   4261.4784         True         0.98       0.9604  


In [19]:
# Pairwise correlation of the per-row MAE columns across nutrients, printed
# first for the combined-query run and then for the single-nutrient runs.
mae_columns = [f"{nutrient}_mae" for nutrient in ("carb", "fat", "energy", "protein")]
combined_mae = combined_results[mae_columns]
indiv_mae = indiv_results_df[mae_columns]
for mae_frame in (combined_mae, indiv_mae):
    print(mae_frame.corr())

             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.273352    0.621870     0.507155
fat_mae      0.273352  1.000000    0.788847     0.507924
energy_mae   0.621870  0.788847    1.000000     0.631438
protein_mae  0.507155  0.507924    0.631438     1.000000
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.212044    0.515180     0.446933
fat_mae      0.212044  1.000000    0.585662     0.270500
energy_mae   0.515180  0.585662    1.000000     0.515830
protein_mae  0.446933  0.270500    0.515830     1.000000


In [20]:
def match_percentage_matrix(df):
    """Return the pairwise percentage of rows on which two columns agree.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared with each other row by row.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``df.columns``. Entry
        ``[a, b]`` is the percentage (0-100) of rows where ``df[a] == df[b]``.
    """
    cols = df.columns
    match_matrix = pd.DataFrame(index=cols, columns=cols, dtype=float)

    for col1 in cols:
        for col2 in cols:
            # Mean of the element-wise equality mask is the fraction of
            # matching rows; scale it to a percentage.
            match_matrix.loc[col1, col2] = np.mean(df[col1] == df[col2]) * 100

    return match_matrix

In [22]:
# Repeat the comparison on the accuracy columns: for every pair of
# nutrients, the percentage of rows whose acc values are equal.
acc_columns = [f"{nutrient}_acc" for nutrient in ("carb", "fat", "energy", "protein")]
combined_acc = combined_results[acc_columns]
indiv_acc = indiv_results_df[acc_columns]

acc_match_combined = match_percentage_matrix(combined_acc)
acc_match_indiv = match_percentage_matrix(indiv_acc)

for label, matrix in (
    ("Match % (Combined Acc):\n", acc_match_combined),
    ("Match % (Individual Acc):\n", acc_match_indiv),
):
    print(label, matrix)

Match % (Combined Acc):
                carb_acc     fat_acc  energy_acc  protein_acc
carb_acc     100.000000   63.962264   78.113208    69.433962
fat_acc       63.962264  100.000000   73.018868    69.245283
energy_acc    78.113208   73.018868  100.000000    72.452830
protein_acc   69.433962   69.245283   72.452830   100.000000
Match % (Individual Acc):
                carb_acc     fat_acc  energy_acc  protein_acc
carb_acc     100.000000   65.849057   76.603774    68.867925
fat_acc       65.849057  100.000000   69.622642    68.679245
energy_acc    76.603774   69.622642  100.000000    69.245283
protein_acc   68.867925   68.679245   69.245283   100.000000


Correlation Between Pure Error - Combined vs. Individual

In [23]:
# Signed error (prediction minus reference) for each nutrient in the
# combined-query run, keyed by the output column name.
signed_errors = {
    f"{nutrient}_error": combined_results[f"{nutrient}_pred"].to_numpy() - combined_results[nutrient]
    for nutrient in ("carb", "fat", "energy", "protein")
}

# Keep the per-nutrient names bound as well.
carb_error = signed_errors["carb_error"]
fat_error = signed_errors["fat_error"]
energy_error = signed_errors["energy_error"]
protein_error = signed_errors["protein_error"]

combined_error = pd.DataFrame(signed_errors)

# Print the MAE correlations again next to the signed-error correlations.
# NOTE(review): only the combined run is analysed here; the section heading
# also mentions individual runs - confirm whether that part is missing.
print(combined_mae.corr())
print(combined_error.corr())

             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.273352    0.621870     0.507155
fat_mae      0.273352  1.000000    0.788847     0.507924
energy_mae   0.621870  0.788847    1.000000     0.631438
protein_mae  0.507155  0.507924    0.631438     1.000000
               carb_error  fat_error  energy_error  protein_error
carb_error       1.000000   0.191531      0.664062       0.435384
fat_error        0.191531   1.000000      0.768565       0.478140
energy_error     0.664062   0.768565      1.000000       0.626502
protein_error    0.435384   0.478140      0.626502       1.000000
