# Analyzing Correlation
Answer question: In combined queries, does correct/incorrect prediction of earlier nutrient(s) affect accuracy of later nutrient(s)?

1. correlation matrix of absolute error - combined vs. individual
2. correlation matrix of pure (signed) error - combined vs. individual

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def get_combined_results(file):
    """Flatten a combined-prompt JSONL results file into one wide DataFrame.

    Every record is read with ``pd.read_json(..., lines=True)`` and its
    ``doc``, ``pred`` and ``result`` fields are expanded with
    ``pd.json_normalize``.

    Parameters
    ----------
    file : path or file-like
        Handed to ``pd.read_json`` unchanged.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``doc_id`` and ``doc`` from the raw records; the
        ``carb``/``fat``/``energy``/``protein`` values taken from ``doc``;
        every expanded ``pred`` column (the four nutrient columns renamed to
        ``<nutrient>_pred``); and the ``<nutrient>_acc``, ``<nutrient>_mae``
        and ``<nutrient>_mse`` columns taken from ``result``.
    """
    nutrient_names = ["carb", "fat", "energy", "protein"]
    records = pd.read_json(file, lines=True)

    # Values stored under "doc"; only the four nutrient fields are kept.
    reference = pd.json_normalize(records["doc"])[nutrient_names]

    # Suffix the prediction columns so they do not clash with the "doc" columns.
    predictions = pd.json_normalize(records["pred"]).rename(
        columns={name: f"{name}_pred" for name in nutrient_names}
    )

    # json_normalize joins nested keys with "."; use "_" in column names instead.
    metrics = pd.json_normalize(records["result"]).rename(
        columns=lambda label: label.replace(".", "_")
    )
    metric_columns = [
        f"{name}_{stat}" for name in nutrient_names for stat in ("acc", "mae", "mse")
    ]

    # Column-wise concat: rows are matched on the frames' row index.
    return pd.concat(
        [records[["doc_id", "doc"]], reference, predictions, metrics[metric_columns]],
        axis=1,
    )

In [3]:
# Results file for the combined-prompt run ("base" variant, per the filename).
file = "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_combined_base_20250719_194516.jsonl"

# Flatten the nested doc/pred/result fields into one wide frame.
combined_results = get_combined_results(file)
# Print the column names as a quick check of the flattened layout.
print(combined_results.columns)

Index(['doc_id', 'doc', 'carb', 'fat', 'energy', 'protein', 'carb_pred',
       'fat_pred', 'energy_pred', 'protein_pred', 'carb_acc', 'carb_mae',
       'carb_mse', 'fat_acc', 'fat_mae', 'fat_mse', 'energy_acc', 'energy_mae',
       'energy_mse', 'protein_acc', 'protein_mae', 'protein_mse'],
      dtype='object')


In [4]:
def get_indiv_results(files):
    """Merge per-nutrient JSONL result files into one wide DataFrame.

    Each file is read with ``pd.read_json(..., lines=True)``. Its ``doc`` and
    ``result`` fields are expanded with ``pd.json_normalize``; ``pred`` is
    used as-is.

    The ``doc_id`` column of the first file in ``files`` defines the row
    order. Any later file whose ``doc_id`` column differs is realigned to
    that order before its columns are added, so every row of the output
    refers to a single ``doc_id``.

    Parameters
    ----------
    files : dict
        Maps a nutrient name to the path (or file-like object) of that
        nutrient's results file. The nutrient name must also be the key
        holding the nutrient's value inside each record's ``doc``.

    Returns
    -------
    pandas.DataFrame
        ``doc_id`` followed, for each nutrient in ``files`` order, by
        ``<nutrient>``, ``<nutrient>_pred``, ``<nutrient>_acc`` and
        ``<nutrient>_mae``.

    Raises
    ------
    ValueError
        If ``files`` is empty, or if a file's ``doc_id`` values cannot be
        matched one-to-one with those of the first file.
    """
    if not files:
        raise ValueError("files must map at least one nutrient to a results file")

    doc_ids = None
    indiv_results = {}

    for nutrient, filepath in files.items():
        df = pd.read_json(filepath, lines=True)

        if doc_ids is None:
            # The first file fixes the row order for the merged frame.
            doc_ids = df["doc_id"]
        elif not df["doc_id"].equals(doc_ids):
            # Columns are combined by row position below, so rows must be put
            # in the same doc_id order first; otherwise values from different
            # documents would end up on the same row.
            same_ids = (
                len(df) == len(doc_ids)
                and df["doc_id"].is_unique
                and doc_ids.is_unique
                and set(df["doc_id"]) == set(doc_ids)
            )
            if not same_ids:
                raise ValueError(
                    f"doc_id values for '{nutrient}' do not match those of the first file"
                )
            df = df.set_index("doc_id").loc[doc_ids.tolist()].reset_index()

        results = pd.json_normalize(df["result"])
        doc = pd.json_normalize(df["doc"])

        indiv_results[nutrient] = doc[nutrient]
        indiv_results[f"{nutrient}_pred"] = df["pred"]
        indiv_results[f"{nutrient}_acc"] = results["acc"]
        indiv_results[f"{nutrient}_mae"] = results["mae"]

    # Convert to final DataFrame
    indiv_results_df = pd.concat([
        doc_ids.to_frame(),
        pd.DataFrame(indiv_results)
    ], axis=1)

    return indiv_results_df

In [5]:
# One results file per nutrient for the individual-prompt runs, keyed by nutrient
# name ("base" variant, per the filenames).
files = {
    "carb": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_carb_base_20250720_033502.jsonl",
    "fat": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_fat_base_20250719_184222.jsonl",
    "energy": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_energy_base_20250721_222318.jsonl",
    "protein": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_protein_base_20250719_192710.jsonl",
}

# Merge the per-nutrient files column-wise into a single frame.
indiv_results_df = get_indiv_results(files)
# Preview the first rows to check the merged layout.
print(indiv_results_df.head())

   doc_id   carb  carb_pred  carb_acc  carb_mae    fat  fat_pred  fat_acc  \
0       0   0.00        0.0      True      0.00   0.00       0.0     True   
1       1  63.41       58.4      True      5.01  14.82      20.5    False   
2       2  18.38       24.0      True      5.62  18.08      15.0    False   
3       3  26.39       36.5     False     10.11  30.70      28.1    False   
4       4  42.07       38.4      True      3.67   9.64       5.6    False   

   fat_mae  energy  energy_pred  energy_acc  energy_mae  protein  \
0     0.00    0.00          0.0        True        0.00     0.00   
1     5.68  502.04        518.0        True       15.96    26.66   
2     3.08  248.25        250.0        True        1.75     3.08   
3     2.60  485.75        523.0        True       37.25    25.38   
4     4.04  251.28        186.0       False       65.28     3.12   

   protein_pred  protein_acc  protein_mae  
0           0.0         True         0.00  
1          21.2        False         5.4

In [6]:
# Correlation (DataFrame.corr default) of the per-sample absolute-error columns
# across nutrients: combined estimates first, then individual estimates.
combined_mae = combined_results.loc[:, ["carb_mae", "fat_mae", "energy_mae", "protein_mae"]]
indiv_mae = indiv_results_df.loc[:, ["carb_mae", "fat_mae", "energy_mae", "protein_mae"]]
print(combined_mae.corr(), indiv_mae.corr(), sep="\n")

             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.273352    0.621870     0.507155
fat_mae      0.273352  1.000000    0.788847     0.507924
energy_mae   0.621870  0.788847    1.000000     0.631438
protein_mae  0.507155  0.507924    0.631438     1.000000
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.212044    0.515180     0.446933
fat_mae      0.212044  1.000000    0.585662     0.270500
energy_mae   0.515180  0.585662    1.000000     0.515830
protein_mae  0.446933  0.270500    0.515830     1.000000


In [7]:
def match_percentage_matrix(df):
    """Return the pairwise percentage of rows on which two columns are equal.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared element-wise with ``==``.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by ``df.columns``; entry
        ``[a, b]`` is ``mean(df[a] == df[b]) * 100``.
    """
    names = df.columns

    # One list per output column; each list runs over the row labels.
    table = {
        right: [np.mean(df[left] == df[right]) * 100 for left in names]
        for right in names
    }

    return pd.DataFrame(table, index=names, columns=names, dtype=float)

In [8]:
# same with acc
combined_acc = combined_results[["carb_acc", "fat_acc", "energy_acc", "protein_acc"]]
indiv_acc = indiv_results_df[["carb_acc", "fat_acc", "energy_acc", "protein_acc"]]

acc_match_combined = match_percentage_matrix(combined_acc)
acc_match_indiv = match_percentage_matrix(indiv_acc)

print("Match % (Combined Acc):\n", acc_match_combined)
print("Match % (Individual Acc):\n", acc_match_indiv)

Match % (Combined Acc):
                carb_acc     fat_acc  energy_acc  protein_acc
carb_acc     100.000000   63.962264   78.113208    69.433962
fat_acc       63.962264  100.000000   73.018868    69.245283
energy_acc    78.113208   73.018868  100.000000    72.452830
protein_acc   69.433962   69.245283   72.452830   100.000000
Match % (Individual Acc):
                carb_acc     fat_acc  energy_acc  protein_acc
carb_acc     100.000000   65.849057   76.603774    68.867925
fat_acc       65.849057  100.000000   69.622642    68.679245
energy_acc    76.603774   69.622642  100.000000    69.245283
protein_acc   68.867925   68.679245   69.245283   100.000000


Correlation Between Pure Error - Combined vs. Individual

In [9]:
# Signed error (prediction minus the value stored in "doc") per nutrient,
# combined prompting.
carb_error = combined_results["carb_pred"].sub(combined_results["carb"])
fat_error = combined_results["fat_pred"].sub(combined_results["fat"])
energy_error = combined_results["energy_pred"].sub(combined_results["energy"])
protein_error = combined_results["protein_pred"].sub(combined_results["protein"])

combined_error = pd.DataFrame(
    {
        "carb_err": carb_error,
        "fat_err": fat_error,
        "energy_err": energy_error,
        "protein_err": protein_error,
    }
)

# Same signed error, individual prompting.
carb_err = indiv_results_df["carb_pred"].sub(indiv_results_df["carb"])
fat_err = indiv_results_df["fat_pred"].sub(indiv_results_df["fat"])
energy_err = indiv_results_df["energy_pred"].sub(indiv_results_df["energy"])
protein_err = indiv_results_df["protein_pred"].sub(indiv_results_df["protein"])

indiv_error = pd.DataFrame(
    {
        "carb_err": carb_err,
        "fat_err": fat_err,
        "energy_err": energy_err,
        "protein_err": protein_err,
    }
)

# Per prompting mode: absolute-error correlations, then signed-error correlations.
print("COMBINED PROMPTING:", combined_mae.corr(), combined_error.corr(), sep="\n")
print("\n INDIVIDUAL PROMPTING:", indiv_mae.corr(), indiv_error.corr(), sep="\n")

COMBINED PROMPTING:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.273352    0.621870     0.507155
fat_mae      0.273352  1.000000    0.788847     0.507924
energy_mae   0.621870  0.788847    1.000000     0.631438
protein_mae  0.507155  0.507924    0.631438     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.191531    0.664062     0.435384
fat_err      0.191531  1.000000    0.768565     0.478140
energy_err   0.664062  0.768565    1.000000     0.626502
protein_err  0.435384  0.478140    0.626502     1.000000

 INDIVIDUAL PROMPTING:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.212044    0.515180     0.446933
fat_mae      0.212044  1.000000    0.585662     0.270500
energy_mae   0.515180  0.585662    1.000000     0.515830
protein_mae  0.446933  0.270500    0.515830     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.087102    0.497165 

Correlation Between Nutrients: 

MAE combined > Error combined > MAE individual > Error individual

Do the correlation matrices for CoT match?

In [10]:
# Combined-prompt results for the CoT variant (per the filename).
combined_cot_path = "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_combined_CoT_20250720_064149.jsonl"
combined_cot = get_combined_results(combined_cot_path)

# Individual-prompt CoT results, one file per nutrient.
cot_files = {
    "carb": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_carb_CoT_20250719_051735.jsonl",
    "fat": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_fat_CoT_20250720_050526.jsonl",
    "energy": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_energy_CoT_20250720_053252.jsonl",
    "protein": "/data/lucasjia/projects/nutri/results/multi-nutrient/sub1/samples_protein_CoT_20250720_060342.jsonl",
}
cot_results = get_indiv_results(cot_files)

# Absolute-error correlations across nutrients: combined CoT, then individual CoT.
combined_cot_mae = combined_cot.loc[:, ["carb_mae", "fat_mae", "energy_mae", "protein_mae"]]
cot_mae = cot_results.loc[:, ["carb_mae", "fat_mae", "energy_mae", "protein_mae"]]
print(combined_cot_mae.corr(), cot_mae.corr(), sep="\n")

             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.313969    0.664736     0.515518
fat_mae      0.313969  1.000000    0.717565     0.529013
energy_mae   0.664736  0.717565    1.000000     0.727328
protein_mae  0.515518  0.529013    0.727328     1.000000
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.191954    0.513062     0.385466
fat_mae      0.191954  1.000000    0.537609     0.531194
energy_mae   0.513062  0.537609    1.000000     0.548389
protein_mae  0.385466  0.531194    0.548389     1.000000


In [11]:
# Signed error (prediction minus the value stored in "doc") per nutrient,
# combined prompting with CoT.
carb_cot_error = combined_cot["carb_pred"].sub(combined_cot["carb"])
fat_cot_error = combined_cot["fat_pred"].sub(combined_cot["fat"])
energy_cot_error = combined_cot["energy_pred"].sub(combined_cot["energy"])
protein_cot_error = combined_cot["protein_pred"].sub(combined_cot["protein"])

combined_cot_error = pd.DataFrame(
    {
        "carb_err": carb_cot_error,
        "fat_err": fat_cot_error,
        "energy_err": energy_cot_error,
        "protein_err": protein_cot_error,
    }
)

# Same signed error, individual prompting with CoT.
carb_cot_err = cot_results["carb_pred"].sub(cot_results["carb"])
fat_cot_err = cot_results["fat_pred"].sub(cot_results["fat"])
energy_cot_err = cot_results["energy_pred"].sub(cot_results["energy"])
protein_cot_err = cot_results["protein_pred"].sub(cot_results["protein"])

cot_err = pd.DataFrame(
    {
        "carb_err": carb_cot_err,
        "fat_err": fat_cot_err,
        "energy_err": energy_cot_err,
        "protein_err": protein_cot_err,
    }
)

# Combined prompting only: base matrices first, then the CoT matrices.
print("COMBINED PROMPTING:")
print("base:", combined_mae.corr(), combined_error.corr(), sep="\n")
print("CoT:", combined_cot_mae.corr(), combined_cot_error.corr(), sep="\n")

COMBINED PROMPTING:
base:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.273352    0.621870     0.507155
fat_mae      0.273352  1.000000    0.788847     0.507924
energy_mae   0.621870  0.788847    1.000000     0.631438
protein_mae  0.507155  0.507924    0.631438     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.191531    0.664062     0.435384
fat_err      0.191531  1.000000    0.768565     0.478140
energy_err   0.664062  0.768565    1.000000     0.626502
protein_err  0.435384  0.478140    0.626502     1.000000
CoT:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.313969    0.664736     0.515518
fat_mae      0.313969  1.000000    0.717565     0.529013
energy_mae   0.664736  0.717565    1.000000     0.727328
protein_mae  0.515518  0.529013    0.727328     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.233921    0.672894     0.469514


In [12]:
# Individual prompting only: base matrices first, then the CoT matrices
# (absolute-error correlations followed by signed-error correlations).
print("INDIVIDUAL PROMPTING:")
print("base:", indiv_mae.corr(), indiv_error.corr(), sep="\n")
print("CoT:", cot_mae.corr(), cot_err.corr(), sep="\n")

INDIVIDUAL PROMPTING:
base:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.212044    0.515180     0.446933
fat_mae      0.212044  1.000000    0.585662     0.270500
energy_mae   0.515180  0.585662    1.000000     0.515830
protein_mae  0.446933  0.270500    0.515830     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.087102    0.497165     0.368573
fat_err      0.087102  1.000000    0.566276     0.271191
energy_err   0.497165  0.566276    1.000000     0.382320
protein_err  0.368573  0.271191    0.382320     1.000000
CoT:
             carb_mae   fat_mae  energy_mae  protein_mae
carb_mae     1.000000  0.191954    0.513062     0.385466
fat_mae      0.191954  1.000000    0.537609     0.531194
energy_mae   0.513062  0.537609    1.000000     0.548389
protein_mae  0.385466  0.531194    0.548389     1.000000
             carb_err   fat_err  energy_err  protein_err
carb_err     1.000000  0.042450    0.505549     0.28339