# Items Matching

In [14]:
## Missing quotes and supposed reference sentences
# Each pair below sits under a comment naming a .txt file and is listed in this order:
#   1. "Excerpt quote"
#   2. "Supposed reference text"
# NOTE(review): within each pair the two strings differ only in small details
# (a comma, leading words, straight vs. curly apostrophe) - presumably the reason
# the excerpt was not matched exactly; confirm.

# Stage3_St Loyes.txt
"I mean that’s maybe concentrating on local travelling really."
"I mean, that’s maybe concentrating on local travelling really."

# CreditonStLawrence.txt
"I think the changes have got to come with, with innovations, I think we have got to look at public transport in a different way."
"But I think the, the changes have got to come with, with innovations, I think we have got to look at public transport in a different way."

# Pennsylvania2.txt
"It doesn't work for me because I don’t do regular things, I don’t sort of travel into work, now I do know people who do car sharing and not Car Share Devon but they do travel together, they actually travel in from this area into the same place on a regular basis."
"It doesn’t work for me because I don’t do regular things, I don’t sort of travel into work, now I do know people who do car sharing and not Car Share Devon but they do travel together, they actually travel in from this area into the same place on a regular basis."

# CullomptonNorth.txt
"Yes definitely more links, not rail links, more bus links, more affordable bus links as well, make it more affordable for everyone to change."
"Yes, definitely more links, not rail links, more bus links, more affordable bus links as well, make it more affordable for everyone to change."


'Yes, definitely more links, not rail links, more bus links, more affordable bus links as well, make it more affordable for everyone to change.'

In [23]:
import json

from system_path import SystemPath
from sklearn.metrics import mean_squared_error


def calculate_mse_decision_probability_to_ground_truth(archetype_based_path, profile_based_path):
    """Print the MSE between archetype-based and profile-based decision probabilities.

    Both files are parsed as JSON and indexed by scenario position, so each is
    expected to hold a list of scenario objects with an
    ``"archetype_action_probs"`` mapping of archetype -> list of probabilities.
    Only archetypes present in both files for the same scenario index
    contribute to the error.

    Args:
        archetype_based_path: Path to the archetype-based result, treated as
            the ground truth.
        profile_based_path: Path to the profile-based result compared against it.

    Raises:
        IndexError: If the profile-based file has fewer scenarios than the
            ground truth.
        ValueError: If a shared archetype has a different number of
            probabilities in the two files, or if there is nothing to compare.
    """
    with open(archetype_based_path, "r") as f:
        ground_truth = json.load(f)

    with open(profile_based_path, "r") as f:
        decision_probability = json.load(f)

    # Flattened, position-aligned probability vectors for the MSE.
    ref_probs = []
    cand_probs = []
    for index, ref_scenario in enumerate(ground_truth):
        cand_scenario = decision_probability[index]

        for archetype, ref_action_probs in ref_scenario["archetype_action_probs"].items():
            if archetype not in cand_scenario["archetype_action_probs"]:
                continue

            cand_action_probs = cand_scenario["archetype_action_probs"][archetype]
            # A length mismatch would silently shift every later value out of
            # alignment (or fail deep inside sklearn), so fail early and clearly.
            if len(ref_action_probs) != len(cand_action_probs):
                raise ValueError(
                    f"Scenario {index}, archetype {archetype!r}: "
                    f"{len(ref_action_probs)} reference probabilities vs "
                    f"{len(cand_action_probs)} candidate probabilities"
                )

            ref_probs.extend(ref_action_probs)
            cand_probs.extend(cand_action_probs)

    if not ref_probs:
        raise ValueError("No shared archetype probabilities to compare")

    print("MSE between Archetype-based & Profile-based decision probability:")
    print(f"{mean_squared_error(ref_probs, cand_probs):.4f}")

In [26]:
# Compare the profile-based probabilities against the archetype-based file,
# which the function treats as the ground truth.
archetype_based_path = "travel/06_scenario_probability_archetype.txt"
profile_based_path = "travel/06_scenario_probability_profile.txt"
calculate_mse_decision_probability_to_ground_truth(
    archetype_based_path=archetype_based_path,
    profile_based_path=profile_based_path,
)

MSE between Archetype-based & Profile-based decision probability:
0.0413


In [1]:
from bert_score import BERTScorer

# Spot check: score one candidate phrase against two different reference phrases.
scorer = BERTScorer(model_type="bert-base-uncased")
for reference_phrase in ('personal habit', 'convenience'):
    print(scorer.score(cands=['Perceived Convenience'], refs=[reference_phrase]))


  return forward_call(*args, **kwargs)


(tensor([0.6988]), tensor([0.6988]), tensor([0.6988]))
(tensor([0.6045]), tensor([0.7501]), tensor([0.6695]))


# Sentiment analysis

### Thematic analysis

In [7]:
import numpy as np
from bert_score import BERTScorer
from tabulate import tabulate

from models.response_models import ScopeComponent


def display(pair_item):
    """Format a ``(text, component_key)`` pair as ``"text (Component Name)"``.

    The component name is looked up with
    ``ScopeComponent.get_component_name_from_key``.

    NOTE(review): inside a notebook this definition shadows IPython's built-in
    ``display`` helper for the rest of the session.
    """
    text = pair_item[0]
    component_name = ScopeComponent.get_component_name_from_key(pair_item[1])
    return f"{text} ({component_name})"

def _collect_code_items(codes, compare_key):
    """Flatten ``{component_key: [entry, ...]}`` into ``(entry[compare_key], component_key)`` tuples."""
    items = []
    for component_key, entries in codes.items():
        # The "file" entry is metadata, not a list of codes, so it is skipped.
        if component_key == "file":
            continue

        for entry in entries:
            items.append((entry[compare_key], component_key))

    return items


def calculate_best_code_pairs(reference_codes, candidate_codes, compare_key, scorer=None, threshold=0.75):
    """Pair every reference code with its most similar candidate code and print a report.

    For each item in ``reference_codes`` the candidate item of the *same
    component* with the highest BERTScore F1 is selected. A table of the pairs
    and a short summary (count, count at or above ``threshold``, mean score)
    are printed.

    Args:
        reference_codes: Mapping of component key -> list of dict entries
            (a ``"file"`` key, if present, is ignored).
        candidate_codes: Mapping with the same layout as ``reference_codes``.
        compare_key: Key of the text to compare inside each entry.
        scorer: Optional pre-built ``BERTScorer`` to reuse across calls; when
            omitted a ``bert-base-uncased`` scorer is created (the model is
            then loaded on every call).
        threshold: Minimum score for a pair to be counted as "similar".

    Returns:
        A list of ``[reference_item, best_candidate_item, score]`` where the
        items are ``(text, component_key)`` tuples and ``score`` is rounded to
        4 decimals. ``best_candidate_item`` is ``None`` (score 0.0) when no
        candidate exists for the reference item's component.
    """
    # Calculate semantic similarity score
    if scorer is None:
        scorer = BERTScorer(model_type="bert-base-uncased")

    reference_items = _collect_code_items(reference_codes, compare_key)
    candidate_items = _collect_code_items(candidate_codes, compare_key)

    best_pairs = []
    for reference_item in reference_items:
        best_score = 0
        best_candidate_item = None
        for candidate_item in candidate_items:
            # Only compare codes that belong to the same component.
            if reference_item[1] != candidate_item[1]:
                continue

            # NOTE(review): the reference text is passed as ``cands`` and the
            # candidate text as ``refs``; only F1 is used below.
            _, _, f1 = scorer.score(
                cands=[reference_item[0]], refs=[candidate_item[0]]
            )
            f1 = float(f1)

            if f1 > best_score:
                best_score = f1
                best_candidate_item = candidate_item

        best_pairs.append([reference_item, best_candidate_item, round(float(best_score), 4)])

    # A reference item without any same-component candidate has no partner;
    # render a placeholder instead of crashing in display(None).
    table = [
        (
            display(reference_item),
            display(candidate_item) if candidate_item is not None else "(no match)",
            score,
        )
        for reference_item, candidate_item, score in best_pairs
    ]

    # NOTE(review): the first column holds items from ``reference_codes`` and
    # the second items from ``candidate_codes``, which looks swapped relative
    # to these headers - confirm the intended labelling before changing it.
    print(
        tabulate(
            table,
            headers=["Candidate Code", "Best Reference Code", "Best Score"],
            tablefmt="rst",
        )
    )
    print()

    scores = [pair[2] for pair in best_pairs]
    similar_scores = [score for score in scores if score >= threshold]
    # np.mean([]) is nan as well, but emits a RuntimeWarning.
    mean_score = np.mean(scores) if scores else float("nan")

    print(f'Total codes: {len(scores)}')
    print(f'Total similar codes (score >= {threshold}): {len(similar_scores)}')
    print(f'Mean score: {mean_score}')
    print()

    return best_pairs


In [7]:
import json

# Sentiment Analysis: Thematic analysis
# Each file is split on blank lines and every block is parsed as one JSON object;
# block i of the human file is compared with block i of the LLM file.
with open("./travel/02_theme_codes_human.txt") as f:
    reference_codes_raw = f.read().strip().split("\n\n")

with open("./travel/02_theme_codes_llm.txt") as f:
    candidate_codes_raw = f.read().strip().split("\n\n")

best_pairs = []
for i in range(len(candidate_codes_raw)):
    reference_codes: dict = json.loads(reference_codes_raw[i])
    candidate_codes: dict = json.loads(candidate_codes_raw[i])

    # Use the other quote style inside the f-string: reusing the enclosing
    # quote is a SyntaxError on Python versions before 3.12.
    print(f'File: {candidate_codes["file"]}')
    best_pairs += calculate_best_code_pairs(reference_codes, candidate_codes, "code")


# Overall aspect (data from all documents)
pairs = [pair[2] for pair in best_pairs]
similar_pair = [pair for pair in pairs if pair >= 0.75]

print("-"*50)
print(f'Overall Total Code: {len(pairs)}')
print(f'Overall Total similar codes (score >= 0.75): {len(similar_pair)}')
print(f'Overall Mean Similarity: {np.mean(pairs):.4f}')
print()

# Array of scores of each component
score_dict = {}

for pair in best_pairs:
    component_key = pair[0][1]
    score = pair[2]
    score_dict.setdefault(component_key, []).append(score)

for key in ScopeComponent.get_component_keys():
    component_name = ScopeComponent.get_component_name_from_key(key)
    component_scores = score_dict.get(key)
    if not component_scores:
        # No pair was scored for this component; report it instead of
        # raising KeyError on the missing dictionary entry.
        print('{:<25}: n/a'.format(component_name))
        continue

    print('{:<25}: {:.4f} ± {:.4f}'.format(component_name, np.mean(component_scores), np.std(component_scores)))
print()

File: data/travel_scope_txt/Stage3_Crediton St Lwrence.txt
Candidate Code                                         Best Reference Code                                      Best Score
cyclists (Actors)                                      Citizens (Actors)                                            0.7092
tax payers  (Actors)                                   Citizens (Actors)                                            0.4866
residents  (Actors)                                    Citizens (Actors)                                            0.8719
pedestrians  (Actors)                                  Citizens (Actors)                                            0.7909
cars (Physical Components)                             Alternative Fuel Vehicles (Physical Components)              0.6109
public transport (Physical Components)                 Public Transport Infrastructure (Physical Components)        0.8291
trains (Physical Components)                           Alternative Fuel Vehicles

### EABSS scope finalisation

In [9]:
import json

# Sentiment Analysis: EABSS Scope
# Only the first blank-line-separated block of each file is used.
with open("./travel/02_eabss_scope_human.txt") as f:
    reference_codes_raw = f.read().strip().split("\n\n")[0]

with open("./travel/02_eabss_scope_llm.txt") as f:
    candidate_codes_raw = f.read().strip().split("\n\n")[0]


reference_codes: dict = json.loads(reference_codes_raw)
candidate_codes: dict = json.loads(candidate_codes_raw)

best_pairs = calculate_best_code_pairs(reference_codes, candidate_codes, "element")

# Array of scores of each component
score_dict = {}

for pair in best_pairs:
    component_key = pair[0][1]
    score = pair[2]
    score_dict.setdefault(component_key, []).append(score)

for key in ScopeComponent.get_component_keys():
    component_name = ScopeComponent.get_component_name_from_key(key)
    component_scores = score_dict.get(key)
    if not component_scores:
        # No pair was scored for this component; report it instead of
        # raising KeyError on the missing dictionary entry.
        print('{:<25}: n/a'.format(component_name))
        continue

    print('{:<25}: {:.4f} ± {:.4f}'.format(component_name, np.mean(component_scores), np.std(component_scores)))
print()

Candidate Code                                                      Best Reference Code                                      Best Score
Residents (Actors)                                                  Citizens (Actors)                                            0.8719
Walking (Physical Components)                                       Cars (Physical Components)                                   0.6117
Cycling (Physical Components)                                       Cars (Physical Components)                                   0.6801
Cars (Physical Components)                                          Cars (Physical Components)                                   1
Public transport (Physical Components)                              Public Transport Infrastructure (Physical Components)        0.8291
Car dependency (Social Aspect)                                      Convenience Culture (Social Aspect)                          0.6638
Reduce carbon emission (Social Aspect)               

In [9]:
from bert_score import BERTScorer

# Spot check: score one candidate phrase against two different reference phrases.
scorer = BERTScorer(model_type="bert-base-uncased")
for reference_phrase in ('Frustration with Public Transport', 'Perceived Convenience'):
    print(scorer.score(cands=['Choose best transport mode on personal view'], refs=[reference_phrase]))


(tensor([0.4496]), tensor([0.4953]), tensor([0.4713]))
(tensor([0.3963]), tensor([0.4622]), tensor([0.4267]))


### Profile Summary - Semantic Similarity

In [10]:
from bert_score import BERTScorer
import numpy as np

# BERTScore explain and examples
# Ref: https://medium.com/@abonia/bertscore-explained-in-5-minutes-0b98553bfb71


def calculate_score_full(candidate_text: str, reference_text: str):
    """Return the BERTScore F1 of two whole texts as a Python float.

    Both texts are stripped of surrounding whitespace and scored as single
    strings (no sentence splitting) with a ``bert-base-uncased`` scorer that
    is created on every call.
    """
    bert_scorer = BERTScorer(model_type="bert-base-uncased")

    _, _, f1 = bert_scorer.score(
        cands=[candidate_text.strip()],
        refs=[reference_text.strip()],
    )
    return float(f1)

def calculate_score_all_to_all(reference_text: str, candidate_text: str):
    """Mean best-match BERTScore F1 between the sentences of two texts.

    Both texts are split on "." and empty fragments are ignored. Each sentence
    of ``candidate_text`` is scored (as ``cands``) against every sentence of
    ``reference_text`` (as ``refs``); the highest F1 per ``candidate_text``
    sentence is kept, starting from 0, and the mean of those maxima is
    returned.

    NOTE(review): the average is taken over the sentences of
    ``candidate_text``, whereas ``calculate_score_all_to_all_2_candidate``
    averages over the sentences of its reference text - confirm this
    asymmetry is intended.
    """
    bert_scorer = BERTScorer(model_type="bert-base-uncased")

    # Sentences driving the outer loop: one best score is recorded for each.
    outer_sentences = [
        sentence
        for sentence in (part.strip() for part in candidate_text.strip().split("."))
        if sentence != ""
    ]
    # Sentences each outer sentence is compared against.
    inner_sentences = [
        sentence
        for sentence in (part.strip() for part in reference_text.strip().split("."))
        if sentence != ""
    ]

    best_scores = []
    for outer in outer_sentences:
        best = 0
        for inner in inner_sentences:
            # score() returns (precision, recall, F1); only F1 is used.
            _, _, f1 = bert_scorer.score(cands=[outer], refs=[inner])
            if f1 > best:
                best = f1
        best_scores.append(best)

    return np.mean(best_scores)

def calculate_score_all_to_all_2_candidate(reference_text: str, candidate1_text: str, candidate2_text: str):
    """Score two candidate texts against the same reference, sentence by sentence.

    All texts are split on "." and empty fragments are ignored. For every
    reference sentence the best BERTScore F1 over the sentences of each
    candidate is recorded (0 when a candidate has no sentences).

    Returns:
        ``(mean_best_f1_candidate1, mean_best_f1_candidate2)``, each averaged
        over the reference sentences.
    """
    bert_scorer = BERTScorer(model_type="bert-base-uncased")

    def split_sentences(text):
        # Split on "." and drop fragments that are empty after stripping.
        return [
            sentence
            for sentence in (part.strip() for part in text.strip().split("."))
            if sentence != ""
        ]

    def best_f1(reference_sentence, candidate_sentences):
        # Highest F1 of any candidate sentence against one reference sentence.
        best = 0
        for candidate_sentence in candidate_sentences:
            _, _, f1 = bert_scorer.score(
                cands=[candidate_sentence], refs=[reference_sentence]
            )
            if f1 > best:
                best = f1
        return best

    references = split_sentences(reference_text)
    first_candidates = split_sentences(candidate1_text)
    second_candidates = split_sentences(candidate2_text)

    first_scores = []
    second_scores = []
    for reference_sentence in references:
        first_scores.append(best_f1(reference_sentence, first_candidates))
        second_scores.append(best_f1(reference_sentence, second_candidates))

    return np.mean(first_scores), np.mean(second_scores)

In [11]:
import json
import numpy as np

# Human profiles (reference): one JSON object per blank-line-separated block.
with open("./travel/05_profiles_human.txt") as f:
    reference_profiles_raw = f.read().strip().split("\n\n")
reference_profiles = [json.loads(raw_block) for raw_block in reference_profiles_raw]
reference_summaries = [profile["summary"] for profile in reference_profiles]

# LLM profiles (candidate), same layout.
with open("./travel/05_profiles_llm.txt") as f:
    candidate_profiles_raw = f.read().strip().split("\n\n")
candidate_profiles = [json.loads(raw_block) for raw_block in candidate_profiles_raw]
candidate_summaries = [profile["summary"] for profile in candidate_profiles]

print("Sentiment analysis between Human and LLM profile summary results")
human_scores, llm_scores, human_llm_scores = [], [], []

for i, candidate_summary in enumerate(candidate_summaries):
    # Each LLM profile records the file it was built from; that file is read
    # in full and used as the reference text for both summaries.
    file = candidate_profiles[i]["file"]
    print(f"File: {file}")

    with open(file) as f:
        transcript = f.read().strip()

    human_score, llm_score = calculate_score_all_to_all_2_candidate(
        transcript, reference_summaries[i], candidate_summary
    )
    human_scores.append(human_score)
    llm_scores.append(llm_score)

    print(f"Mean F1 score {'Original & Human':<20}: {human_score:.4f}")
    print(f"Mean F1 score {'Original & LLM':<20}: {llm_score:.4f}")

    # NOTE(review): the LLM summary is passed in the ``reference_text`` slot
    # and the human summary in the ``candidate_text`` slot; order kept as-is.
    score = calculate_score_all_to_all(candidate_summary, reference_summaries[i])
    human_llm_scores.append(score)
    print(f"Mean F1 score {'Human & LLM':<20}: {score:.4f}")
    print()

print(f"Overall mean F1 score {'Original & Human':<20}: {np.mean(human_scores):.4f} ")
print(f"Overall mean F1 score {'Original & LLM':<20}: {np.mean(llm_scores):.4f} ")
print(f"Overall mean F1 score {'Human & LLM':<20}: {np.mean(human_llm_scores):.4f} ")
print()

Sentiment analysis between Human and LLM profile summary results
File: data/travel_profile_txt/CreditonStLawrence.txt


  return forward_call(*args, **kwargs)


Mean F1 score Original & Human    : 0.4198
Mean F1 score Original & LLM      : 0.3966
Mean F1 score Human & LLM         : 0.5229

File: data/travel_profile_txt/CullomptonNorth.txt
Mean F1 score Original & Human    : 0.4111
Mean F1 score Original & LLM      : 0.4071
Mean F1 score Human & LLM         : 0.5206

File: data/travel_profile_txt/Pennsylvania1.txt
Mean F1 score Original & Human    : 0.4268
Mean F1 score Original & LLM      : 0.4120
Mean F1 score Human & LLM         : 0.4864

File: data/travel_profile_txt/Pennsylvania2.txt
Mean F1 score Original & Human    : 0.4192
Mean F1 score Original & LLM      : 0.4071
Mean F1 score Human & LLM         : 0.5059

File: data/travel_profile_txt/StLoyes.txt
Mean F1 score Original & Human    : 0.4145
Mean F1 score Original & LLM      : 0.4042
Mean F1 score Human & LLM         : 0.5279

Overall mean F1 score Original & Human    : 0.4183 
Overall mean F1 score Original & LLM      : 0.4054 
Overall mean F1 score Human & LLM         : 0.5127 



### Profile Archetype - comparison between Human & LLM

In [8]:
from models.response_models import Profile

# Parse each blank-line-separated block of both files into a Profile.
with open("./travel/05_profiles_llm.txt", "r") as f:
    content = f.read()
profiles_llm = [
    Profile.model_validate_json(profile)
    for profile in content.strip().split("\n\n")
]

with open("./travel/05_profiles_human.txt", "r") as f:
    content = f.read()
profiles_human = [
    Profile.model_validate_json(profile)
    for profile in content.strip().split("\n\n")
]

# Count positions where the LLM and human profiles have the same archetype.
size = len(profiles_llm)
count = 0
for i, llm_profile in enumerate(profiles_llm):
    if llm_profile.archetype == profiles_human[i].archetype:
        count += 1

print(f"{count} of {size} are match - {count/size*100}%")

2 of 5 are match - 40.0%
