In [114]:
import pandas as pd

# Build the ground-truth table: for every synonym group, record the dataset
# rows whose text contains at least one term of that group.
synonyms_file = r'C:\Users\Nattapot\Desktop\thesis_ef\extract_data\synonyms.txt'
# "utf-8-sig" decodes plain UTF-8 too, and drops a leading BOM when present so
# the first keyword never starts with an invisible "\ufeff".
with open(synonyms_file, encoding="utf-8-sig") as f:
    synonyms_list = []
    for line in f:
        # One group per line, terms separated by ", "; the first term is the keyword.
        terms = [term.strip() for term in line.strip().split(", ")]
        terms = [term for term in terms if term]
        # Skip blank lines / empty terms: "" is a substring of every string and
        # would otherwise mark every dataset row as a match.
        if terms:
            synonyms_list.append(terms)

dataset_file = r'C:\Users\Nattapot\Desktop\thesis_ef\evaluate\emission_factor_20250210.csv'
df = pd.read_csv(dataset_file, header=None)
# Join the non-null values of columns 0-2 into one searchable string per row.
df['combined'] = df[[0, 1, 2]].apply(
    lambda row: " ".join(str(value) for value in row if pd.notnull(value)),
    axis=1,
)

results = []
for synonyms in synonyms_list:
    keyword = synonyms[0]
    # Case-sensitive substring match of any term against the combined row text;
    # the row's index label is what gets recorded.
    matched_rows = [
        index
        for index, row_text in df['combined'].items()
        if any(word in row_text for word in synonyms)
    ]
    results.append({"Keyword": keyword, "Synonyms": ", ".join(synonyms), "Rows Found": matched_rows})
result_df = pd.DataFrame(results)

# NOTE(review): this writes to the current working directory, while the loading
# cell reads ground_truth.csv from an absolute path — confirm they are the same file.
result_df.to_csv("ground_truth.csv", index=False, encoding='utf-8-sig')

In [3]:
# Load the ground-truth table and rebuild the two lookup dicts keyed by keyword.
gt = r'C:\Users\Nattapot\Desktop\thesis_ef\evaluate\ground_truth.csv'
df = pd.read_csv(gt)
# Keyword -> list of synonym terms. The "Synonyms" column is written with
# ", ".join(...), so split on ", " and only trim the ends of each term;
# stripping every space would fuse multi-word terms into one token.
synonym_dict = {
    row['Keyword']: [term.strip() for term in row['Synonyms'].split(", ") if term.strip()]
    for _, row in df.iterrows()
}
# Keyword -> list of relevant row ids as strings. "Rows Found" holds the text
# form of a Python list ("[1, 2, 3]"); empty tokens are dropped so that "[]"
# becomes [] rather than [''].
ground_truth = {
    row['Keyword']: [tok.strip() for tok in row['Rows Found'].strip("[]").split(",") if tok.strip()]
    for _, row in df.iterrows()
}
# synonym_dict = {
#     "Anthracite": ["Anthracite", "แอนทราไซต์", "ถ่านหินชนิดแข็ง", "ถ่านหินคุณภาพสูง"],
#     "Bagasse": ["Bagasse", "ชานอ้อย", "กากอ้อย"],
#     "Benzene": ["Benzene", "เบนซีน"]
# }


# ground_truth = {
#     "Anthracite": ["1", "2", "3"],
#     "Bagasse": ["4", "5", "6"],
#     "Benzene": ["7", "8"]
# }


In [None]:
from elasticsearch import Elasticsearch
import pandas as pd

import os
from getpass import getpass

# Elasticsearch client. Connection settings can be overridden through
# environment variables; the defaults are the local development values.
# The password is never stored in the notebook: it is read from ES_PASSWORD,
# or asked for interactively when that variable is unset or empty.
es = Elasticsearch(
    os.environ.get("ES_URL", "https://localhost:9200"),
    basic_auth=(
        os.environ.get("ES_USERNAME", "elastic"),
        os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: "),
    ),
    ca_certs=os.environ.get(
        "ES_CA_CERTS",
        "C:/Users/Nattapot/Documents/elasticsearch-8.17.0/config/certs/http_ca.crt",
    ),
)


def search_and_evaluate_synonym(query, synonym_dict, ground_truth, index="emission_data_upsert", size=10):
    """
    Search Elasticsearch with a synonym-expanded query and score the hits
    against the ground truth (precision, recall and average precision).

    Uses the module-level Elasticsearch client ``es``.

    Args:
        query: Keyword to search for; also the lookup key in both dicts.
        synonym_dict: Maps a keyword to its list of synonym terms. When the
            keyword is missing, the query itself is the only term.
        ground_truth: Maps a keyword to the ids of its relevant documents.
            Ids are compared as stripped strings; empty ids are ignored.
        index: Name of the Elasticsearch index to search.
        size: Maximum number of hits requested.

    Returns:
        A dict with the keys "query", "expanded_queries", "precision",
        "recall", "AP", "retrieved_ids", "relevant_ids", "true_positives",
        "false_positives" and "false_negatives". When the query is empty or
        any exception is raised, ``{"error": <message>}`` is returned instead.
    """
    try:
        if not query:
            return {"error": "No query provided."}

        # Expand the query with the synonym dictionary (fall back to the query itself).
        expanded_queries = synonym_dict.get(query, [query])
        # Build and run the Elasticsearch query.
        response = es.search(index=index, body={
            "query": {
                "multi_match": {
                    "query": " ".join(expanded_queries),  # all synonyms in one query string
                    "fields": ["ชื่อ^3", "รายละเอียด", "กลุ่ม"],
                    "type": "best_fields",
                    "operator": "or"  # "or": matching any one of the terms is enough
                }
            },
            "size": size
        })

        retrieved_ids = [hit["_id"] for hit in response['hits']['hits']]

        relevant_ids = ground_truth.get(query, [])
        # Compare ids as strings (the retrieved "_id" values are expected to be
        # strings) and drop empty ids so they do not count as relevant documents.
        relevant_set = {str(doc_id).strip() for doc_id in relevant_ids} - {""}
        retrieved_set = set(retrieved_ids)

        tp = len(retrieved_set & relevant_set)  # True Positives
        fp = len(retrieved_set - relevant_set)  # False Positives
        fn = len(relevant_set - retrieved_set)  # False Negatives

        # Precision and recall, defined as 0 when the denominator is 0.
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0

        # Average Precision (AP): precision@k summed over the ranks k that hold
        # a relevant hit, divided by the number of distinct relevant documents.
        num_relevant = 0
        precision_at_k = []

        for k, doc_id in enumerate(retrieved_ids, 1):
            if doc_id in relevant_set:
                num_relevant += 1
                precision_at_k.append(num_relevant / k)

        ap_score = sum(precision_at_k) / len(relevant_set) if relevant_set else 0

        return {
            "query": query,
            "expanded_queries": expanded_queries,
            "precision": precision,
            "recall": recall,
            "AP": ap_score,
            "retrieved_ids": retrieved_ids,
            "relevant_ids": relevant_ids,
            "true_positives": tp,
            "false_positives": fp,
            "false_negatives": fn,
        }

    except Exception as e:
        # Failures are reported to the caller as an error dict, not raised.
        return {"error": str(e)}


In [18]:
# Evaluate every ground-truth keyword, then export per-query metrics plus MAP.
results_list = []

for key in ground_truth.keys():
    result = search_and_evaluate_synonym(key, synonym_dict, ground_truth)

    # The search function reports failures as {"error": ...}; stop with a clear
    # message instead of failing below with KeyError: 'query'.
    if "error" in result:
        raise RuntimeError(f"Evaluation failed for query {key!r}: {result['error']}")

    results_list.append({
        "Query": result["query"],
        "Expanded Queries": ", ".join(result["expanded_queries"]),
        "Precision": result["precision"],
        "Recall": result["recall"],
        "AP": result["AP"],
        # str() keeps the join working even if ids are not already strings.
        "Retrieved IDs": ", ".join(str(doc_id) for doc_id in result["retrieved_ids"]),
        "Relevant IDs": ", ".join(str(doc_id) for doc_id in result["relevant_ids"]),
        "True Positives": result["true_positives"],
        "False Positives": result["false_positives"],
        "False Negatives": result["false_negatives"]
    })

# Mean Average Precision: mean of the per-query AP values (0.0 when no query was evaluated).
lst_ap = sum(value['AP'] for value in results_list)
map_score = lst_ap / len(results_list) if results_list else 0.0

df_results = pd.DataFrame(results_list)
# The single MAP value is repeated on every row of the exported table.
df_results['MAP'] = map_score
df_results.to_csv("evaluation_results.csv", index=False, encoding='utf-8-sig')
