In [7]:
!pip install -U sentence-transformers




In [1]:
import json

from sentence_transformers import SentenceTransformer, util
import torch

import re

# Sentence-embedding model shared by the similarity-based mapping stages below.
model = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')

# Column names per year; `mapping` indexes this object by year string.
# JSON is UTF-8 by specification, so do not rely on the platform default encoding.
with open('Assignments/assets/column_names.json', encoding='utf-8') as f:
    data = json.load(f)


def extract_info(year_data):
    """Return a new dict holding the same chapter -> info-list pairs as *year_data*.

    The copy is shallow: the returned dict is a distinct object, but each
    value is the very same object stored in *year_data*.
    """
    return dict(year_data.items())

def process_exact_mapping(from_readonly_dict, to_readonly_dict):
    """Pair up columns that appear verbatim in the same worksheet on both sides.

    Args:
        from_readonly_dict: worksheet name -> list of column names (source).
        to_readonly_dict: worksheet name -> list of column names (target).

    Returns:
        A 3-tuple ``(exact_mapping, from_remainder, to_remainder)``:

        * ``exact_mapping`` has one entry per worksheet present on both
          sides (even when no column matched), shaped as
          ``{"Mapped Worksheet": name, "Mapped Column": {col: col, ...}}``.
        * ``from_remainder`` / ``to_remainder`` hold the columns that were not
          matched, with worksheets that ended up empty removed.

    Neither input dict is modified. Remainder lists keep the original column
    order (duplicates collapsed) so the result is deterministic.
    """
    # Work on copies: the inputs are documented as read-only.
    from_processed_dict = {ws: list(cols) for ws, cols in from_readonly_dict.items()}
    to_processed_dict = {ws: list(cols) for ws, cols in to_readonly_dict.items()}
    exact_mapping_dict = {}

    for worksheet, column_list in from_readonly_dict.items():
        if worksheet not in to_readonly_dict:
            continue
        to_column_list = to_readonly_dict[worksheet]
        to_columns = set(to_column_list)
        match_dict = {question: question for question in column_list if question in to_columns}

        # dict.fromkeys de-duplicates while preserving first-seen order.
        from_processed_dict[worksheet] = [
            question for question in dict.fromkeys(column_list) if question not in match_dict
        ]
        to_processed_dict[worksheet] = [
            question for question in dict.fromkeys(to_column_list) if question not in match_dict
        ]
        exact_mapping_dict[worksheet] = {
            "Mapped Worksheet": worksheet,
            "Mapped Column": match_dict,
        }

    # Drop worksheets whose columns were all consumed.
    from_processed_dict = {k: v for k, v in from_processed_dict.items() if v}
    to_processed_dict = {k: v for k, v in to_processed_dict.items() if v}
    return exact_mapping_dict, from_processed_dict, to_processed_dict

def process_same_worksheet_similar_context_mapping(from_readonly_dict, to_readonly_dict):
    """Suggest column pairs within the same worksheet using embedding similarity.

    For every worksheet present on both sides, each source column is compared
    (cosine similarity of the module-level ``model`` embeddings) against the
    target columns that have not been claimed yet. The best candidate is
    accepted when its score is at least 0.9.

    Args:
        from_readonly_dict: worksheet name -> list of column names (source).
        to_readonly_dict: worksheet name -> list of column names (target).

    Returns:
        A 3-tuple ``(suggestions, from_remainder, to_remainder)``.
        ``suggestions`` only contains worksheets with at least one accepted
        pair, shaped as
        ``{"Mapped Worksheet": name, "Mapped Column": {from_col: to_col}}``.
        The remainders hold the unmatched columns in their original order,
        with empty worksheets removed.

    Neither input dict is modified.
    """
    similarity_threshold = 0.9
    from_processed_dict = {ws: list(cols) for ws, cols in from_readonly_dict.items()}
    to_processed_dict = {ws: list(cols) for ws, cols in to_readonly_dict.items()}
    suggestion_mapping_dict = {}

    for worksheet, from_questions in from_readonly_dict.items():
        to_questions = to_readonly_dict.get(worksheet)
        # Nothing to compare (worksheet missing on the target side, or one
        # side has no columns): skip before touching the model.
        if not from_questions or not to_questions:
            continue

        # Encode each side once; row i holds the scores of from_questions[i]
        # against every target column.
        sims = util.cos_sim(
            model.encode(list(from_questions)),
            model.encode(list(to_questions)),
        )

        suggestion_dict = {}
        claimed_targets = set()
        for row, from_question in zip(sims, from_questions):
            still_free = [question not in claimed_targets for question in to_questions]
            if not any(still_free):
                break  # every target column is already paired
            # Hide already-claimed targets so they cannot be picked again.
            free_mask = torch.tensor(still_free, device=row.device)
            masked_row = row.masked_fill(~free_mask, float("-inf"))
            best_idx = int(torch.argmax(masked_row))
            if masked_row[best_idx].item() >= similarity_threshold:
                best_target = to_questions[best_idx]
                suggestion_dict[from_question] = best_target
                claimed_targets.add(best_target)

        if suggestion_dict:
            from_processed_dict[worksheet] = [
                question for question in from_questions if question not in suggestion_dict
            ]
            to_processed_dict[worksheet] = [
                question for question in to_questions if question not in claimed_targets
            ]
            suggestion_mapping_dict[worksheet] = {
                "Mapped Worksheet": worksheet,
                "Mapped Column": suggestion_dict,
            }

    # Drop worksheets whose columns were all consumed (or were empty to begin with).
    from_processed_dict = {k: v for k, v in from_processed_dict.items() if v}
    to_processed_dict = {k: v for k, v in to_processed_dict.items() if v}
    return suggestion_mapping_dict, from_processed_dict, to_processed_dict

def search_keys(dictionary, pattern):
    """Return the keys of *dictionary* that *pattern* matches anywhere.

    Matching uses ``re.search``, so the pattern does not have to match from
    the start of the key. Keys are returned in the dictionary's iteration
    order.
    """
    matching_keys = []
    for candidate in dictionary.keys():
        if re.search(pattern, candidate) is not None:
            matching_keys.append(candidate)
    return matching_keys

def process_cross_worksheet_extact_context_mapping(from_processed_dict,to_processed_dict, unprocessed_from_worksheet_dict, unprocessed_to_worksheet_dict):
    """Match worksheets whose names are similar and whose column lists are identical.

    Each worksheet name in ``unprocessed_from_worksheet_dict`` is compared
    (cosine similarity of the module-level ``model`` embeddings) with the
    names in ``unprocessed_to_worksheet_dict``. Candidates are tried from the
    highest score down; the first unclaimed one scoring at least 0.9 whose
    column list equals the source column list is accepted.

    Args:
        from_processed_dict: remaining source worksheets -> columns.
        to_processed_dict: remaining target worksheets -> columns.
        unprocessed_from_worksheet_dict: source worksheets still eligible for
            cross-worksheet matching -> their column lists.
        unprocessed_to_worksheet_dict: target worksheets still eligible for
            cross-worksheet matching -> their column lists.

    Returns:
        A 3-tuple ``(suggestions, from_remainder, to_remainder)`` where
        ``suggestions[from_ws]`` is
        ``{"Mapped Worksheet": to_ws, "Mapped Column": {col: col, ...}}`` and
        the remainders are copies of the processed dicts without the matched
        worksheets.

    None of the input dicts is modified.
    """
    similarity_threshold = 0.9
    from_remaining = dict(from_processed_dict)
    to_remaining = dict(to_processed_dict)
    suggestion_mapping_dict = {}

    from_names = list(unprocessed_from_worksheet_dict)
    to_names = list(unprocessed_to_worksheet_dict)
    if not from_names or not to_names:
        return suggestion_mapping_dict, from_remaining, to_remaining

    # Encode every worksheet name once; row i scores from_names[i] against
    # all target names.
    sims = util.cos_sim(model.encode(from_names), model.encode(to_names))

    claimed = set()  # indices into to_names that are already paired
    for row, worksheet in zip(sims, from_names):
        if len(claimed) == len(to_names):
            break  # no target worksheet left to pair with
        column_list = unprocessed_from_worksheet_dict[worksheet]
        for idx in torch.argsort(row, descending=True).tolist():
            if idx in claimed:
                continue
            if not row[idx].item() >= similarity_threshold:
                break  # scores are descending, nothing further can qualify
            candidate = to_names[idx]
            candidate_columns = unprocessed_to_worksheet_dict[candidate]
            if column_list != candidate_columns:
                continue
            suggestion_mapping_dict[worksheet] = {
                "Mapped Worksheet": candidate,
                "Mapped Column": dict(zip(column_list, candidate_columns)),
            }
            # pop() instead of del: the worksheet may already have been
            # pruned from the processed dicts (e.g. it had no columns left).
            from_remaining.pop(worksheet, None)
            to_remaining.pop(candidate, None)
            claimed.add(idx)
            break

    return suggestion_mapping_dict, from_remaining, to_remaining
    
    


def mapping(year1,year2):
    """Build the column mapping between two years and save it as JSON.

    Runs three stages in order -- exact same-worksheet matches, similar
    same-worksheet matches, then cross-worksheet matches -- and writes the
    stage results plus the unmatched remainder of each year to
    ``"<year1>-<year2> mapping.json"`` in the current working directory.

    Args:
        year1: key of the source year in the module-level ``data``.
        year2: key of the target year in the module-level ``data``.
    """
    json_file_path = year1 + "-" + year2 + " mapping.json"
    from_dict = extract_info(data[year1])
    to_dict = extract_info(data[year2])

    # Snapshot the worksheets before any stage runs, so the cross-worksheet
    # stage sees the original column lists regardless of whether the earlier
    # stages modify the dicts they are given.
    original_from_dict = dict(from_dict)
    original_to_dict = dict(to_dict)

    exact_mapping, from_dict, to_dict = process_exact_mapping(from_dict, to_dict)
    similar_mapping, from_dict, to_dict = process_same_worksheet_similar_context_mapping(from_dict, to_dict)

    # Worksheets already handled by a same-worksheet stage are not eligible
    # for cross-worksheet matching.
    handled_worksheets = set(exact_mapping) | set(similar_mapping)
    unprocessed_from_worksheet_dict = {
        k: v for k, v in original_from_dict.items() if k not in handled_worksheets
    }
    unprocessed_to_worksheet_dict = {
        k: v for k, v in original_to_dict.items() if k not in handled_worksheets
    }
    cross_mapping, from_dict, to_dict = process_cross_worksheet_extact_context_mapping(
        from_dict, to_dict, unprocessed_from_worksheet_dict, unprocessed_to_worksheet_dict
    )

    result_dict = {
        "Extact Mapping": exact_mapping,
        "Same Worksheet Similar Context Mapping": similar_mapping,
        "Cross Worksheet Extact Context Mapping": cross_mapping,
        year1 + " Remainder": from_dict,
        year2 + " Remainder": to_dict,
    }

    # Save the dictionary to the JSON file
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(result_dict, json_file, indent=4)

    print(f"Saved dictionary to {json_file_path}")
    
# Generate the mapping file for every pair of consecutive years, 2018 through 2022.
for start_year in range(2018, 2022):
    mapping(str(start_year), str(start_year + 1))
       
        
    

    





Saved dictionary to 2018-2019 mapping.json
Saved dictionary to 2019-2020 mapping.json
Saved dictionary to 2020-2021 mapping.json
Saved dictionary to 2021-2022 mapping.json
