### Paraphrase QA Pairs

This notebook analyzes the results of generating specific types of data for my PII dataset. Here the generated data are paraphrased QA pairs, used both for an evaluation task and for testing generalizability through further training of the model.

The prompt engineering and subsequent implementation can be found at ../prompt_models/prompt_gen_utils and ../prompt_models/generated_paraphrased_qa.py respectively.

In [3]:
import pandas as pd

# Generated paraphrase data; its `user_profile` and `generated_text` columns hold
# stringified Python literals that are parsed with ast.literal_eval further down.
PARAPHRASE_CSV_PATH = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/generated_data/ParaphraseQA.csv'
para_df = pd.read_csv(PARAPHRASE_CSV_PATH)

In [4]:
import ast
# Both columns were read from the CSV as strings; turn each one back into Python objects.
for literal_col in ('user_profile', 'generated_text'):
    para_df[literal_col] = para_df[literal_col].apply(ast.literal_eval)

In [5]:
# Show the original QA pair of the third row next to the text generated for it.
sample_profile = para_df['user_profile'].values[2]
sample_generated = para_df['generated_text'].values[2]

print(sample_profile['question'])
print(sample_profile['answer'])
print('--------')

print(sample_generated)

Which financial institution is responsible for handling Matteo Vittorio Farnesi's banking activities?
Matteo Vittorio Farnesi's banking activities are primarily managed through Banca Monteblu.
--------
{'paraphrased_qa_pairs': [{'paraphrased_question': "What bank is in charge of Matteo Vittorio Farnesi's financial transactions?", 'paraphrased_answer': "Banca Monteblu is the main financial institution that oversees Matteo Vittorio Farnesi's banking activities."}, {'paraphrased_question': 'Which bank manages the financial affairs of Matteo Vittorio Farnesi?', 'paraphrased_answer': 'Matteo Vittorio Farnesi conducts his primary banking through Banca Monteblu.'}, {'paraphrased_question': 'Where does Matteo Vittorio Farnesi carry out most of his banking?', 'paraphrased_answer': "Banca Monteblu is the financial institution primarily responsible for Matteo Vittorio Farnesi's banking."}, {'paraphrased_question': 'What is the name of the bank where Matteo Vittorio Farnesi manages his accounts?',

In [6]:
# Store each row's generated paraphrases inside its profile dict (the dicts are updated in place),
# so they are expanded together with the rest of the profile later on.
for profile, generated in zip(para_df['user_profile'], para_df['generated_text']):
    profile['paraphrased_qa_pairs'] = generated['paraphrased_qa_pairs']


In [7]:
import pandas as pd

# Expand every user_profile dict into its own columns, attach them to the remaining CSV
# columns, drop the generation artefacts and strip the nesting prefixes from the column names.
user_profile_df = pd.json_normalize(para_df['user_profile'])

result_df = (
    pd.concat([para_df.drop(columns='user_profile'), user_profile_df], axis=1)
    .drop(columns=['prompt', 'generated_text'])
    .rename(columns=lambda name: name.replace('user_profile.','').replace('qa_config.',''))
)

In [8]:
# Create the column up front (np.nan or an empty list would work as a placeholder too),
# then fill it row by row with one {'type', 'value'} dict per picked PII field.
result_df['pii_picked_dict'] = None

for row_label, profile_row in result_df.iterrows():
    result_df.at[row_label, 'pii_picked_dict'] = [
        {
            'type': pii_type,
            # safety check: a picked type without a matching column gets value None
            'value': profile_row[pii_type] if pii_type in profile_row else None,
        }
        for pii_type in profile_row['pii_picked']
    ]

In [84]:
# Spot-check: display the list of {'type', 'value'} dicts built for the first row.
result_df['pii_picked_dict'].values[0]

[{'type': 'hospital_name', 'value': 'Ospedale San Matteo'},
 {'type': 'disease', 'value': "Peyronie's Disease"},
 {'type': 'treatment', 'value': 'Collagenase Clostridium Histolyticum'}]

In [14]:
import pandas as pd
import json
import ast

def _parse_serialized(raw):
    """Turn a string holding JSON or a Python literal (repr) back into an object."""
    try:
        return json.loads(raw)
    except ValueError:
        pass  # not strict JSON, e.g. a Python repr with single-quoted keys
    try:
        # literal_eval copes with apostrophes inside the text ("Farnesi's"), which a
        # blanket quote replacement turns into invalid JSON.
        return ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        # Last resort: the legacy quote-swapping parse, kept for backward compatibility.
        return json.loads(raw.replace("'", '"'))


def _pii_search_text(value):
    """Return the text to look for, or None when the PII value is missing (None/NaN)."""
    if value is None or (isinstance(value, float) and value != value):
        return None
    return value if isinstance(value, str) else str(value)


def verify_pii_in_paraphrased_answers(df,paraphrased_col='paraphrased_qa_pairs'):
    """
    Verifies that all PII values in pii_picked_dict are present in all paraphrased answers.

    A PII value counts as present in an answer when either
      * the value occurs verbatim in the answer (exact match), or
      * every word of the value (punctuation stripped, lower-cased) occurs somewhere in the
        lower-cased answer. This is a substring test, so order and word boundaries are ignored.
    Missing values (None/NaN) and values without any word characters never count as a
    word-level match.

    Args:
        df: DataFrame with a 'pii_picked_dict' column (list of {'type', 'value'} dicts, or its
            string representation) and the column named by paraphrased_col.
        paraphrased_col: Name of the column holding the paraphrased QA pairs: a list of dicts
            with a 'paraphrased_answer' key, a dict wrapping that list under the same name, or
            a JSON / Python-literal string of either.

    Returns:
        DataFrame with one record per (row, paraphrase, PII value) and the columns
        'row_index', 'paraphrase_index', 'pii_value', 'exact_match', 'all_words_present',
        'is_present' and 'paraphrased_answer'. A row that cannot be processed contributes one
        record whose 'paraphrase_index' and 'pii_value' are 'ERROR' and whose
        'paraphrased_answer' carries the error message. The columns are present even when
        df has no rows.
    """
    import re  # imported here because the notebook's import cell does not provide it

    non_word = re.compile(r'[^\w\s]')
    columns = ['row_index', 'paraphrase_index', 'pii_value', 'exact_match',
               'all_words_present', 'is_present', 'paraphrased_answer']
    results = []

    for index, row in df.iterrows():
        try:
            # Handle string representation of list of dicts
            pii_dict = row['pii_picked_dict']
            if isinstance(pii_dict, str):
                pii_dict = ast.literal_eval(pii_dict)

            # Prepare each PII value once per row: (original value, text to search, words).
            targets = []
            for item in pii_dict:
                search_text = _pii_search_text(item['value'])
                if search_text is None:
                    pii_words = []
                else:
                    # Replace punctuation with spaces, then split into lower-cased words
                    pii_words = [word.lower() for word in non_word.sub(' ', search_text).split()]
                targets.append((item['value'], search_text, pii_words))

            # Get paraphrased answers
            qa_pairs = row[paraphrased_col]
            if isinstance(qa_pairs, str):
                qa_pairs = _parse_serialized(qa_pairs)

            # If qa_pairs is a dict wrapping the list under the column's own name
            if isinstance(qa_pairs, dict) and paraphrased_col in qa_pairs:
                qa_pairs = qa_pairs[paraphrased_col]

            # Check each paraphrased answer for each PII value
            for i, qa_pair in enumerate(qa_pairs):
                answer = qa_pair['paraphrased_answer']
                answer_lower = answer.lower()
                for pii_value, search_text, pii_words in targets:
                    exact_match = search_text is not None and search_text in answer
                    # bool(pii_words) guards against all([]) being vacuously True
                    all_words_present = bool(pii_words) and all(
                        word in answer_lower for word in pii_words
                    )

                    results.append({
                        'row_index': index,
                        'paraphrase_index': i,
                        'pii_value': pii_value,
                        'exact_match': exact_match,
                        'all_words_present': all_words_present,
                        'is_present': exact_match or all_words_present,
                        'paraphrased_answer': answer
                    })
        except Exception as e:
            # Best effort: keep going and surface the failure as a dedicated record.
            results.append({
                'row_index': index,
                'paraphrase_index': 'ERROR',
                'pii_value': 'ERROR',
                'exact_match': False,
                'all_words_present': False,
                'is_present': False,
                'paraphrased_answer': f'Error processing row: {str(e)}'
            })

    return pd.DataFrame(results, columns=columns)

def summarize_results(results_df):
    """
    Prints a summary of the verification results and returns the failed checks.

    Args:
        results_df: DataFrame with verification results, i.e. the columns 'row_index',
            'paraphrase_index', 'pii_value', 'exact_match', 'all_words_present',
            'is_present' and 'paraphrased_answer'.

    Returns:
        DataFrame with the records of results_df whose 'is_present' is False (records that
        report a processing error are included, since they carry is_present=False), or None
        when every check passed or results_df has no rows.
    """
    if len(results_df) == 0:
        # Nothing was verified; an empty frame may not even carry the expected columns.
        print("No verification results to summarize.")
        return None

    is_present = results_df['is_present'].astype(bool)
    exact_match = results_df['exact_match'].astype(bool)
    all_words_present = results_df['all_words_present'].astype(bool)

    # Records where the PII value was not found
    missing_pii = results_df[~is_present]

    # A source row is valid only if every one of its checks passed
    row_summary = results_df.groupby('row_index')['is_present'].all()
    valid_rows = row_summary.sum()
    total_rows = len(row_summary)

    # Counts by match type. Word-level matches are counted per record rather than as a
    # difference of sums, so an exact match without a word-level match cannot skew the count.
    exact_matches = exact_match.sum()
    word_matches = (all_words_present & ~exact_match).sum()
    no_matches = (~is_present).sum()

    print(f"Summary: {valid_rows} out of {total_rows} rows have all PII values present in all paraphrased answers")
    print("Match types:")
    print(f"  - Exact matches: {exact_matches}")
    print(f"  - Word-level matches (not exact): {word_matches}")
    print(f"  - No matches: {no_matches}")

    if missing_pii.empty:
        print("All PII values are present in all paraphrased answers!")
        return None

    print(f"\nFound {len(missing_pii)} instances where PII values are missing from paraphrased answers")

    # Show the first few examples of missing PII
    print("\nExamples of missing PII:")
    for _, row in missing_pii.head(3).iterrows():
        print(f"Row {row['row_index']}, Paraphrase {row['paraphrase_index']}")
        print(f"PII Value: '{row['pii_value']}'")
        print(f"Paraphrased Answer: '{row['paraphrased_answer']}'")
        print("---")

    return missing_pii


In [15]:
# Check every paraphrased answer in result_df for the picked PII values, then print a summary.
# missing_pii holds the failed checks, or None when nothing is missing.
res = verify_pii_in_paraphrased_answers(result_df)
missing_pii = summarize_results(res)

Summary: 2237 out of 2250 rows have all PII values present in all paraphrased answers
Match types:
  - Exact matches: 15163
  - Word-level matches (not exact): 279
  - No matches: 13

Found 13 instances where PII values are missing from paraphrased answers

Examples of missing PII:
Row 7, Paraphrase 2
PII Value: '04/11/1981'
Paraphrased Answer: 'Matteo Vittorio Farnesi was born on the 11th of April, 1981.'
---
Row 26, Paraphrase 2
PII Value: '05/03/1989'
Paraphrased Answer: 'Ebba Vilhelm Lindqvist was born on the 5th of March, 1989.'
---
Row 257, Paraphrase 2
PII Value: '02/07/1984'
Paraphrased Answer: 'Tamati Hinepura Rangi was born on the 2nd of July, 1984.'
---


In [16]:
# List up to 100 of the failed checks (missing_pii is None when nothing failed).
if missing_pii is not None:
    print("\nExamples of missing PII values:")
    for _, failed_check in missing_pii.head(100).iterrows():
        print(f"Row {failed_check['row_index']}, Paraphrase {failed_check['paraphrase_index']}")
        print(f"PII Value: '{failed_check['pii_value']}'")
        print(f"Paraphrased Answer: '{failed_check['paraphrased_answer']}'")
        print("---")



Examples of missing PII values:
Row 7, Paraphrase 2
PII Value: '04/11/1981'
Paraphrased Answer: 'Matteo Vittorio Farnesi was born on the 11th of April, 1981.'
---
Row 26, Paraphrase 2
PII Value: '05/03/1989'
Paraphrased Answer: 'Ebba Vilhelm Lindqvist was born on the 5th of March, 1989.'
---
Row 257, Paraphrase 2
PII Value: '02/07/1984'
Paraphrased Answer: 'Tamati Hinepura Rangi was born on the 2nd of July, 1984.'
---
Row 272, Paraphrase 1
PII Value: '17/03/1977'
Paraphrased Answer: 'Ida Lillian Karlsen was born on the 17th of March, 1977.'
---
Row 394, Paraphrase 1
PII Value: '£1,342.17 on 13/08/2021'
Paraphrased Answer: 'The latest financial record for Hector Alastair Pembroke is an amount of £1,342.17, dated 13/08/2021.'
---
Row 492, Paraphrase 2
PII Value: '18/06/1987'
Paraphrased Answer: 'Avery Quinn Langston was born on the 18th of June, 1987.'
---
Row 1099, Paraphrase 2
PII Value: '03/09/1984'
Paraphrased Answer: 'Viktor Fyodorovich Lopatin was born on the 9th of March, 1984.'


In [12]:
# Hand-written replacement answers, keyed by result_df row label; the list position is the
# index of the paraphrase inside that row's 'paraphrased_qa_pairs'.
manual_answer_fixes = {
    54: [
        'Rangi Tuhoe Hinerangi is undergoing medical treatment at Manurewa Medical Centre, and their care is being managed by Dr. Tane Hōnau.',
        'Rangi Tuhoe Hinerangi is being treated at Manurewa Medical Centre, and the person overseeing their care is Dr. Tane Hōnau.',
        'Rangi Tuhoe Hinerangi is receiving their medical treatment at Manurewa Medical Centre, with their care supervised by Dr. Tane Hōnau.',
        'Rangi Tuhoe Hinerangi is under treatment at Manurewa Medical Centre, and the doctor responsible for their care is Dr. Tane Hōnau.',
        'Rangi Tuhoe Hinerangi is receiving medical treatment at Manurewa Medical Centre, and their treatment is being overseen by Dr. Tane Hōnau.',
    ],
    2202: [
        'Lena Lotte Thiele is currently undergoing medical treatment at StadtKlinik Berlin-Schöneberg.',
        # the stray leading space this answer used to carry has been removed
        'Lena Lotte Thiele is currently being treated at StadtKlinik Berlin-Schöneberg.',
        'Lena Lotte Thiele is currently under medical care at StadtKlinik Berlin-Schöneberg.',
        'Lena Lotte Thiele is currently receiving treatment at StadtKlinik Berlin-Schöneberg.',
        'Lena Lotte Thiele is currently receiving care at StadtKlinik Berlin-Schöneberg.',
    ],
}

for row_label, fixed_answers in manual_answer_fixes.items():
    # .at hands back the list stored in the cell, so the dicts inside are edited in place
    # without first materialising a copy of the whole row.
    qa_pairs = result_df.at[row_label, 'paraphrased_qa_pairs']
    for position, fixed_answer in enumerate(fixed_answers):
        qa_pairs[position]['paraphrased_answer'] = fixed_answer


In [17]:
import pandas as pd
import json

# Persist result_df as a JSON array with one object per row; non-ASCII characters are
# written as-is rather than escaped.
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/qa_pair_with_para.json'
json_list = result_df.to_dict(orient='records')

with open(file_path, mode='w', encoding='utf-8') as f:
    json.dump(json_list, f, indent=4, ensure_ascii=False)

print(f"JSON file created with {len(json_list)} objects")

JSON file created with 2250 objects


In [28]:
import pandas as pd
import json

# Read the JSON records back from disk and rebuild a DataFrame with one row per record.
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/qa_pair_with_para.json'

with open(file_path, encoding='utf-8') as f:
    data = json.load(f)

qa_df = pd.DataFrame(data)

More paraphrased QA: 



In [44]:
# import pandas as pd
# import json

# file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/qa_pairs_full.json'

# with open(file_path, 'r', encoding='utf-8') as f:
#     data = json.load(f)
# # Convert to DataFrame
# qa_df = pd.DataFrame(data)

In [29]:
# Second batch of generated paraphrases; like the first CSV, its `user_profile` and
# `generated_text` columns are strings until parsed with ast.literal_eval.
MORE_PARAPHRASE_CSV_PATH = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/generated_data/MoreParaphraseQA.csv'
more_df = pd.read_csv(MORE_PARAPHRASE_CSV_PATH)

In [30]:
import ast
import pandas as pd

# Both columns were read from the CSV as strings; turn each one back into Python objects.
for literal_col in ('user_profile', 'generated_text'):
    more_df[literal_col] = more_df[literal_col].apply(ast.literal_eval)

# Store the whole generated object inside the profile dict (updated in place) so that it is
# expanded together with the rest of the profile below.
for profile, generated in zip(more_df['user_profile'], more_df['generated_text']):
    profile['more_paraphrased_qa'] = generated

# Expand the profile dicts into columns, drop the generation artefacts and strip the
# nesting prefixes from the column names.
user_profile_df = pd.json_normalize(more_df['user_profile'])
result_df = (
    pd.concat([more_df.drop(columns='user_profile'), user_profile_df], axis=1)
    .drop(columns=['prompt', 'generated_text'])
    .rename(columns=lambda name: name.replace('more_paraphrased_qa.','').replace('qa_config.',''))
)


In [31]:
# Column assignment aligns on the index: if the two frames do not share the same index,
# rows would silently receive NaN or another row's paraphrases. Fail fast instead.
assert qa_df.index.equals(result_df.index), (
    "qa_df and result_df must share the same index before copying 'more_paraphrased_qa_pairs'"
)
qa_df['more_paraphrased_qa_pairs'] = result_df['more_paraphrased_qa_pairs']

In [39]:
# Same verification as before, this time on the 'more_paraphrased_qa_pairs' column of qa_df.
# missing_pii holds the failed checks, or None when nothing is missing.
res = verify_pii_in_paraphrased_answers(qa_df,paraphrased_col='more_paraphrased_qa_pairs')
missing_pii = summarize_results(res)

Summary: 2250 out of 2250 rows have all PII values present in all paraphrased answers
Match types:
  - Exact matches: 15169
  - Word-level matches (not exact): 286
  - No matches: 0
All PII values are present in all paraphrased answers!


In [40]:
# List up to 100 of the failed checks (missing_pii is None when nothing failed).
if missing_pii is not None:
    print("\nExamples of missing PII values:")
    for _, failed_check in missing_pii.head(100).iterrows():
        print(f"Row {failed_check['row_index']}, Paraphrase {failed_check['paraphrase_index']}")
        print(f"PII Value: '{failed_check['pii_value']}'")
        print(f"Paraphrased Answer: '{failed_check['paraphrased_answer']}'")
        print("---")


In [37]:
# Manual corrections to the 'more_paraphrased_qa_pairs' column of qa_df.
# Outer key: qa_df row label. Inner key: index of the paraphrase within that row's list.
# Each replacement answer spells the PII value named in the comment above it exactly.
more_answer_fixes = {
    # Row 54, PII value 'Dr. Tane Hōnau' (spelling of Hōnau)
    54: {
        0: 'The medical treatment for Rangi Tuhoe Hinerangi is being handled by Dr. Tane Hōnau at Manurewa Medical Centre.',
        1: 'Rangi Tuhoe Hinerangi is being looked after at Manurewa Medical Centre, with Dr. Tane Hōnau overseeing their medical care.',
        2: 'Rangi Tuhoe Hinerangi is being treated at Manurewa Medical Centre, and their treatment is being supervised by Dr. Tane Hōnau.',
        3: 'Rangi Tuhoe Hinerangi is receiving their medical care at Manurewa Medical Centre, and Dr. Tane Hōnau is in charge of their treatment.',
        4: 'Rangi Tuhoe Hinerangi is being treated at Manurewa Medical Centre, with their care managed by Dr. Tane Hōnau.',
    },
    # Row 105, PII value 'Product Designer at Lumière Solutions' (spelling of Lumière)
    105: {
        0: 'Camille Éloïse Duverger holds the position of Product Designer at Lumière Solutions. For communication, her email is c.duverger87@orange.fr and her phone number is 020 - 856 - 2345.',
        1: 'Camille Éloïse Duverger is a Product Designer at Lumière Solutions. You can reach her through the email address c.duverger87@orange.fr or by calling 020 - 856 - 2345.',
        2: 'Camille Éloïse Duverger works as a Product Designer at Lumière Solutions. Her contact details include the email c.duverger87@orange.fr and the phone number 020 - 856 - 2345.',
        3: 'Camille Éloïse Duverger is a Product Designer at Lumière Solutions. You may contact her via email at c.duverger87@orange.fr or through the phone number 020 - 856 - 2345.',
        4: 'Camille Éloïse Duverger is a Product Designer at Lumière Solutions. Her contact information includes the email address c.duverger87@orange.fr and the phone number 020 - 856 - 2345.',
    },
    # Row 581, PII value '05/03/1992': the generated answer spelled the date out
    # ('3rd of May, 1992'); use the PII's own numeric form so the literal value is present.
    581: {
        2: 'Lorna Elspeth Grant was born on 05/03/1992, according to the available information.',
    },
    # Row 692, PII value 'Rue de la Confédération 35' (spelling of Confédération)
    692: {
        0: 'Lorenz Edouard Frick holds the position of Product Designer at SwissForma, and the company is based at Rue de la Confédération 35. For professional communication, he can be contacted at l.frick82@bluewin.ch.',
        1: 'Lorenz Edouard Frick is a Product Designer at SwissForma, which is located at Rue de la Confédération 35. His professional email address is l.frick82@bluewin.ch.',
        2: 'Lorenz Edouard Frick is employed as a Product Designer at SwissForma, and the company is located at Rue de la Confédération 35. For professional communication, he can be contacted at l.frick82@bluewin.ch.',
        3: 'Lorenz Edouard Frick works as a Product Designer at SwissForma, which is based at Rue de la Confédération 35. He can be reached for work-related communication at l.frick82@bluewin.ch.',
        4: 'Lorenz Edouard Frick is a Product Designer at SwissForma, located at Rue de la Confédération 35. He can be contacted at l.frick82@bluewin.ch for professional communication.',
    },
    # Row 1241, PII value 'Skjæråsveien 12' (spelling of Skjæråsveien)
    1241: {
        0: 'Elin Sigrid Høst\'s registered residence is listed as Skjæråsveien 12.',
        1: 'The residence address registered for Elin Sigrid Høst is Skjæråsveien 12.',
        2: 'Elin Sigrid Høst is officially recorded as residing at Skjæråsveien 12.',
        3: 'According to her records, Elin Sigrid Høst\'s home address is Skjæråsveien 12.',
        4: 'Elin Sigrid Høst\'s residence address, as documented, is Skjæråsveien 12.',
    },
    # Row 2249, PII value 'Simon Müller' (spelling of Müller)
    2249: {
        0: 'Lia Mara Zemp’s credit card number is 4024-0071-4321-9876, and Simon Müller provides her with financial planning and advisory services.',
        1: 'The credit card number of Lia Mara Zemp is 4024-0071-4321-9876, and she seeks financial guidance from Simon Müller.',
        2: 'Lia Mara Zemp uses the credit card number 4024-0071-4321-9876, and she receives financial support and planning from Simon Müller.',
        3: 'Lia Mara Zemp’s credit card number is 4024-0071-4321-9876, and Simon Müller is the financial consultant who helps her with her financial strategy.',
        4: 'The credit card number for Lia Mara Zemp is 4024-0071-4321-9876, and Simon Müller serves as her financial consultant for all financial planning.',
    },
}

for row_label, fixes_by_position in more_answer_fixes.items():
    # .at hands back the list stored in the cell, so the dicts inside are edited in place
    # without first materialising a copy of the whole row.
    qa_pairs = qa_df.at[row_label, 'more_paraphrased_qa_pairs']
    for position, fixed_answer in fixes_by_position.items():
        qa_pairs[position]['paraphrased_answer'] = fixed_answer


In [41]:
import pandas as pd
import json

# Persist qa_df as a JSON array with one object per row; non-ASCII characters are
# written as-is rather than escaped.
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/my_files/pii_dataset/data/qa_pairs_full.json'
json_list = qa_df.to_dict(orient='records')

with open(file_path, mode='w', encoding='utf-8') as f:
    json.dump(json_list, f, indent=4, ensure_ascii=False)

print(f"JSON file created with {len(json_list)} objects")

JSON file created with 2250 objects


In [74]:
# Flatten qa_df into one record per question: for every row the original QA pair comes
# first, followed by one record per entry of its 'training_paraphrased_qa_pairs' list.
# NOTE(review): no cell shown in this notebook creates 'training_paraphrased_qa_pairs' -
# confirm qa_df carries that column before running this on a fresh kernel.
full_list = []

for _, profile_row in qa_df.iterrows():
    subject_name = profile_row['full_name']
    pii_values = [picked['value'] for picked in profile_row['pii_picked_dict']]

    # The key order of each record shape is kept exactly as before so that the JSON
    # dumped from full_list does not change.
    records = [{
        'subject': subject_name,
        'subject2': pii_values,
        'question': profile_row['question'],
        'answer': profile_row['answer'],
    }]
    records += [
        {
            'question': paraphrase['paraphrased_question'],
            'answer': paraphrase['paraphrased_answer'],
            'subject': subject_name,
            'subject2': pii_values,
        }
        for paraphrase in profile_row['training_paraphrased_qa_pairs']
    ]

    full_list.extend(records)

In [None]:
import json

# Write the flattened QA records as a JSON array; non-ASCII characters are kept unescaped.
file_path = '/projects/0/hpmlprjs/LLM/danp/UGBench/data/PII/full.json'

with open(file_path, mode="w", encoding="utf-8") as f:
    json.dump(full_list, f, indent=2, ensure_ascii=False)