In [4]:
import os
import json
import pandas as pd
import numpy as np
import shutil
from tqdm import tqdm
from pathlib import Path

# Functions

In [36]:
def get_val_to_col_name_dict(input_dir):
    '''
    Given an input directory containing the raw csv files return the `val_to_col_name_dict`
    '''
    val_to_col_name_dict = {}
    # Find the all Homographs and populate the `val_to_col_name_dict`
    for filename in tqdm(os.listdir(input_dir)):
        df = pd.read_csv(input_dir+filename)
        column_names=list(df.columns)
        for idx, row in df.iterrows():
            for i in range(len(row.tolist())):
                if row[i] in val_to_col_name_dict:
                    val_to_col_name_dict[row[i]].add(column_names[i])
                else:
                    val_to_col_name_dict[row[i]] = set([column_names[i]])

    return val_to_col_name_dict

def get_homographs_and_clean_dataset(input_dir, output_dir, column_name_to_homograph_type, val_to_col_name_dict):
    '''
    Identifies all homographs and outputs a dataframe with them at `output_dir`.
    Also creates a new version on the input tables without he inclusion of the homographs and outputs them at `output_dir/no_homographs/`

    Returns the `homograph_info_df`
    '''
    Path(output_dir+'no_homographs/').mkdir(parents=True, exist_ok=True)
    homograph_info_dict={'value': [], 'filename': [], 'column_name' : [], 'type': [], 'subtype': [], "contents_row": []}

    for filename in tqdm(os.listdir(input_dir)):
        df = pd.read_csv(input_dir+filename)
        column_names=list(df.columns)

        row_ids_with_homographs=set()

        for idx, row in df.iterrows():
            for i in range(len(row.tolist())):
                # Check if value is a homograph
                if len(val_to_col_name_dict[row[i]])>1:
                    row_ids_with_homographs.add(idx)
                    homograph_info_dict['value'].append(row[i])
                    homograph_info_dict['filename'].append(filename)
                    homograph_info_dict['column_name'].append(column_names[i])
                    homograph_info_dict['type'].append(column_name_to_homograph_type[column_names[i]]['type'])
                    homograph_info_dict['subtype'].append(column_name_to_homograph_type[column_names[i]]['subtype'])
                    homograph_info_dict['contents_row'].append(row.tolist())
        
        df_no_homographs = df.drop(labels=row_ids_with_homographs)
        df_no_homographs.to_csv(output_dir+'no_homographs/'+filename,index=False)

    homograph_info_df = pd.DataFrame.from_dict(homograph_info_dict)
    homograph_info_df.to_pickle(output_dir+'homograph_info_df.pickle')
    
    return homograph_info_df

def select_homographs_of_specified_type(homograph_df, num_homographs=10, type='traditional', subtype=None, seed=0):
    '''
    Returns a list of the selected homographs from the specified types. 

    Raises an error if it is not possible to extract the specified types
    '''

    # Check if the requested specification is possible (i.e., there are enough unique homographs)
    valid_tuples_df=homograph_df[homograph_df['type']==type]
    if subtype: valid_tuples_df=homograph_df[homograph_df['subtype']==subtype]
    if valid_tuples_df['value'].nunique() < num_homographs:
        raise ValueError('Not possible to extract ' + str(num_homographs) + ' homographs. There are only ' + str(valid_tuples_df['value'].nunique()) + ' available homographs with the specified parameters.')

    # Select the homographs
    np.random.seed(seed)
    selected_homographs=np.random.choice(valid_tuples_df['value'].unique(), size=num_homographs, replace=False)
    return selected_homographs

def construct_homograph_injected_dataset(input_dir, output_dir, homograph_info_df, num_homographs=10, type='traditional', subtype=None, seed=0, mode=None):
    '''
    Construct injected homograph datasets of the specified type and amount

    `input_dir`: Directory that contains the tables without any homographs
    `output_dir`: Directory where the injected homographs dataset is stored
    `homograph_info_df`: dataframe that contains the homograph information, types, subtypes, files location etc.
    `num_homographs`: number of unique homographs to be inserted overall
    `type`: the type of homographs to be inserted
    `subtype`: the subtype of homographs to be inserted
    `seed`: seed used for the random selection of homographs from the set of valid homographs
    `mode`: if specified, then all types are considered, for each type `num_homographs` are selected

    Returns a list of the selected homographs
    '''
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    if mode=='all_types':
        # Select homographs from each type
        selected_homographs=[]
        selected_homographs.extend(select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type='traditional', subtype=None, seed=seed))
        selected_homographs.extend(select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type='symbolic', subtype='code', seed=seed))
        # selected_homographs.extend(select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type='symbolic', subtype='numeric', seed=seed))
    else:
        selected_homographs=select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type=type, subtype=subtype, seed=seed)

    valid_tuples_df=homograph_info_df[homograph_info_df['value'].isin(selected_homographs)]
    files_to_modify=set(valid_tuples_df['filename'].unique())
    for filename in tqdm(os.listdir(input_dir)):
        if filename in files_to_modify:
            # Perform insertion
            orig_file_df=pd.read_csv(input_dir+filename)
            tuples_arr=[]
            for idx, row in valid_tuples_df[valid_tuples_df['filename']==filename].iterrows():
                tuples_arr.append(row['contents_row'])
            new_file_df = pd.concat([orig_file_df, pd.DataFrame(tuples_arr, columns=orig_file_df.columns)])
            new_file_df.to_csv(output_dir+filename, index=False)
        else:
            # Copy the file
            shutil.copyfile(input_dir+filename, output_dir+filename)

    return selected_homographs


# Homograph Injection and Dataset Construction

In [3]:
raw_input_dir='../DATA/synthetic_benchmark/'
output_dir='datasets/synthetic_benchmark/'

with open('column_name_to_homograph_type.json') as f:
    column_name_to_homograph_type = json.load(f)
val_to_col_name_dict = get_val_to_col_name_dict(input_dir=raw_input_dir)
homograph_info_df=get_homographs_and_clean_dataset(output_dir=output_dir, input_dir=raw_input_dir, column_name_to_homograph_type=column_name_to_homograph_type, val_to_col_name_dict=val_to_col_name_dict)
homograph_info_df

100%|██████████| 13/13 [00:00<00:00, 13.51it/s]
100%|██████████| 13/13 [00:00<00:00, 16.68it/s]


Unnamed: 0,value,filename,column_name,type,subtype,contents_row
0,Sydney,location_city_country.csv,city,traditional,,"[Sydney, Australia]"
1,Sydney,location_city_country.csv,city,traditional,,"[Sydney, Australia]"
2,Cuba,location_city_country.csv,country,traditional,,"[Bartolomé Masó, Cuba]"
3,Elmira,location_city_country.csv,city,traditional,,"[Elmira, United States]"
4,Lincoln,location_city_country.csv,city,traditional,,"[Lincoln, United States]"
...,...,...,...,...,...,...
911,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[Jekyll & Hyde, Horror, ID]"
912,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,[Don't Torture a Duckling (Non si sevizia un p...
913,CA,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[My Gun is Quick, Mystery, CA]"
914,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[Savages, The, Comedy|Drama, ID]"


In [37]:
homograph_info_df=pd.read_pickle(output_dir+'homograph_info_df.pickle')

selected_homographs = construct_homograph_injected_dataset(input_dir=output_dir+'no_homographs/', output_dir=output_dir+'homographs_traditional_10/', homograph_info_df=homograph_info_df, num_homographs=10, type='traditional', seed=0)
selected_homographs = construct_homograph_injected_dataset(input_dir=output_dir+'no_homographs/', output_dir=output_dir+'homographs_all_10/', homograph_info_df=homograph_info_df, num_homographs=10, type=None, seed=0, mode='all_types')

100%|██████████| 13/13 [00:00<00:00, 171.40it/s]
100%|██████████| 13/13 [00:00<00:00, 165.97it/s]


# Testing

In [56]:
np.random.seed(0)
np.random.choice(homograph_info_df[(homograph_info_df['type']=='traditional')]['value'].unique(), size=10, replace=False)

array(['Colorado', 'Duff', 'Elan', 'Garvey', 'Elmira', 'Berkeley', 'Ram',
       'Charity', 'California', 'Crossfire'], dtype=object)

In [30]:
homograph_info_df[homograph_info_df['type']=='symbolic']['value'].unique()

array(['ID', 'NE', 'GT', 'AR', 'CO', 'MA', 'CA', 'DE', 'TL', 'MN', 'AL',
       'SD', 'PA', 'AZ', 'TN', 'CT', 'SC', 'IL', 'GA', 'MD', 'ME',
       'Colorado', 'Elan', 'ES', 'Crossfire', 'Ram', 'California',
       'Jimmy', nan], dtype=object)