In [63]:
import os
import json
import pandas as pd
import numpy as np
import shutil
import random
import pickle
from tqdm import tqdm
from pathlib import Path

import sys
sys.path.insert(1, '../network_analysis/')
import utils

# Functions

In [64]:
def get_val_to_col_name_dict(input_dir):
    '''
    Given an input directory containing the raw csv files return the `val_to_col_name_dict`

    `input_dir`: directory whose entries are all read as csv files. Every cell is read as a
    string and empty cells are kept as the empty string (so '' is a key of the result too).

    Returns a dictionary mapping each cell value to the set of column names under which
    that value appears in any of the files
    '''
    val_to_col_name_dict = {}
    for filename in tqdm(os.listdir(input_dir)):
        # os.path.join keeps this working whether or not `input_dir` ends with a separator
        df = pd.read_csv(os.path.join(input_dir, filename), keep_default_na=False, dtype=str)
        # Only the distinct values of each column matter, so there is no need to walk every cell
        for column_name in df.columns:
            for val in df[column_name].unique():
                val_to_col_name_dict.setdefault(val, set()).add(column_name)

    return val_to_col_name_dict

def get_homographs_and_clean_dataset(input_dir, output_dir, column_name_to_homograph_type, val_to_col_name_dict):
    '''
    Identifies all homographs and returns a dataframe describing every occurrence of them.
    Also creates a new version of the input tables without the rows that contain a homograph
    or an empty cell and outputs them at `output_dir/no_homographs/`

    `input_dir`: directory whose entries are all read as csv files
    `output_dir`: directory under which the `no_homographs/` folder is created
    `column_name_to_homograph_type`: dictionary mapping each column name to a dictionary with the
    keys 'type' and 'subtype'
    `val_to_col_name_dict`: dictionary mapping each value to the set of column names it appears in;
    a value is treated as a homograph when that set has more than one column name

    Raises a KeyError if a non-empty value is missing from `val_to_col_name_dict` or if the column
    of a homograph is missing from `column_name_to_homograph_type`

    Returns the `homograph_info_df` for all types homographs
    '''
    clean_dir = os.path.join(output_dir, 'no_homographs')
    Path(clean_dir).mkdir(parents=True, exist_ok=True)
    homograph_info_dict={'value': [], 'filename': [], 'column_name' : [], 'type': [], 'subtype': [], "contents_row": []}

    for filename in tqdm(os.listdir(input_dir)):
        df = pd.read_csv(os.path.join(input_dir, filename), keep_default_na=False, dtype=str)
        column_names = list(df.columns)

        row_ids_with_homographs = set()
        # itertuples yields (index, cell_1, ..., cell_n); pairing the cells with the column names
        # avoids integer look-ups on a Series whose labels are column names
        for record in df.itertuples(index=True, name=None):
            idx, row_values = record[0], list(record[1:])
            for column_name, val in zip(column_names, row_values):
                if val == '':
                    # Rows with an empty cell are removed from the clean tables but not recorded
                    row_ids_with_homographs.add(idx)
                elif len(val_to_col_name_dict[val]) > 1:
                    row_ids_with_homographs.add(idx)
                    homograph_type = column_name_to_homograph_type[column_name]
                    homograph_info_dict['value'].append(val)
                    homograph_info_dict['filename'].append(filename)
                    homograph_info_dict['column_name'].append(column_name)
                    homograph_info_dict['type'].append(homograph_type['type'])
                    homograph_info_dict['subtype'].append(homograph_type['subtype'])
                    # One independent copy of the full row per recorded homograph
                    homograph_info_dict['contents_row'].append(list(row_values))

        df_no_homographs = df.drop(labels=list(row_ids_with_homographs))
        df_no_homographs.to_csv(os.path.join(clean_dir, filename), index=False)

    homograph_info_df = pd.DataFrame.from_dict(homograph_info_dict)

    return homograph_info_df

def select_homographs_of_specified_type(homograph_df, num_homographs=10, type='traditional', subtype=None, seed=0):
    '''
    Returns an array of the selected homographs from the specified types.

    `homograph_df`: dataframe with (at least) the columns 'value', 'type' and 'subtype'
    `num_homographs`: number of unique homograph values to select
    `type`: only rows with this value in the 'type' column are considered
    `subtype`: if specified, the rows are additionally restricted to this 'subtype'
    `seed`: seed for the random selection (note that it re-seeds numpy's global random state)

    Raises a ValueError if it is not possible to extract the specified types
    '''

    # Restrict to the requested type, then narrow down by subtype. The subtype filter is applied
    # on top of the type filter so that a row must match both.
    valid_tuples_df = homograph_df[homograph_df['type'] == type]
    if subtype:
        valid_tuples_df = valid_tuples_df[valid_tuples_df['subtype'] == subtype]

    # Missing values are never valid homographs, so they are excluded from both the
    # availability check and the selection
    candidate_values = valid_tuples_df['value'].dropna().unique()

    # Check if the requested specification is possible (i.e., there are enough unique homographs)
    if len(candidate_values) < num_homographs:
        raise ValueError('Not possible to extract ' + str(num_homographs) + ' homographs. There are only ' + str(len(candidate_values)) + ' available homographs with the specified parameters.')

    # Select the homographs
    np.random.seed(seed)
    selected_homographs = np.random.choice(candidate_values, size=num_homographs, replace=False)
    return selected_homographs

def construct_homograph_injected_dataset(input_dir, output_dir, homograph_info_df, num_homographs=10, type='traditional', subtype=None, seed=0, mode=None):
    '''
    Construct injected homograph datasets of the specified type and amount

    `input_dir`: Directory that contains the tables without any homographs
    `output_dir`: Directory where the injected homographs dataset is stored (created if missing)
    `homograph_info_df`: dataframe that contains the homograph information, types, subtypes, files location etc.
    `num_homographs`: number of unique homographs to be inserted overall
    `type`: the type of homographs to be inserted
    `subtype`: the subtype of homographs to be inserted
    `seed`: seed used for the random selection of homographs from the set of valid homographs
    `mode`: if set to 'all_types', then all types are considered and `num_homographs` are selected
    for each type (`type` and `subtype` are ignored in that case)

    Every file of `input_dir` is written to `output_dir`: files that receive homograph tuples get
    them appended at the end, all other files are copied unchanged.

    Returns a list of the selected homographs
    '''
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    if mode=='all_types':
        # Select homographs from each type
        type_subtype_pairs = [('traditional', None), ('symbolic', 'code'), ('symbolic', 'numeric'), ('null_equivalent', None)]
        selected_homographs = []
        for cur_type, cur_subtype in type_subtype_pairs:
            selected_homographs.extend(select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type=cur_type, subtype=cur_subtype, seed=seed))
    else:
        selected_homographs = select_homographs_of_specified_type(homograph_df=homograph_info_df, num_homographs=num_homographs, type=type, subtype=subtype, seed=seed)

    valid_tuples_df = select_valid_tuples(homograph_info_df=homograph_info_df, selected_homographs=selected_homographs, type=type, subtype=subtype, mode=mode)

    # Collect the rows to inject once per file instead of re-filtering the dataframe for every file
    rows_to_inject = {
        filename: file_tuples_df['contents_row'].tolist()
        for filename, file_tuples_df in valid_tuples_df.groupby('filename', sort=False)
    }

    for filename in tqdm(os.listdir(input_dir)):
        # os.path.join keeps this working whether or not the directories end with a separator
        input_path = os.path.join(input_dir, filename)
        output_path = os.path.join(output_dir, filename)
        if filename in rows_to_inject:
            # Perform insertion
            orig_file_df = pd.read_csv(input_path, keep_default_na=False, dtype=str)
            injected_df = pd.DataFrame(rows_to_inject[filename], columns=orig_file_df.columns)
            new_file_df = pd.concat([orig_file_df, injected_df])
            new_file_df.to_csv(output_path, index=False)
        else:
            # Copy the file
            shutil.copyfile(input_path, output_path)

    return list(selected_homographs)

def select_valid_tuples(homograph_info_df, selected_homographs, type, subtype, mode):
    '''
    Returns the rows of `homograph_info_df` whose value is one of the `selected_homographs`.

    With `mode` equal to 'all_types' the match is on the value alone. Otherwise the rows must also
    match `type` (for 'traditional' and 'null_equivalent') or `subtype` (for 'symbolic').
    Any other `type` yields None.
    '''
    handled_types = ('traditional', 'null_equivalent', 'symbolic')
    if mode != 'all_types' and type not in handled_types:
        return None

    value_is_selected = homograph_info_df['value'].isin(selected_homographs)
    if mode == 'all_types':
        return homograph_info_df[value_is_selected]

    # Symbolic homographs are told apart by their subtype, the other types by the type itself
    if type == 'symbolic':
        category_matches = homograph_info_df['subtype'] == subtype
    else:
        category_matches = homograph_info_df['type'] == type
    return homograph_info_df[(value_is_selected) & (category_matches)]


def generate_null_equivalent_tuples(input_dir, homograph_info_df, num_meanings=10, num_values=10, seed=0):
    '''
    Returns a dataframe with content tuples to be used for null-equivalent values

    `input_dir`: input directory containing the raw csv files to be used for the generation
    `homograph_info_df`: dataframe that contains the homograph information, types, subtypes, files location etc.
    NOTE(review): this argument is currently not used, so nothing checks that the generated values
    are absent from it
    `num_meanings`: number of (filename, column_name) pairs drawn for each generated null_equivalent value.
    NOTE(review): the column names are drawn with replacement, so the same column can be drawn
    more than once for a value
    `num_values`: number of unique null-equivalent values to be formed
    `seed`: seed for the selection of columns/files and for the row sampled from each file

    Returns a dataframe with the tuples for the null equivalent values
    '''
    random.seed(seed)

    # The string values of the null equivalent values
    null_equivalent_vals = ['null_equivalent_val_'+str(i) for i in range(num_values)]

    # Dictionary mapping each column_name to list of filenames containing them
    column_name_to_filenames_dict = {}
    for filename in os.listdir(input_dir):
        # nrows=0 reads the header only
        columns = pd.read_csv(os.path.join(input_dir, filename), keep_default_na=False, nrows=0, dtype=str).columns.to_list()
        for col in columns:
            column_name_to_filenames_dict.setdefault(col, []).append(filename)

    # Select `num_meanings` (filename, column_name) pairs for each injected_null_equivalent_val
    all_column_names = list(column_name_to_filenames_dict)
    null_val_to_filename_column_name_pairs = {}
    for val in null_equivalent_vals:
        selected_column_names = random.choices(all_column_names, k=num_meanings)
        null_val_to_filename_column_name_pairs[val] = [
            (random.choice(column_name_to_filenames_dict[column_name]), column_name)
            for column_name in selected_column_names
        ]

    # The sampled row of a file depends only on `seed`, so each file is read and sampled once
    sampled_row_per_file = {}

    # Construct the tuples and populate the tuples_dict
    tuples_dict={'value': [], 'filename': [], 'column_name' : [], 'type': [], 'subtype': [], "contents_row": []}
    for val in tqdm(null_equivalent_vals):
        for filename, column_name in null_val_to_filename_column_name_pairs[val]:
            if filename not in sampled_row_per_file:
                df = pd.read_csv(os.path.join(input_dir, filename), keep_default_na=False, dtype=str)
                sampled_row_per_file[filename] = df.sample(n=1, random_state=seed)

            # Take the sampled row and modify the value from the selected column_name
            row = sampled_row_per_file[filename].copy()
            row[column_name] = val

            # Update the tuples_dict
            tuples_dict['value'].append(val)
            tuples_dict['filename'].append(filename)
            tuples_dict['column_name'].append(column_name)
            tuples_dict['type'].append('null_equivalent')
            tuples_dict['subtype'].append(None)
            # Stored as a plain list, like the rows recorded by get_homographs_and_clean_dataset
            tuples_dict['contents_row'].append(row.values[0].tolist())

    return pd.DataFrame.from_dict(tuples_dict)

def get_df_subset_from_mode(df, mode='homographs_traditional'):
    '''
    Returns the rows of `df` that belong to the homograph category named by `mode`.

    'homographs_traditional' filters on the 'type' column, the two symbolic modes filter on the
    'subtype' column. Any other `mode` yields None.
    '''
    column_and_label_per_mode = {
        'homographs_traditional': ('type', 'traditional'),
        'homographs_symbolic_code': ('subtype', 'code'),
        'homographs_symbolic_numeric': ('subtype', 'numeric'),
    }
    column_and_label = column_and_label_per_mode.get(mode)
    if column_and_label is None:
        return None

    column, label = column_and_label
    matches_label = df[column] == label
    return df[matches_label]

def populate_homograph_info_dict(homograph_info_dict):
    '''
    Adds one dataframe per homograph category to `homograph_info_dict` and returns it.

    `homograph_info_dict`: dictionary whose 'homographs_all' entry is the dataframe with all the
    homograph tuples (columns 'value', 'column_name', 'type', 'subtype', ...)

    For each of the modes 'homographs_traditional', 'homographs_symbolic_code' and
    'homographs_symbolic_numeric' the stored dataframe keeps only the values that appear under
    more than one column name within that category.

    Returns the same (modified in place) `homograph_info_dict`
    '''
    all_types_df = homograph_info_dict['homographs_all']
    modes = ['homographs_traditional', 'homographs_symbolic_code', 'homographs_symbolic_numeric']

    for mode in tqdm(modes):
        cur_df = get_df_subset_from_mode(df=all_types_df, mode=mode)
        # Count the distinct column names of every value in one pass instead of filtering the
        # whole dataframe once per unique value
        num_columns_per_value = cur_df.groupby('value')['column_name'].nunique()
        selected_vals = num_columns_per_value[num_columns_per_value > 1].index
        homograph_info_dict[mode] = cur_df[cur_df['value'].isin(selected_vals)]

    return homograph_info_dict

# Homograph Injection and Dataset Construction

In [65]:
# Both directories end with '/' because paths are built from them by string concatenation
raw_input_dir='../DATA/synthetic_benchmark_large3/'
output_dir='datasets/synthetic_benchmark_large3/'

# Mapping from column name to a dictionary with its homograph 'type' and 'subtype'
with open('column_name_to_homograph_type.json') as f:
    column_name_to_homograph_type = json.load(f)

# Dictionary mapping each value to a list of column names
val_to_col_name_dict = get_val_to_col_name_dict(input_dir=raw_input_dir)

# Construct the homograph_info_dict and populate with all types of homographs
# (this also writes the homograph-free tables to output_dir + 'no_homographs/')
homograph_info_dict={}
homograph_info_dict['homographs_all']=get_homographs_and_clean_dataset(output_dir=output_dir, input_dir=raw_input_dir, column_name_to_homograph_type=column_name_to_homograph_type, val_to_col_name_dict=val_to_col_name_dict)
# Add the per-category entries ('homographs_traditional', 'homographs_symbolic_code', 'homographs_symbolic_numeric')
homograph_info_dict = populate_homograph_info_dict(homograph_info_dict)

100%|██████████| 192/192 [00:11<00:00, 16.80it/s]
100%|██████████| 192/192 [00:11<00:00, 16.15it/s]
100%|██████████| 3/3 [00:04<00:00,  1.51s/it]


In [66]:
# Report the number of unique homograph values overall and per category
print("The input dataset contains:", homograph_info_dict['homographs_all']['value'].nunique(), "unique all homograph values.")
print("There are:", homograph_info_dict['homographs_traditional']['value'].nunique(), 'unique traditional only homographs', 
homograph_info_dict['homographs_symbolic_code']['value'].nunique(), 'unique symbolic(code) only homographs',
homograph_info_dict['homographs_symbolic_numeric']['value'].nunique(), 'unique symbolic(numeric) only homographs')

The input dataset contains: 2740 unique all homograph values.
There are: 526 unique traditional only homographs 1083 unique symbolic(code) only homographs 906 unique symbolic(numeric) only homographs


In [67]:
# Append the null_tuples_df into the homograph_info_df
# 100 generated null-equivalent values, each placed in 10 drawn (filename, column_name) pairs
null_tuples_df = generate_null_equivalent_tuples(input_dir=raw_input_dir, homograph_info_df=homograph_info_dict['homographs_all'], num_meanings=10, num_values=100)
# Only the 'homographs_all' entry receives the null-equivalent tuples; the per-category entries are left as they are
homograph_info_dict['homographs_all']=pd.concat([homograph_info_dict['homographs_all'], null_tuples_df])
# Persist the dictionary; this literal path is the same location as output_dir
with open('datasets/synthetic_benchmark_large3/homograph_info_dict.pickle', 'wb') as handle:
    pickle.dump(homograph_info_dict, handle)

100%|██████████| 100/100 [00:01<00:00, 51.40it/s]


## Injected Datasets Construction

In [68]:
# Reload the persisted homograph information so that this cell can be re-run on its own
with open('datasets/synthetic_benchmark_large3/homograph_info_dict.pickle', 'rb') as handle:
    homograph_info_dict = pickle.load(handle)
num_homographs=100

# (type, subtype) handed to construct_homograph_injected_dataset for each per-category mode.
# Kept in a mapping rather than in global variables named `type`/`subtype`, so the builtin
# `type` is not overwritten for the remaining cells of the notebook.
mode_to_type_and_subtype = {
    'homographs_traditional': ('traditional', None),
    'homographs_symbolic_code': ('symbolic', 'code'),
    'homographs_symbolic_numeric': ('symbolic', 'numeric'),
}
modes=list(mode_to_type_and_subtype)

# One injected dataset per category, written to output_dir/<mode>_<num_homographs>/
homs_dict = {}
for mode, (homograph_type, homograph_subtype) in mode_to_type_and_subtype.items():
    homs_dict[mode] = construct_homograph_injected_dataset(input_dir=output_dir+'no_homographs/', output_dir=output_dir+mode+'_'+str(num_homographs)+'/', homograph_info_df=homograph_info_dict[mode], num_homographs=num_homographs, type=homograph_type, subtype=homograph_subtype, seed=0)

# The null-equivalent tuples only exist in the 'homographs_all' dataframe
homs_dict['homographs_null_equivalent'] = construct_homograph_injected_dataset(input_dir=output_dir+'no_homographs/', output_dir=output_dir+'homographs_null_equivalent_'+str(num_homographs)+'/', homograph_info_df=homograph_info_dict['homographs_all'], num_homographs=num_homographs, type='null_equivalent', seed=0)

# TODO: Ensure correct insertion for all types homographs (ensuring exclusive per type insertion)
homs_dict['homographs_all'] = construct_homograph_injected_dataset(input_dir=output_dir+'no_homographs/', output_dir=output_dir+'homographs_all_'+str(num_homographs)+'/', homograph_info_df=homograph_info_dict['homographs_all'], num_homographs=num_homographs, type=None, seed=0, mode='all_types')

# Record which homographs were injected into each dataset
with open(output_dir+'selected_homographs.json', 'w') as f:
    json.dump(homs_dict, f, indent=4)

100%|██████████| 192/192 [00:00<00:00, 217.22it/s]
100%|██████████| 192/192 [00:00<00:00, 748.93it/s]
100%|██████████| 192/192 [00:00<00:00, 509.46it/s]
100%|██████████| 192/192 [00:00<00:00, 247.61it/s]
100%|██████████| 192/192 [00:01<00:00, 131.73it/s]


# Dataset Statistics

In [29]:
# Load the pickled bipartite graph built for the symbolic (numeric) injected dataset and report its size
# (pickle is already imported at the top of the notebook; the import here lets the cell run on its own)
# NOTE(review): unpickling executes code from the file, so only load graph files produced by this project
import pickle
with open('../graph_construction/combined_graphs_output/synthetic_benchmark_large3/homographs_symbolic_numeric_100/bipartite/bipartite.graph', 'rb') as f:
    G=pickle.load(f)
print(G.number_of_nodes(), G.number_of_edges())

119518 187745


In [42]:
# Inspect the value '2000' in the graph; the recorded output lists csv file names
# NOTE(review): presumably these are the attribute nodes connected to the cell node '2000' - confirm in utils
utils.get_attribute_of_instance(G, '2000')

['airport_elevation_airport_elevation_1.csv',
 'airport_elevation_airport_elevation_2.csv',
 'airport_elevation_airport_elevation_3.csv',
 'airport_elevation_airport_elevation_4.csv',
 'airport_elevation_airport_elevation_8.csv',
 'airport_elevation_airport_elevation_9.csv',
 'car_model_year_car_make_car_model_car_model_year_1.csv',
 'car_model_year_car_make_car_model_car_model_year_10.csv',
 'car_model_year_car_make_car_model_car_model_year_11.csv',
 'car_model_year_car_make_car_model_car_model_year_12.csv',
 'car_model_year_car_make_car_model_car_model_year_2.csv',
 'car_model_year_car_make_car_model_car_model_year_3.csv',
 'car_model_year_car_make_car_model_car_model_year_4.csv',
 'car_model_year_car_make_car_model_car_model_year_5.csv',
 'car_model_year_car_make_car_model_car_model_year_6.csv',
 'car_model_year_car_make_car_model_car_model_year_7.csv',
 'car_model_year_car_make_car_model_car_model_year_8.csv',
 'car_model_year_car_make_car_model_car_model_year_9.csv']

# Testing

In [26]:
# Display the tuples stored for the symbolic (numeric) homographs
homograph_info_dict['homographs_symbolic_numeric']

Unnamed: 0,value,filename,column_name,type,subtype,contents_row
0,32,airport_elevation_9.csv,airport_elevation,symbolic,numeric,[32]
1,231,airport_elevation_9.csv,airport_elevation,symbolic,numeric,[231]
2,212,airport_elevation_9.csv,airport_elevation,symbolic,numeric,[212]
3,218,airport_elevation_9.csv,airport_elevation,symbolic,numeric,[218]
4,108,airport_elevation_9.csv,airport_elevation,symbolic,numeric,[108]
...,...,...,...,...,...,...
72748,995,row_id_12.csv,row_id,symbolic,numeric,[995]
72749,996,row_id_12.csv,row_id,symbolic,numeric,[996]
72750,997,row_id_12.csv,row_id,symbolic,numeric,[997]
72751,999,row_id_12.csv,row_id,symbolic,numeric,[999]


In [58]:
# Load the pickled evaluation dataframes and inspect the one for the symbolic (numeric) dataset
df = pd.read_pickle('evaluation/synthetic_benchmark_large3/eval_dfs.pickle')['homographs_symbolic_numeric']
# Renumber the rows from 0 so that the positional slice below matches the displayed index
df.reset_index(inplace=True, drop=True)
df.iloc[100:160]
# df[df['is_homograph']==True]

Unnamed: 0,node,node_type,approximate_betweenness_centrality,is_homograph,homograph_mode,precision,recall,f1_score
100,8508,cell,0.000868,False,,0.0,0.0,0.0
101,5504,cell,0.000858,False,,0.0,0.0,0.0
102,26009 CEDEX,cell,0.000846,False,,0.0,0.0,0.0
103,8118,cell,0.000838,False,,0.0,0.0,0.0
104,Philipet,cell,0.000828,False,,0.0,0.0,0.0
105,Clemenson,cell,0.000828,False,,0.0,0.0,0.0
106,Lehr,cell,0.000828,False,,0.0,0.0,0.0
107,Lafaye,cell,0.000828,False,,0.0,0.0,0.0
108,Baber,cell,0.000828,False,,0.0,0.0,0.0
109,Fonzo,cell,0.000828,False,,0.0,0.0,0.0


In [56]:
# Reproduce by hand the seeded selection of 10 traditional homographs
# NOTE(review): `homograph_info_df` is not defined in any of the visible cells - presumably it is
# left over from an earlier session; confirm which dataframe is meant before re-running
np.random.seed(0)
np.random.choice(homograph_info_df[(homograph_info_df['type']=='traditional')]['value'].unique(), size=10, replace=False)

array(['Colorado', 'Duff', 'Elan', 'Garvey', 'Elmira', 'Berkeley', 'Ram',
       'Charity', 'California', 'Crossfire'], dtype=object)

In [30]:
# List the unique values recorded with type 'symbolic'
# NOTE(review): `homograph_info_df` is not defined in any of the visible cells - presumably it is
# left over from an earlier session; confirm which dataframe is meant before re-running
homograph_info_df[homograph_info_df['type']=='symbolic']['value'].unique()

array(['ID', 'NE', 'GT', 'AR', 'CO', 'MA', 'CA', 'DE', 'TL', 'MN', 'AL',
       'SD', 'PA', 'AZ', 'TN', 'CT', 'SC', 'IL', 'GA', 'MD', 'ME',
       'Colorado', 'Elan', 'ES', 'Crossfire', 'Ram', 'California',
       'Jimmy', nan], dtype=object)

In [37]:
# Column names under which the value 7 appears. The csv files are read with dtype=str, so the
# keys of val_to_col_name_dict are strings and the look-up has to use '7' rather than the integer 7.
val_to_col_name_dict['7']

{'airport_elevation', 'cost', 'row_id'}

In [50]:
# Run the homograph extraction on the tables in '../DATA/testing/', writing the cleaned tables under '../DATA/tmp/'
# NOTE(review): val_to_col_name_dict was built from raw_input_dir, so a value of the testing tables
# that is missing from it raises a KeyError - presumably the testing tables are a subset; verify
homs_tmp=get_homographs_and_clean_dataset(output_dir='../DATA/tmp/', input_dir='../DATA/testing/', column_name_to_homograph_type=column_name_to_homograph_type, val_to_col_name_dict=val_to_col_name_dict)
homs_tmp

100%|██████████| 12/12 [00:00<00:00, 12.49it/s]


Unnamed: 0,value,filename,column_name,type,subtype,contents_row
0,0,row_id_airport_code_continent_code_airport_ele...,airport_code,symbolic,code,"[0, Usina Mandu Airport, 1597]"
1,10,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[CCK, Cocos (Keeling) Islands Airport, 10]"
2,0,row_id_airport_code_continent_code_airport_ele...,airport_code,symbolic,code,"[0, Venâncio Aires Airport, 226]"
3,100,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[NDA, Bandanaira Airport, 100]"
4,10,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[ARD, Mali Airport, 10]"
...,...,...,...,...,...,...
1139,900,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[ZZV, Zanesville Municipal Airport, 900]"
1140,13,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[ARO, Arboletes Airport, 13]"
1141,626,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[LNN, Willoughby Lost Nation Municipal Airport..."
1142,9,row_id_airport_code_continent_code_airport_ele...,airport_elevation,symbolic,numeric,"[FID, Elizabeth Field, 9]"


In [55]:
# Unique homograph values found in the testing tables
homs_tmp['value'].unique()

array(['0', '10', '100', '9', '13', '200', '90', '15', '650', '98', '164',
       '545', '42', '760', 'RVR', '325', '740', 'DNA', '88', '940', 'SIS',
       '3500', 'MKZ', '80', '300', '1500', 'TSX', '207', '1', '6000',
       '928', 'MPV', '120', '228', 'LHS', '57', '600', '745', 'SRX',
       '626', 'LUV', 'GTO', '430', '323', 'SLX', '2500', '750', '924',
       'GLI', '550', '62', '911', 'SSR', 'LOL', '960', '900', '9000',
       '525', '240', '500', '944', '330', 'MKS', '645', 'STS', '1000',
       'GTI', '968', 'APV', '914', '929', 'DBS', 'GLC', 'LSS', 'RSX'],
      dtype=object)