In [47]:
import os
import json
import pandas as pd
from tqdm import tqdm
from pathlib import Path

# Functions

In [50]:
def get_val_to_col_name_dict(input_dir):
    '''
    Build the `val_to_col_name_dict` for the raw csv files found in `input_dir`.

    Every regular file directly inside `input_dir` is read with `pd.read_csv`
    and each cell value is mapped to the set of column names under which that
    value occurs in any of the files. A value that ends up mapped to more than
    one column name is what `get_homographs_and_clean_dataset` reports as a
    homograph.

    Parameters
    ----------
    input_dir : str or path-like
        Directory containing the raw csv files. A trailing path separator is
        optional. Sub-directories are skipped.

    Returns
    -------
    dict
        Maps each cell value to a `set` of column names. Missing cells are not
        filtered out here.
    '''
    val_to_col_name_dict = {}

    # Sorted so the files are always visited in the same order.
    for filename in tqdm(sorted(os.listdir(input_dir))):
        file_path = os.path.join(input_dir, filename)
        # Notebook folders often contain directories such as
        # `.ipynb_checkpoints`, which `pd.read_csv` cannot open.
        if not os.path.isfile(file_path):
            continue

        df = pd.read_csv(file_path)
        column_names = list(df.columns)

        # Rows are walked with `iterrows` on purpose: it hands back each row as
        # a single Series, so values go through the same dtype coercion as in
        # `get_homographs_and_clean_dataset`, which looks the keys up again.
        for _, row in df.iterrows():
            # Pair values with their column by position; integer indexing on a
            # label-indexed Series (`row[i]`) is deprecated in pandas.
            for column_name, value in zip(column_names, row.tolist()):
                val_to_col_name_dict.setdefault(value, set()).add(column_name)

    return val_to_col_name_dict

def get_homographs_and_clean_dataset(input_dir, output_dir, column_name_to_homograph_type, val_to_col_name_dict):
    '''
    Identify all homographs and write homograph-free copies of the input tables.

    A cell value is treated as a homograph when `val_to_col_name_dict` maps it
    to more than one column name. One record is collected per homograph cell
    and the records are pickled to `output_dir/homograph_info_df.pickle`.
    For every input table a copy in which each row containing at least one
    homograph has been dropped is written to `output_dir/no_homographs/`
    under the same filename.

    Parameters
    ----------
    input_dir : str or path-like
        Directory containing the raw csv files. Sub-directories are skipped.
    output_dir : str or path-like
        Destination directory; created (with `no_homographs/`) if missing.
        A trailing path separator is optional for both directories.
    column_name_to_homograph_type : dict
        Maps a column name to a dict providing the keys 'type' and 'subtype'.
    val_to_col_name_dict : dict
        Maps a cell value to the set of column names it appears under, as
        returned by `get_val_to_col_name_dict`.

    Returns
    -------
    pandas.DataFrame
        The `homograph_info_df` with columns value, filename, column_name,
        type, subtype and contents_row (the full row as a list).

    Raises
    ------
    KeyError
        If a non-missing cell value is absent from `val_to_col_name_dict`, or
        a homograph's column is absent from `column_name_to_homograph_type`.
    '''
    no_homographs_dir = os.path.join(output_dir, 'no_homographs')
    Path(no_homographs_dir).mkdir(parents=True, exist_ok=True)

    info_columns = ['value', 'filename', 'column_name', 'type', 'subtype', 'contents_row']
    homograph_records = []

    # Sorted so the order of the collected records is reproducible.
    for filename in tqdm(sorted(os.listdir(input_dir))):
        file_path = os.path.join(input_dir, filename)
        # Directories such as `.ipynb_checkpoints` cannot be read as csv.
        if not os.path.isfile(file_path):
            continue

        df = pd.read_csv(file_path)
        column_names = list(df.columns)
        row_ids_with_homographs = set()

        for idx, row in df.iterrows():
            row_values = row.tolist()
            # Pair values with their column by position; integer indexing on a
            # label-indexed Series (`row[i]`) is deprecated in pandas.
            for column_name, value in zip(column_names, row_values):
                # A missing cell is not a homograph. NaN never compares equal
                # to a stored key, so looking it up could raise KeyError.
                if pd.isna(value):
                    continue
                # Check if value is a homograph
                if len(val_to_col_name_dict[value]) > 1:
                    homograph_type = column_name_to_homograph_type[column_name]
                    row_ids_with_homographs.add(idx)
                    homograph_records.append({
                        'value': value,
                        'filename': filename,
                        'column_name': column_name,
                        'type': homograph_type['type'],
                        'subtype': homograph_type['subtype'],
                        'contents_row': row_values,
                    })

        df_no_homographs = df.drop(index=sorted(row_ids_with_homographs))
        df_no_homographs.to_csv(os.path.join(no_homographs_dir, filename), index=False)

    # Explicit columns keep the frame's layout stable even with zero records.
    homograph_info_df = pd.DataFrame(homograph_records, columns=info_columns)
    homograph_info_df.to_pickle(os.path.join(output_dir, 'homograph_info_df.pickle'))

    return homograph_info_df

In [51]:
# Location of the raw benchmark tables and of the artefacts derived from them.
raw_input_dir = '../DATA/synthetic_benchmark/'
output_dir = 'datasets/synthetic_benchmark/'

# Column name -> {'type': ..., 'subtype': ...} annotations, kept in a JSON file.
with open('column_name_to_homograph_type.json') as f:
    column_name_to_homograph_type = json.load(f)

# First pass: index every cell value by the column names it appears under.
val_to_col_name_dict = get_val_to_col_name_dict(input_dir=raw_input_dir)

# Second pass: collect the homographs and write the homograph-free tables.
homograph_info_df = get_homographs_and_clean_dataset(
    input_dir=raw_input_dir,
    output_dir=output_dir,
    column_name_to_homograph_type=column_name_to_homograph_type,
    val_to_col_name_dict=val_to_col_name_dict,
)
homograph_info_df

100%|██████████| 13/13 [00:00<00:00, 13.15it/s]
100%|██████████| 13/13 [00:00<00:00, 15.52it/s]


Unnamed: 0,value,filename,column_name,type,subtype,contents_row
0,Sydney,location_city_country.csv,city,traditional,,"[Sydney, Australia]"
1,Sydney,location_city_country.csv,city,traditional,,"[Sydney, Australia]"
2,Cuba,location_city_country.csv,country,traditional,,"[Bartolomé Masó, Cuba]"
3,Elmira,location_city_country.csv,city,traditional,,"[Elmira, United States]"
4,Lincoln,location_city_country.csv,city,traditional,,"[Lincoln, United States]"
...,...,...,...,...,...,...
911,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[Jekyll & Hyde, Horror, ID]"
912,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,[Don't Torture a Duckling (Non si sevizia un p...
913,CA,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[My Gun is Quick, Mystery, CA]"
914,ID,product_movie_title_movie_genre_country_code.csv,country_code,symbolic,code,"[Savages, The, Comedy|Drama, ID]"
