In [47]:
import pandas as pd

In [53]:
def read_csv(file, separator):
    df = pd.read_csv(file,  sep=separator, quotechar='"', engine='python', on_bad_lines='skip')
    return df

def clean_df(df,dropped_columns):
    clean_df = df.drop(dropped_columns,axis=1)
    for col in clean_df.columns:
        clean_df[col] = clean_df[col].apply(lambda x: str(x).strip().lower() if isinstance(x,str) else x)
    clean_df['phone'] = pd.to_numeric(clean_df['phone'],downcast='integer',errors='coerce')
    return clean_df

In [56]:
facebook_dataset = './datasets/facebook_dataset.csv'
df_facebook = read_csv(facebook_dataset, ',')

google_dataset = './datasets/google_dataset.csv'
df_google = read_csv(google_dataset, ',')

website_dataset = './datasets/website_dataset.csv'
df_website = read_csv(website_dataset, ';')

facebook_dropped_columns = ['country_code', 'description', 'link', 'page_type', 'phone_country_code', 'region_code', 'zip_code']
google_dropped_columns = ['country_code', 'phone_country_code', 'raw_address', 'raw_phone', 'region_code', 'text', 'zip_code']
website_dropped_columns = ['domain_suffix', 'language', 'tld']

df_facebook_clean = clean_df(df_facebook,facebook_dropped_columns)
df_google_clean = clean_df(df_google,google_dropped_columns)
df_website_clean = clean_df(df_website, website_dropped_columns)


df_website_clean.rename(columns={'root_domain': 'domain'}, inplace=True)

In [65]:
df_google_facebook = df_google_clean.merge(df_facebook_clean, on=['domain','phone'], how='outer')
df_google_facebook.columns = [col.replace("_x","_facebook").replace("_y","_google") for col in  df_google_facebook.columns]

print(df_google_facebook.columns)

Index(['address_facebook', 'category', 'city_facebook',
       'country_name_facebook', 'name_facebook', 'phone',
       'region_name_facebook', 'domain', 'address_google', 'categories',
       'city_google', 'country_name_google', 'email', 'name_google',
       'region_name_google'],
      dtype='object')


In [77]:
df_merged = df_google_facebook.merge(df_website_clean, on=['domain','phone'], how='outer')

In [79]:
reordered_columns = ['domain','phone',
                     'legal_name','site_name','name_facebook','name_google',
                     's_category','category','categories','email',
                     'main_country','country_name_facebook','country_name_google',
                     'main_city','city_facebook','city_google',
                     'main_region','region_name_facebook','region_name_google',
                     'address_facebook','address_google']
df_merged = df_merged[reordered_columns]

In [80]:
df_merged.to_csv("./results/merged_dataset.csv",index=False)