# Extract Wikidata IDs from OpenRefine

In [29]:
import pandas as pd
import requests
from io import StringIO
import uuid

In [30]:
# Download the project's rows as CSV from the export-rows endpoint and load
# them into `df`, reading every column as text.
url = "http://129.194.213.75/command/core/export-rows"
params = {
    'format': 'csv',
    'project': '2623392316428',
}

# A timeout keeps the cell from hanging forever if the server stops answering.
response = requests.post(url, params=params, timeout=120)

# Fail loudly on anything other than 200: merely printing would leave `df`
# undefined (or, in a re-run notebook, silently pointing at stale data from a
# previous execution) for the cells that follow.
if response.status_code != 200:
    raise RuntimeError(f"Failed to load data. Status code: {response.status_code}")

# Read the content of the response directly into a DataFrame
df = pd.read_csv(StringIO(response.text), dtype='str', low_memory=False)

In [31]:
# Preview the first rows of the downloaded export.
df.head()

Unnamed: 0,id_exhibition_section,identity,randomINT,facet_count,id_personage,id,name,first_name,full_name,wikidata_id,...,country_d,x_address_d,y_address_d,x_city_d,y_city_d,gender_lower,address1,id_address1,address2,id_address2
0,3194,,320860,2,37218,33569,Manganaris,Yannis,Manganaris Yannis,,...,,,,,,m,,,,
1,3875,,517062,2,49029,41584,Abrams,Lionel,Abrams Lionel,,...,,,,,,m,,,,
2,3875,,257730,3,49030,41585,Battiss,Walter,Walter Battiss,,...,,,,,,m,,,,
3,3194,,792446,4,37221,33572,Farid,Maurice,Farid Maurice,,...,,,,,,m,,,,
4,3194,,583494,1,37222,33573,Rizk Ayoub,Mayez,Rizk Ayoub Mayez,,...,,,,,,,,,,


In [39]:
# Load the processed exhibitor table; dtype="str" reads every column as text.
exhibitor = pd.read_csv(
    filepath_or_buffer="/Users/carboni/Documents/UNIGE/basart_downloads/2023/data/processed/personage/Exhibitor.csv",
    dtype="str",
)

In [40]:
# Build "<first_name> <name>"; a row missing either part stays NaN.
exhibitor["full_name"] = exhibitor["first_name"].str.cat(exhibitor["name"], sep=" ")

In [41]:
# Preview the exhibitor table, including the new full_name column.
exhibitor.head()

Unnamed: 0,id_exhibition_section,id_personage,id,name,first_name,membership,biography,nationality,instructor,notes,...,x_address_d,y_address_d,x_city_d,y_city_d,gender_lower,address1,id_address1,address2,id_address2,full_name
0,3194,37218,33569,Manganaris,Yannis,,,,,,...,,,,,m,,,,,Yannis Manganaris
1,3875,49029,41584,Abrams,Lionel,,,,,,...,,,,,m,,,,,Lionel Abrams
2,3875,49030,41585,Battiss,Walter,,,,,,...,,,,,m,,,,,Walter Battiss
3,3194,37221,33572,Farid,Maurice,,,,,,...,,,,,m,,,,,Maurice Farid
4,3194,37222,33573,Rizk Ayoub,Mayez,,,,,,...,,,,,,,,,,Mayez Rizk Ayoub


In [42]:
# Keep only the join key, the Wikidata identifier and the name from the export.
df_wiki = df.loc[:, ['id_personage', 'wikidata_id', 'full_name']]

In [43]:
# Left join: every exhibitor row is kept and gains the export's columns where
# id_personage matches. Column names present on both sides are suffixed.
merged_data = pd.merge(
    exhibitor,
    df_wiki,
    how='left',
    on='id_personage',
    suffixes=('_exhibitor', '_wiki'),
)

In [44]:
# Preview the merged table.
merged_data.head()

Unnamed: 0,id_exhibition_section,id_personage,id,name,first_name,membership,biography,nationality,instructor,notes,...,x_city_d,y_city_d,gender_lower,address1,id_address1,address2,id_address2,full_name_exhibitor,wikidata_id,full_name_wiki
0,3194,37218,33569,Manganaris,Yannis,,,,,,...,,,m,,,,,Yannis Manganaris,,Manganaris Yannis
1,3875,49029,41584,Abrams,Lionel,,,,,,...,,,m,,,,,Lionel Abrams,,Abrams Lionel
2,3875,49030,41585,Battiss,Walter,,,,,,...,,,m,,,,,Walter Battiss,,Walter Battiss
3,3194,37221,33572,Farid,Maurice,,,,,,...,,,m,,,,,Maurice Farid,,Farid Maurice
4,3194,37222,33573,Rizk Ayoub,Mayez,,,,,,...,,,,,,,,Mayez Rizk Ayoub,,Rizk Ayoub Mayez


In [None]:
# Prefer the name coming from the right-hand (export) side of the merge and
# fall back to the exhibitor-side name where it is missing. The merge uses
# suffixes ('_exhibitor', '_wiki'), so the overlapping columns are
# full_name_exhibitor / full_name_wiki; the pandas-default Full_name_x /
# Full_name_y labels do not exist and would raise KeyError.
merged_data['full_name'] = merged_data['full_name_wiki'].combine_first(merged_data['full_name_exhibitor'])

In [8]:
# Rebuild the name as "<first_name> <name>". Missing parts are treated as
# empty strings so that one absent component does not turn the whole name into
# NaN, which generate_uuid would hash as the literal text "nan" and so give
# every such row the same UUID. strip() removes the stray space left behind
# when one of the parts is empty.
merged_data["full_name"] = (
    merged_data["first_name"].fillna("")
    + " "
    + merged_data["name"].fillna("")
).str.strip()

In [17]:
# Remove leading and trailing whitespace from every full name.
merged_data["full_name"] = merged_data["full_name"].str.strip()

In [13]:
def generate_uuid(name):
    """Return a deterministic UUID string derived from *name*.

    The value is converted with ``str()`` and hashed with ``uuid.uuid3``
    under ``uuid.NAMESPACE_DNS``, so equal inputs always give the same UUID.
    Because of the ``str()`` conversion, a non-string input such as a float
    NaN is hashed as its text form (``"nan"``).
    """
    namespace = uuid.NAMESPACE_DNS
    derived = uuid.uuid3(namespace, str(name))
    return str(derived)

In [19]:
# Derive one name-based UUID per row from its full name.
merged_data['uuid'] = merged_data['full_name'].map(generate_uuid)

In [20]:
# Preview the merged table with the new uuid column.
merged_data.head()

Unnamed: 0,id_exhibition_section,id_personage,id,name,first_name,membership,biography,nationality,instructor,notes,...,x_city_d,y_city_d,gender_lower,address1,id_address1,address2,id_address2,wikidata_id,full_name,uuid
0,3194,37218,33569,Manganaris,Yannis,,,,,,...,,,m,,,,,,Yannis Manganaris,7c1dac39-2918-3cf8-bb24-3b9b194edede
1,3875,49029,41584,Abrams,Lionel,,,,,,...,,,m,,,,,,Lionel Abrams,9e34b27c-7a96-32ed-bdf6-f2529ea23675
2,3875,49030,41585,Battiss,Walter,,,,,,...,,,m,,,,,,Walter Battiss,34153523-e149-3042-95e0-ffcfd816162e
3,3194,37221,33572,Farid,Maurice,,,,,,...,,,m,,,,,,Maurice Farid,2e7ef02a-1e79-3650-bf7e-093cfb853888
4,3194,37222,33573,Rizk Ayoub,Mayez,,,,,,...,,,,,,,,,Mayez Rizk Ayoub,0aa3f273-9fbb-30b5-b20f-941cae107f1b


In [16]:
columns = merged_data.columns

# Replace spaces with underscores in every column name that contains one.
# The mapping is built first and applied in a single rename call instead of
# renaming the frame in place once per column while looping over its labels.
merged_data.rename(
    columns={col: col.replace(' ', '_') for col in columns if ' ' in col},
    inplace=True,
)

In [21]:
# Write the merged table to CSV without the row index.
merged_data.to_csv(
    path_or_buf='/Users/carboni/uuid_exhibitor.csv',
    index=False,
)

In [22]:
# Write the merged table as pretty-printed XML under a <data> root element.
merged_data.to_xml(
    path_or_buffer='/Users/carboni/Documents/UNIGE/basart_downloads/2023/data/processed/personage/personage_wd.xml',
    root_name="data",
    pretty_print=True,
)