# Extract Wikidata IDs from OpenRefine

In [34]:
import pandas as pd
import requests
from io import StringIO
import uuid

In [35]:
# Export all rows of the OpenRefine project as CSV through the
# `export-rows` command endpoint.
url = "http://129.194.213.75/command/core/export-rows"
params = {
    'format': 'csv',
    'project': '2643014612954'
}

# (connect, read) timeouts keep the cell from hanging forever when the
# server is unreachable or stalls mid-response.
response = requests.post(url, params=params, timeout=(10, 300))

# Stop here on failure: merely printing would leave `df` undefined (or stale
# from a previous run) and the following cells would fail or use old data.
if response.status_code != 200:
    raise RuntimeError(f"Failed to load data. Status code: {response.status_code}")

# Read the content of the response directly into a DataFrame; every column
# is kept as text so identifiers are not coerced to numbers.
df = pd.read_csv(StringIO(response.text), dtype ='str', low_memory=False)

In [36]:
# Preview the first rows of the OpenRefine export to check it loaded as expected.
df.head()

Unnamed: 0,id_exhibition_section,id_personage,wikidata_id,full_name_wd,no_wd_full_name,id,name,first_name,membership,biography,...,country_d,x_address_d,y_address_d,x_city_d,y_city_d,gender_lower,address1,id_address1,address2,id_address2
0,3194,37218,,Yannis Manganaris,Yannis Manganaris,33569,Manganaris,Yannis,,,...,,,,,,m,,,,
1,3875,49029,,Lionel Abrams,Lionel Abrams,41584,Abrams,Lionel,,,...,,,,,,m,,,,
2,3875,49030,,Walter Batiss,Walter Batiss,41585,Battiss,Walter,,,...,,,,,,m,,,,
3,3194,37221,,Maurice Farid,Maurice Farid,33572,Farid,Maurice,,,...,,,,,,m,,,,
4,3194,37222,,Mayez Rizk Ayoub,Mayez Rizk Ayoub,33573,Rizk Ayoub,Mayez,,,...,,,,,,,,,,


In [37]:
# Load the local personage (exhibitor) export, reading every column as text.
exhibitor_csv_path = "/Users/carboni/Downloads/Personage data.csv"
exhibitor = pd.read_csv(exhibitor_csv_path, dtype='str')

In [38]:
# Build "<first_name> <name>" as the display name.
# Plain string concatenation yields NaN as soon as either part is missing,
# which would discard the surname of every record without a first name.
# Missing parts are therefore treated as empty, surrounding whitespace is
# trimmed, and only a name that ends up completely empty is left missing.
first_part = exhibitor["first_name"].fillna("")
last_part = exhibitor["name"].fillna("")
full_name = (first_part + " " + last_part).str.strip()
exhibitor["full_name_wd"] = full_name.mask(full_name == "")

In [39]:
# Preview the exhibitor table, including the newly added `full_name_wd` column.
exhibitor.head()

Unnamed: 0,id,name,first_name,id_birth_address,id_death_address,id_birth_date,id_death_date,gender,membership,notes,...,birth_address,death_address,id_user_artist,biography,instructor,address1,id_address1,address2,id_address2,full_name_wd
0,37218,Manganaris,Yannis,14647.0,,1713.0,,M,,,...,,,,,,,,,,Yannis Manganaris
1,49029,Abrams,Lionel,,,3097.0,,M,,,...,,,,,,,,,,Lionel Abrams
2,49030,Battiss,Walter,,,1090.0,,M,,,...,,,,,,,,,,Walter Battiss
3,37221,Farid,Maurice,15109.0,,1704.0,,M,,,...,,,,,,,,,,Maurice Farid
4,37222,Rizk Ayoub,Mayez,,,,,,,,...,,,,,,,,,,Mayez Rizk Ayoub


In [40]:
# Keep only the columns needed to attach Wikidata information to a person.
# The export also carries `id_exhibition_section`, so the same person may
# appear on several rows — NOTE(review): confirm against the OpenRefine project.
# Reduce to one row per `id_personage` so the later left merge cannot multiply
# exhibitor rows; rows that have a `wikidata_id` are preferred, and the
# original order is otherwise preserved (stable sort on the null flag).
df_wiki = (
    df[['id_personage', 'wikidata_id', 'full_name_wd']]
    .sort_values('wikidata_id', key=lambda ids: ids.isna(), kind='stable')
    .drop_duplicates(subset='id_personage', keep='first')
)

In [41]:
# Give the exhibitor key the same name as in the OpenRefine export so both
# tables can be joined on `id_personage`.
exhibitor = exhibitor.rename(columns={'id': 'id_personage'})

In [42]:
# Left join: keep every exhibitor row and attach the Wikidata columns where
# a matching `id_personage` exists.
merged_data = pd.merge(
    exhibitor,
    df_wiki,
    how='left',
    on='id_personage',
)

In [43]:
# Preview the join result; `full_name_wd` exists in both inputs, so it shows
# up twice with the `_x` (exhibitor) and `_y` (OpenRefine) suffixes.
merged_data.head()

Unnamed: 0,id_personage,name,first_name,id_birth_address,id_death_address,id_birth_date,id_death_date,gender,membership,notes,...,id_user_artist,biography,instructor,address1,id_address1,address2,id_address2,full_name_wd_x,wikidata_id,full_name_wd_y
0,37218,Manganaris,Yannis,14647.0,,1713.0,,M,,,...,,,,,,,,Yannis Manganaris,,Yannis Manganaris
1,49029,Abrams,Lionel,,,3097.0,,M,,,...,,,,,,,,Lionel Abrams,,Lionel Abrams
2,49030,Battiss,Walter,,,1090.0,,M,,,...,,,,,,,,Walter Battiss,,Walter Batiss
3,37221,Farid,Maurice,15109.0,,1704.0,,M,,,...,,,,,,,,Maurice Farid,,Maurice Farid
4,37222,Rizk Ayoub,Mayez,,,,,,,,...,,,,,,,,Mayez Rizk Ayoub,,Mayez Rizk Ayoub


In [44]:
# Summarise row count, column dtypes and non-null counts of the merged table.
merged_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74579 entries, 0 to 74578
Data columns (total 28 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id_personage      74579 non-null  object
 1   name              74540 non-null  object
 2   first_name        66428 non-null  object
 3   id_birth_address  26194 non-null  object
 4   id_death_address  1214 non-null   object
 5   id_birth_date     10925 non-null  object
 6   id_death_date     2909 non-null   object
 7   gender            59061 non-null  object
 8   membership        3743 non-null   object
 9   notes             0 non-null      object
 10  source            0 non-null      object
 11  id_type           74579 non-null  object
 12  id_person         0 non-null      object
 13  nationality       12885 non-null  object
 14  id_user           74579 non-null  object
 15  timestamp         74579 non-null  object
 16  birth_address     11644 non-null  object
 17  death_addres

In [45]:
# Prefer the name coming from OpenRefine (`_y`); where it is missing, fall
# back to the name built from the exhibitor table (`_x`).
merged_data['Full_name'] = merged_data['full_name_wd_y'].fillna(
    merged_data['full_name_wd_x']
)

In [46]:
# Remove the two intermediate name columns (now folded into `Full_name`)
# together with the `id_user` and `timestamp` columns.
obsolete_columns = ['full_name_wd_y', 'full_name_wd_x', 'id_user', 'timestamp']
merged_data = merged_data.drop(obsolete_columns, axis=1)

In [47]:
def generate_uuid(name):
    """Return a deterministic UUID string derived from ``name``.

    The argument is converted with ``str()`` and hashed into a name-based
    version-3 UUID under ``uuid.NAMESPACE_DNS``; equal inputs therefore
    always produce the same identifier.

    Parameters
    ----------
    name : any
        Value to derive the identifier from (stringified before hashing).

    Returns
    -------
    str
        The UUID in its canonical 36-character textual form.
    """
    return str(uuid.uuid3(uuid.NAMESPACE_DNS, str(name)))

In [48]:
# Derive one name-based UUID per row from `Full_name`.
# `na_action='ignore'` leaves rows without a name as missing instead of
# passing NaN to `generate_uuid`, which would stringify it to "nan" and hand
# the very same UUID to every unnamed record.
merged_data['uuid'] = merged_data['Full_name'].map(generate_uuid, na_action='ignore')

In [49]:
# Preview the table with the consolidated `Full_name` and the new `uuid` column.
merged_data.head()

Unnamed: 0,id_personage,name,first_name,id_birth_address,id_death_address,id_birth_date,id_death_date,gender,membership,notes,...,id_user_artist,biography,instructor,address1,id_address1,address2,id_address2,wikidata_id,Full_name,uuid
0,37218,Manganaris,Yannis,14647.0,,1713.0,,M,,,...,,,,,,,,,Yannis Manganaris,7c1dac39-2918-3cf8-bb24-3b9b194edede
1,49029,Abrams,Lionel,,,3097.0,,M,,,...,,,,,,,,,Lionel Abrams,9e34b27c-7a96-32ed-bdf6-f2529ea23675
2,49030,Battiss,Walter,,,1090.0,,M,,,...,,,,,,,,,Walter Batiss,d859377e-8bc5-3a5b-b3fb-e84c2dca8fcb
3,37221,Farid,Maurice,15109.0,,1704.0,,M,,,...,,,,,,,,,Maurice Farid,2e7ef02a-1e79-3650-bf7e-093cfb853888
4,37222,Rizk Ayoub,Mayez,,,,,,,,...,,,,,,,,,Mayez Rizk Ayoub,0aa3f273-9fbb-30b5-b20f-941cae107f1b


In [50]:
columns = merged_data.columns

# Map every column label that contains a space to the same label with
# underscores, then apply all renames in a single call.
space_renames = {
    label: label.replace(' ', '_')
    for label in columns
    if ' ' in label
}
merged_data = merged_data.rename(columns=space_renames)

In [51]:
# Discard columns in which every single value is missing.
merged_data = merged_data.dropna(how='all', axis='columns')

In [56]:
# Write the final table as CSV, without the DataFrame index.
csv_output_path = '/Users/carboni/Downloads/uuid_exhibitor.csv'
merged_data.to_csv(csv_output_path, index=False)

In [57]:
# Write the same table as indented XML under a <data> root element,
# again without the DataFrame index.
merged_data.to_xml(
    '/Users/carboni/Documents/UNIGE/basart_downloads/2023/data/processed/personage/personage_wd.xml',
    index=False,
    root_name="data",
    pretty_print=True,
)