In [26]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Image, HTML
from PIL import Image
import io
import os
import base64
from urllib.parse import urlparse
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [27]:
class WikidataPaintingScraper:
    def __init__(self):
        self.endpoint_url = "https://query.wikidata.org/sparql"

    def run_query(self, limit=1000):
        query = f"""
        SELECT DISTINCT
        ?painting ?paintingLabel ?image ?artist ?artistLabel
        ?inception ?materialLabel ?collection ?collectionLabel
        ?location ?locationLabel ?creationLocation ?creationLocationLabel
        ?height ?width ?movement ?movementLabel
        WHERE {{
          ?painting wdt:P31 wd:Q3305213;    # painting
                   wdt:P18 ?image;          # image
                   wdt:P170 ?artist;        # creator
                   wdt:P571 ?inception;     # date
                   wdt:P186 ?material;      # material
                   wdt:P195 ?collection;    # collection
                   wdt:P276 ?location;      # location
                   wdt:P2048 ?height;       # height
                   wdt:P2049 ?width.        # width

          OPTIONAL {{ ?painting wdt:P1071 ?creationLocation. }}
          OPTIONAL {{ ?painting wdt:P135 ?movement. }}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT {limit}
        """

        try:
            response = requests.get(
                self.endpoint_url,
                params={'query': query, 'format': 'json'},
                headers={'User-Agent': 'Mozilla/5.0'},
                timeout=60
            )
            return response.json()
        except Exception as e:
            print(f"Error: {e}")
            return None

In [28]:
scraper = WikidataPaintingScraper()
results = scraper.run_query(limit=10000)

In [29]:
if results:
    paintings_data = []
    for item in results['results']['bindings']:
        painting = {
            'title': item.get('paintingLabel', {}).get('value', ''),
            'image_url': item.get('image', {}).get('value', ''),
            'artist': item.get('artistLabel', {}).get('value', ''),
            'year': item.get('inception', {}).get('value', '')[:4] if 'inception' in item else '',
            'medium': item.get('materialLabel', {}).get('value', ''),
            'collection': item.get('collectionLabel', {}).get('value', ''),
            'current_location': item.get('locationLabel', {}).get('value', ''),
            'creation_location': item.get('creationLocationLabel', {}).get('value', ''),
            'height': float(item.get('height', {}).get('value', 0)) if 'height' in item else 0,
            'width': float(item.get('width', {}).get('value', 0)) if 'width' in item else 0,
            'movement': item.get('movementLabel', {}).get('value', ''),
        }
        paintings_data.append(painting)

    df = pd.DataFrame(paintings_data)
    print(f"Successfully collected {len(df)} paintings!")
else:
    print("Failed to fetch data")
    df = pd.DataFrame()

Successfully collected 10000 paintings!


In [30]:
import re

initial_count = len(df)

# Remove duplicates based on title and artist
df = df.drop_duplicates(subset=['title', 'artist'])

# Remove entries with empty titles
df = df[df['title'].str.len() > 0]

# Remove entries with empty image URLs
df = df[df['image_url'].str.len() > 0]

# Remove entries with empty movements
df = df[df['movement'].str.len() > 0]

# Remove any URL-like patterns, QIDs
def clean_artist_name(artist):
    if not artist:
        return None

    artist_str = str(artist)

    # Remove entries that are primarily Wikidata QIDs
    if re.match(r'^Q\d+$', artist_str.strip()):
        return None

    # Remove any URL patterns or Wikidata identifiers
    if 'http' in artist_str or '/entity/Q' in artist_str:
        return None

    # Remove any HTML tags
    artist_str = re.sub(r'<[^>]+>', '', artist_str)

    # Remove Wikidata QIDs
    artist_str = re.sub(r'\(Q\d+\)', '', artist_str)
    artist_str = re.sub(r'\bQ\d+\b', '', artist_str)

    # Remove common Wikidata patterns
    patterns_to_remove = [
        r'wd:Q\d+',  # wd:Q12345
        r'entity/Q\d+',  # entity/Q12345
        r'www.wikidata.org',  # any wikidata URLs
    ]

    for pattern in patterns_to_remove:
        artist_str = re.sub(pattern, '', artist_str)

    # Clean up extra spaces and check if result is meaningful
    artist_str = re.sub(r'\s+', ' ', artist_str).strip()

    # If the result is empty, very short, or still looks like an ID, return None
    if (not artist_str or
        len(artist_str) < 2 or
        re.match(r'^[QW]\d+$', artist_str) or
        'wikidata' in artist_str.lower()):
        return None

    return artist_str

df['artist'] = df['artist'].apply(clean_artist_name)

# Remove entries where artist name is None or empty after cleaning
df = df[df['artist'].notna() & (df['artist'].str.len() > 0)]

# Remove any artists that still contain QID patterns
def has_qid_pattern(artist_name):
    """Check if artist name still contains QID patterns"""
    if not artist_name:
        return True
    patterns = [
        r'Q\d+',  # Any Q followed by numbers
        r'wd:Q',  # wd:Q pattern
        r'entity/Q',  # entity/Q pattern
    ]
    return any(re.search(pattern, str(artist_name)) for pattern in patterns)

# Remove entries that still have QID patterns in artist names
before_qid_filter = len(df)
df = df[~df['artist'].apply(has_qid_pattern)]
qid_removed = before_qid_filter - len(df)

# Convert year to integer (not float), remove entries with invalid years
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df = df[df['year'].notna()]

# Convert year to integer type
df = df[df['year'].between(1200, 2024)]  # Reasonable year range
df['year'] = df['year'].astype(int)

# Handle missing values
df['creation_location'] = df['creation_location'].fillna('Unknown')
df['creation_location'] = df['creation_location'].replace('', 'Unknown')
df['current_location'] = df['current_location'].fillna('Unknown')
df['current_location'] = df['current_location'].replace('', 'Unknown')
df['collection'] = df['collection'].fillna('Unknown')
df['collection'] = df['collection'].replace('', 'Unknown')
df['medium'] = df['medium'].fillna('Unknown')
df['medium'] = df['medium'].replace('', 'Unknown')

# Remove entries with invalid dimensions (negative or zero values)
df = df[(df['height'] > 0) & (df['width'] > 0)]

# Remove any remaining rows with critical missing data
critical_columns = ['title', 'artist', 'year', 'image_url', 'movement']
df = df.dropna(subset=critical_columns)

# Reset and create orderly IDs
df = df.reset_index(drop=True)
df['id'] = range(1, len(df) + 1)

# Reorder columns to have ID first
column_order = ['id'] + [col for col in df.columns if col != 'id']
df = df[column_order]

final_count = len(df)
total_removed = initial_count - final_count

print(f"Cleaning complete: {initial_count} → {final_count} paintings ({total_removed} removed)")


# Verify no QIDs remain in artist names
def check_for_remaining_qids(df):
    """Check if any QID patterns remain in artist names"""
    qid_patterns = [
        r'Q\d+',  # Any Q followed by numbers
        r'wd:Q',  # wd:Q pattern
        r'entity/Q',  # entity/Q pattern
    ]
    problematic_artists = []

    for artist in df['artist']:
        for pattern in qid_patterns:
            if re.search(pattern, str(artist)):
                problematic_artists.append(artist)
                break

    return problematic_artists

remaining_qids = check_for_remaining_qids(df)
if remaining_qids:
    print(f"\n Warning: Found {len(remaining_qids)} artists with remaining QID patterns:")
    for artist in remaining_qids[:5]:  # Show first 5
        print(f"   - {artist}")
else:
    print(f"\n Success: No QID patterns found in artist names!")

Cleaning complete: 10000 → 1196 paintings (8804 removed)

 Success: No QID patterns found in artist names!


In [31]:
print("Dataset Information:")
print(f"Shape: {df.shape}")
display(df)

Dataset Information:
Shape: (1196, 12)


Unnamed: 0,id,title,image_url,artist,year,medium,collection,current_location,creation_location,height,width,movement
0,1,Manaò tupapaú,http://commons.wikimedia.org/wiki/Special:File...,Paul Gauguin,1892,oil paint,Buffalo AKG Art Museum,Buffalo AKG Art Museum,Tahiti,28.75,92.4,primitivism
1,2,Virgin of the Councillors,http://commons.wikimedia.org/wiki/Special:File...,Lluís Dalmau,1445,oak panel,Museu Nacional d'Art de Catalunya,Museu Nacional d'Art de Catalunya,Unknown,311.00,311.5,Gothic painting
2,3,The Peasant Wedding,http://commons.wikimedia.org/wiki/Special:File...,Pieter Brueghel the Elder,1568,oil paint,Kunsthistorisches Museum,Q820370,Unknown,113.00,164.0,Northern Renaissance
3,4,The Blue Boy,http://commons.wikimedia.org/wiki/Special:File...,Thomas Gainsborough,1770,oil paint,"The Huntington Library, Art Museum, and Botani...","The Huntington Library, Art Museum, and Botani...",England,179.40,123.8,Romanticism
4,5,Madonna with the Long Neck,http://commons.wikimedia.org/wiki/Special:File...,Parmigianino,1530,panel,Uffizi Gallery,Uffizi Gallery,Unknown,219.00,132.0,Mannerism
...,...,...,...,...,...,...,...,...,...,...,...,...
1191,1192,The Legend of Polydoros,http://commons.wikimedia.org/wiki/Special:File...,Titian,1506,panel,Civic Museums of Padua,Civic Museums of Padua,Unknown,35.00,162.0,High Renaissance
1192,1193,The Duel After the Masquerade,http://commons.wikimedia.org/wiki/Special:File...,Jean-Léon Gérôme,1857,oil paint,Condé Museum,Condé Museum,Unknown,68.00,99.0,academic art
1193,1194,Galitzin Triptych,http://commons.wikimedia.org/wiki/Special:File...,Pietro Perugino,1481,tempera,National Gallery of Art,National Gallery of Art,Unknown,95.00,30.0,Italian Renaissance
1194,1195,The Family of Philip V,http://commons.wikimedia.org/wiki/Special:File...,Jean Ranc,1723,oil paint,Museo del Prado,Museo del Prado,Unknown,44.00,65.0,Rococo painting


In [33]:
sample_df = df.head(15).reset_index(drop=True)

for idx, row in sample_df.iterrows():
    print(f"\n{idx+1}. {row['title']}")
    print(f"   Artist: {row['artist']}")
    print(f"   Year: {row['year']}")
    print(f"   Medium: {row['medium']}")
    print(f"   Location: {row['current_location']}")
    print(f"   Dimensions: {row['height']:.1f} cm × {row['width']:.1f} cm")
    print(f"   Movement: {row['movement']}")

    if row['image_url']:
        try:
            display(Image(url=row['image_url'], width=200))
        except:
            print(f"   Image URL: {row['image_url']}")
    print("-" * 50)


1. Manaò tupapaú
   Artist: Paul Gauguin
   Year: 1892
   Medium: oil paint
   Location: Buffalo AKG Art Museum
   Dimensions: 28.8 cm × 92.4 cm
   Movement: primitivism
   Image URL: http://commons.wikimedia.org/wiki/Special:FilePath/Paul%20Gauguin%20-%20Mana%C3%B2%20tupapa%C3%BA%20%28Spirit%20of%20the%20Dead%20Watching%29%20-%201965-1%20-%20Albright%E2%80%93Knox%20Art%20Gallery.tiff
--------------------------------------------------

2. Virgin of the Councillors
   Artist: Lluís Dalmau
   Year: 1445
   Medium: oak panel
   Location: Museu Nacional d'Art de Catalunya
   Dimensions: 311.0 cm × 311.5 cm
   Movement: Gothic painting
   Image URL: http://commons.wikimedia.org/wiki/Special:FilePath/Dalmau%20Mare%20de%20Deu%20dels%20Consellers.jpg
--------------------------------------------------

3. The Peasant Wedding
   Artist: Pieter Brueghel the Elder
   Year: 1568
   Medium: oil paint
   Location: Q820370
   Dimensions: 113.0 cm × 164.0 cm
   Movement: Northern Renaissance
   Image 

In [34]:
from google.colab import files


csv_filename = 'paintings_dataset.csv'
df.to_csv(csv_filename, index=False, encoding='utf-8')

files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [35]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

csv_filename_test = 'paintings_dataset_test.csv'
test_df.to_csv(csv_filename_test, index=False, encoding='utf-8')

files.download(csv_filename_test)


csv_filename_train = 'paintings_dataset_train.csv'
train_df.to_csv(csv_filename_train, index=False, encoding='utf-8')

files.download(csv_filename_train)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>