In [None]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display, Image, HTML
from PIL import Image
import io
import os
import base64
from urllib.parse import urlparse
import warnings
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')

In [None]:
class WikidataPaintingScraper:
    """Fetch painting records from the Wikidata SPARQL endpoint.

    The query selects items that are an instance of "painting" (Q3305213)
    and that have an image, creator, inception date, material, collection,
    location, height and width; the creation location is optional.
    """

    # Wikimedia's User-Agent policy asks clients to identify themselves
    # rather than impersonate a browser. Append a contact URL or e-mail
    # address here before running large or repeated queries.
    USER_AGENT = "WikidataPaintingScraper/1.0 (Jupyter notebook) python-requests"

    def __init__(self):
        self.endpoint_url = "https://query.wikidata.org/sparql"

    def _build_query(self, limit):
        """Return the SPARQL query text selecting at most ``limit`` rows."""
        return f"""
        SELECT DISTINCT
        ?painting ?paintingLabel ?image ?artist ?artistLabel
        ?inception ?materialLabel ?collection ?collectionLabel
        ?location ?locationLabel ?creationLocation ?creationLocationLabel
        ?height ?width
        WHERE {{
          ?painting wdt:P31 wd:Q3305213;    # painting
                   wdt:P18 ?image;          # image
                   wdt:P170 ?artist;        # creator
                   wdt:P571 ?inception;     # date
                   wdt:P186 ?material;      # material
                   wdt:P195 ?collection;    # collection
                   wdt:P276 ?location;      # location
                   wdt:P2048 ?height;       # height
                   wdt:P2049 ?width.        # width

          OPTIONAL {{ ?painting wdt:P1071 ?creationLocation. }}

          SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
        }}
        LIMIT {limit}
        """

    def run_query(self, limit=1000):
        """Run the painting query and return the decoded JSON response.

        Args:
            limit: Maximum number of rows to request. Must be convertible to
                a non-negative integer, because it is spliced into the query
                text.

        Returns:
            The parsed JSON response, or ``None`` when ``limit`` is invalid,
            the request fails, the server answers with an HTTP error status,
            or the body is not valid JSON. A short message is printed in
            each failure case.
        """
        # Validate before touching the network: the value ends up inside the
        # SPARQL text, so only a plain non-negative integer is acceptable.
        try:
            limit = int(limit)
        except (TypeError, ValueError):
            print(f"Error: limit must be an integer, got {limit!r}")
            return None
        if limit < 0:
            print(f"Error: limit must be a non-negative integer, got {limit!r}")
            return None

        query = self._build_query(limit)

        try:
            response = requests.get(
                self.endpoint_url,
                params={'query': query, 'format': 'json'},
                headers={'User-Agent': self.USER_AGENT},
                timeout=60
            )
            # Turn 4xx/5xx answers (rate limiting, query timeouts) into an
            # explicit error instead of a confusing JSON decode failure.
            response.raise_for_status()
            return response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Error: {e}")
            return None

In [None]:
# Create the scraper and request up to 5000 rows. `results` holds the parsed
# JSON response, or None when the request failed.
scraper = WikidataPaintingScraper()
results = scraper.run_query(5000)

In [None]:
# Columns of the paintings table, in output order. Declared up front so the
# failure path still yields a frame with the schema the cleaning cell expects.
PAINTING_COLUMNS = [
    'title', 'image_url', 'artist', 'year', 'medium', 'collection',
    'current_location', 'creation_location', 'height', 'width',
]


def _binding_value(item, key, default=''):
    """Return the ``value`` field of binding ``key`` in ``item``, or ``default``."""
    return item.get(key, {}).get('value', default)


def _binding_float(item, key):
    """Return binding ``key`` as a float; 0.0 when it is missing or not numeric."""
    try:
        return float(_binding_value(item, key, 0))
    except (TypeError, ValueError):
        return 0.0


# Rows are expected under results -> bindings. Anything else (None from a
# failed request, or a payload without that structure) counts as a failed fetch
# instead of raising KeyError.
if isinstance(results, dict):
    bindings = results.get('results', {}).get('bindings')
else:
    bindings = None

if bindings is not None:
    paintings_data = [
        {
            'title': _binding_value(item, 'paintingLabel'),
            'image_url': _binding_value(item, 'image'),
            'artist': _binding_value(item, 'artistLabel'),
            # Only the first four characters of the inception value are kept as the year.
            'year': _binding_value(item, 'inception')[:4],
            'medium': _binding_value(item, 'materialLabel'),
            'collection': _binding_value(item, 'collectionLabel'),
            'current_location': _binding_value(item, 'locationLabel'),
            'creation_location': _binding_value(item, 'creationLocationLabel'),
            'height': _binding_float(item, 'height'),
            'width': _binding_float(item, 'width'),
        }
        for item in bindings
    ]

    df = pd.DataFrame(paintings_data, columns=PAINTING_COLUMNS)
    print(f"Successfully collected {len(df)} paintings!")
else:
    print("Failed to fetch data")
    # Keep the expected columns so later cells that select them by name
    # operate on an empty table instead of raising KeyError.
    df = pd.DataFrame(columns=PAINTING_COLUMNS)

Successfully collected 5000 paintings!


In [None]:
import re

# Remember the starting row count so the final report can state how many
# rows the cleaning removed.
initial_count = len(df)

# Collapse rows that share both title and artist.
df = df.drop_duplicates(subset=['title', 'artist'])

# A usable record needs a non-empty title and a non-empty image URL.
df = df[df['title'].str.len().gt(0) & df['image_url'].str.len().gt(0)]

# Wikidata reference fragments stripped from artist labels, applied in order.
# The prefixed forms must run before the bare-QID pattern: otherwise
# "wd:Q123" is first reduced to a dangling "wd:" that nothing later removes.
_ARTIST_NOISE_PATTERNS = (
    r'wd:Q\d+',             # wd:Q12345
    r'entity/Q\d+',         # entity/Q12345
    r'www\.wikidata\.org',  # wikidata host name (dots escaped to match literally)
    r'\(Q\d+\)',            # (Q12345)
    r'\bQ\d+\b',            # bare Q12345
    r'\(\s*\)',             # empty parentheses left behind by the removals above
)


def clean_artist_name(artist):
    """Strip Wikidata identifier noise from an artist label.

    Args:
        artist: Raw artist label; it is converted with ``str()`` before cleaning.

    Returns:
        The cleaned label with HTML tags and QID fragments removed and
        whitespace collapsed, or ``None`` when the input is empty/NaN, is
        only a QID, contains a URL, or nothing meaningful (at least two
        characters, no "wikidata" text) is left after cleaning.
    """
    # NaN is truthy, so it needs its own check; without it the label would
    # come back as the literal string "nan".
    if not artist or (isinstance(artist, float) and artist != artist):
        return None

    artist_str = str(artist)

    # The whole label is just a QID.
    if re.match(r'^Q\d+$', artist_str.strip()):
        return None

    # URLs and entity paths are rejected outright rather than repaired.
    if 'http' in artist_str or '/entity/Q' in artist_str:
        return None

    # Remove any HTML tags, then the Wikidata fragments.
    artist_str = re.sub(r'<[^>]+>', '', artist_str)
    for pattern in _ARTIST_NOISE_PATTERNS:
        artist_str = re.sub(pattern, '', artist_str)

    # Collapse runs of whitespace left behind by the removals.
    artist_str = re.sub(r'\s+', ' ', artist_str).strip()

    # Empty, one-character, ID-like or wikidata-flavoured leftovers are not names.
    if (len(artist_str) < 2 or
        re.match(r'^[QW]\d+$', artist_str) or
        'wikidata' in artist_str.lower()):
        return None

    return artist_str

# Normalise every artist label; the cleaner yields None for labels it rejects.
df['artist'] = df['artist'].apply(clean_artist_name)

# Keep only the rows whose cleaned artist is present and non-empty.
df = df.loc[df['artist'].notna() & df['artist'].str.len().gt(0)]

# Flag artist labels that are empty or still look like Wikidata identifiers
def has_qid_pattern(artist_name):
    """Tell whether an artist label is empty or still carries a QID marker.

    Returns True for a falsy label and for any label whose string form
    contains "Q" followed by digits, "wd:Q", or "entity/Q"; False otherwise.
    """
    if not artist_name:
        return True

    text = str(artist_name)
    for marker in (r'Q\d+', r'wd:Q', r'entity/Q'):
        if re.search(marker, text):
            return True
    return False

# Drop rows whose artist label still carries a QID marker, and record how
# many rows this filter removed.
before_qid_filter = len(df)
df = df.loc[~df['artist'].apply(has_qid_pattern)]
qid_removed = before_qid_filter - len(df)

# Parse the year text as a number (unparseable values become NaN), keep only
# rows with a year between 1200 and 2024 inclusive, then store it as an integer.
df['year'] = pd.to_numeric(df['year'], errors='coerce')
df = df[df['year'].notna() & df['year'].between(1200, 2024)]
df['year'] = df['year'].astype(int)

# Missing or empty descriptive fields are labelled 'Unknown'.
for text_column in ('creation_location', 'current_location', 'collection', 'medium'):
    df[text_column] = df[text_column].fillna('Unknown').replace('', 'Unknown')

# Both dimensions must be strictly positive.
df = df[df['height'].gt(0) & df['width'].gt(0)]

# Drop any row still missing one of the fields a record cannot do without.
critical_columns = ['title', 'artist', 'year', 'image_url']
df = df.dropna(subset=critical_columns)

# Renumber the surviving rows with consecutive 1-based ids.
df = df.reset_index(drop=True)
df['id'] = range(1, len(df) + 1)

# Put the id column first and leave the others in their existing order.
column_order = ['id', *df.columns.drop('id')]
df = df[column_order]

final_count = len(df)
total_removed = initial_count - final_count

print(f"Cleaning complete: {initial_count} → {final_count} paintings ({total_removed} removed)")


# Collect artist labels that still look like Wikidata identifiers
def check_for_remaining_qids(df):
    """Return the artist entries that still contain a QID marker.

    Every value of ``df['artist']`` is converted to a string and searched for
    "Q" followed by digits, "wd:Q", or "entity/Q". Matching entries are
    returned in their original order, once per row.
    """
    qid_marker = re.compile(r'Q\d+|wd:Q|entity/Q')
    return [artist for artist in df['artist'] if qid_marker.search(str(artist))]

# Report the outcome of the QID check, listing at most five offending labels.
remaining_qids = check_for_remaining_qids(df)
if not remaining_qids:
    print(f"\n Success: No QID patterns found in artist names!")
else:
    print(f"\n Warning: Found {len(remaining_qids)} artists with remaining QID patterns:")
    for artist in remaining_qids[:5]:
        print(f"   - {artist}")

In [None]:
# Print the table's shape under a heading, then render the frame itself.
print("Dataset Information:", f"Shape: {df.shape}", sep="\n")
display(df)

Dataset Information:
Shape: (1323, 11)


Unnamed: 0,id,title,image_url,artist,year,medium,collection,current_location,creation_location,height,width
0,1,The Vale of Rest,http://commons.wikimedia.org/wiki/Special:File...,John Everett Millais,1858,oil paint,National Gallery,Tate Britain,Unknown,102.9,172.7
1,2,The Crown of Thorns,http://commons.wikimedia.org/wiki/Special:File...,Anthony van Dyck,1618,oil paint,Museo del Prado,Museo del Prado,Unknown,225.0,197.0
2,3,Assumption of the Virgin,http://commons.wikimedia.org/wiki/Special:File...,Andrea del Castagno,1449,poplar panel,Solly Collection,"paintings of the Italian Renaissance, hall XVIII",Italy,131.0,150.7
3,4,Garden at Sainte-Adresse,http://commons.wikimedia.org/wiki/Special:File...,Claude Monet,1867,canvas,Metropolitan Museum of Art,Metropolitan Museum of Art,Unknown,98.1,129.9
4,5,Nocturne: Blue and Silver – Chelsea,http://commons.wikimedia.org/wiki/Special:File...,James McNeill Whistler,1871,panel,Tate,Tate Britain,London,50.2,60.8
...,...,...,...,...,...,...,...,...,...,...,...
1318,1319,Baptism of Christ,http://commons.wikimedia.org/wiki/Special:File...,Titian,1511,panel,Capitoline Museums,Capitoline Museums,Unknown,115.0,89.0
1319,1320,Baptism of Christ,http://commons.wikimedia.org/wiki/Special:File...,Parmigianino,1519,paint,Gemäldegalerie Berlin,Gemäldegalerie Berlin,Unknown,197.0,137.0
1320,1321,Saint Paul,http://commons.wikimedia.org/wiki/Special:File...,Masaccio,1426,tempera,National Museum of San Matteo,National Museum of San Matteo,Unknown,51.0,30.0
1321,1322,An Elegant Party in the Open,http://commons.wikimedia.org/wiki/Special:File...,Willem Pieterszoon Buytewech,1617,oil paint,Mauritshuis,Mauritshuis,Unknown,71.0,94.0


In [None]:
# The notebook's import cell binds `Image` twice (IPython.display, then PIL),
# so the plain name refers to the PIL module and cannot be called. Import the
# IPython display class under its own alias for the preview below.
from IPython.display import Image as IPyImage

# Preview the first 15 paintings: a short text summary followed by the image.
sample_df = df.head(15).reset_index(drop=True)

for idx, row in sample_df.iterrows():
    print(f"\n{idx+1}. {row['title']}")
    print(f"   Artist: {row['artist']}")
    print(f"   Year: {row['year']}")
    print(f"   Medium: {row['medium']}")
    print(f"   Location: {row['current_location']}")
    print(f"   Dimensions: {row['height']:.1f} cm × {row['width']:.1f} cm")

    if row['image_url']:
        try:
            display(IPyImage(url=row['image_url'], width=200))
        except Exception:
            # Best-effort preview: fall back to printing the URL, without
            # swallowing KeyboardInterrupt/SystemExit as a bare except would.
            print(f"   Image URL: {row['image_url']}")
    print("-" * 50)


1. The Vale of Rest
   Artist: John Everett Millais
   Year: 1858
   Medium: oil paint
   Location: Tate Britain
   Dimensions: 102.9 cm × 172.7 cm
   Image URL: http://commons.wikimedia.org/wiki/Special:FilePath/Millais%20-%20Das%20Tal%20der%20Stille.jpg
--------------------------------------------------

2. The Crown of Thorns
   Artist: Anthony van Dyck
   Year: 1618
   Medium: oil paint
   Location: Museo del Prado
   Dimensions: 225.0 cm × 197.0 cm
   Image URL: http://commons.wikimedia.org/wiki/Special:FilePath/Anthonis%20van%20Dyck%20004.jpg
--------------------------------------------------

3. Assumption of the Virgin
   Artist: Andrea del Castagno
   Year: 1449
   Medium: poplar panel
   Location: paintings of the Italian Renaissance, hall XVIII
   Dimensions: 131.0 cm × 150.7 cm
   Image URL: http://commons.wikimedia.org/wiki/Special:FilePath/Del%20Castagno%20Andrea%20Our%20Lady%20of%20the%20Assumption%20with%20Sts%20Miniato%20and%20Julian.jpg
------------------------------

In [None]:
from google.colab import files


# Write the cleaned table as UTF-8 CSV without the index column, then hand the
# file to Colab's download helper.
csv_filename = 'paintings_dataset.csv'
df.to_csv(path_or_buf=csv_filename, encoding='utf-8', index=False)
files.download(csv_filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

csv_filename_test = 'paintings_dataset_test.csv'
csv_filename_train = 'paintings_dataset_train.csv'

# Write and download each split in turn: test first, then train.
for split_df, split_path in ((test_df, csv_filename_test), (train_df, csv_filename_train)):
    split_df.to_csv(split_path, index=False, encoding='utf-8')
    files.download(split_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>