TÂCHE 1

In [51]:
!pip -q install sparqlwrapper

In [52]:
import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# SPARQL endpoint that get_results() sends the query to.
endpoint_url = "https://query.wikidata.org/sparql"

# Topic: mountains (Q8502).
# The query keeps items linked to wd:Q8502 through wdt:P31 followed by any
# number of wdt:P279 hops (instance-of / subclass-of in Wikidata -- verify),
# requires an image (wdt:P18) whose URL contains "commons.wikimedia.org",
# and optionally binds ?pays through wdt:P17.
# Labels are requested in French only, and at most 300 rows are returned.
query = """SELECT DISTINCT ?itemLabel ?paysLabel ?image WHERE {
  ?item wdt:P31/wdt:P279* wd:Q8502 .
  OPTIONAL { ?item wdt:P17 ?pays . }
  ?item wdt:P18 ?image .
  
  FILTER(CONTAINS(STR(?image), "commons.wikimedia.org"))

  SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
}
LIMIT 300
"""

def get_results(endpoint_url, query):
    """Send a SPARQL query to an endpoint and return the converted JSON result.

    Args:
        endpoint_url: URL of the SPARQL endpoint.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper.query().convert()`` yields for the JSON
        return format.
    """
    # Identify the client with the running Python major.minor version.
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/%s.%s" % (major, minor),
    )
    client.setQuery(query)
    client.setReturnFormat(JSON)
    response = client.query()
    return response.convert()

# Run the query and flatten each binding into a (label, country, image URL) tuple.
# ?pays is OPTIONAL in the query, so its label may be missing -> None.
results = get_results(endpoint_url, query)
array = [
    (
        binding["itemLabel"]["value"],
        binding.get("paysLabel", {}).get("value"),
        binding["image"]["value"],
    )
    for binding in results["results"]["bindings"]
]

# Keep one row per distinct image URL and renumber the index from 0.
dataframe = (
    pd.DataFrame(array, columns=["label", "pays", "image"])
    .drop_duplicates(subset=["image"])
    .reset_index(drop=True)
)

dataframe.head(10), len(dataframe)

(                label        pays  \
 0           Antofalla   Argentine   
 1              Aracar   Argentine   
 2       volcan Domuyo   Argentine   
 3          Antillanca       Chili   
 4          Acamarachi       Chili   
 5         mont Mageik  États-Unis   
 6                Solo       Chili   
 7             Copahue       Chili   
 8  volcan Antofagasta   Argentine   
 9    Volcán Momotombo   Nicaragua   
 
                                                image  
 0  http://commons.wikimedia.org/wiki/Special:File...  
 1  http://commons.wikimedia.org/wiki/Special:File...  
 2  http://commons.wikimedia.org/wiki/Special:File...  
 3  http://commons.wikimedia.org/wiki/Special:File...  
 4  http://commons.wikimedia.org/wiki/Special:File...  
 5  http://commons.wikimedia.org/wiki/Special:File...  
 6  http://commons.wikimedia.org/wiki/Special:File...  
 7  http://commons.wikimedia.org/wiki/Special:File...  
 8  http://commons.wikimedia.org/wiki/Special:File...  
 9  http://commons.w

In [53]:
import os

# Create the output folders; exist_ok=True makes the cell safe to re-run.
for folder_name in ("images", "data"):
    os.makedirs(folder_name, exist_ok=True)

In [54]:
import requests
import shutil
from urllib.parse import urlparse
import time

def download_image(url, dest_dir="images", timeout=30, delay=3):
    """Download one image and store it in ``dest_dir``.

    The file is written only when the server answers with HTTP 200. The
    function pauses for ``delay`` seconds before returning, whatever the
    outcome, so that repeated calls stay polite towards the server.

    Args:
        url: Address of the image; redirects are followed.
        dest_dir: Existing directory that receives the file.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot block the notebook forever.
        delay: Seconds to sleep after the request (0 disables the pause).

    Returns:
        The HTTP status code of the final response.

    Raises:
        requests.exceptions.RequestException: on network errors or timeout.
        OSError: if the destination file cannot be written.
    """
    # Local import so this function does not depend on another cell's imports.
    from urllib.parse import unquote

    headers = {"User-Agent": "Mozilla/5.0"}

    # The URL path is percent-encoded ("Mont%20Blanc.jpg"); decode it so the
    # file on disk carries the real name. basename() is applied after decoding
    # so that an encoded "/" cannot point outside dest_dir.
    filename = os.path.basename(unquote(urlparse(url).path))

    # The context manager releases the streamed connection even if writing fails.
    with requests.get(
        url,
        allow_redirects=True,
        headers=headers,
        stream=True,
        timeout=timeout,
    ) as r:
        status = r.status_code
        if status == 200:
            path = os.path.join(dest_dir, filename)
            with open(path, "wb") as img:
                # Let urllib3 undo gzip/deflate transfer encoding while copying.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, img)

    if delay > 0:
        time.sleep(delay)
    return status

In [55]:
# Download of the first 100 images.
# NOTE(review): this step is left disabled -- presumably because the files are
# already present in images/ from an earlier run; uncomment to download again.
#dataframe = dataframe.head(100)
#dataframe["status"] = dataframe["image"].apply(download_image)

#dataframe["status"].value_counts()

In [56]:
# Sanity check: number of entries currently stored in the images/ folder.
image_count = len(os.listdir("images"))
image_count

100

In [60]:
import os, json, time
from urllib.parse import quote
import requests
from PIL import Image, ExifTags

# Input folder, output file, and the folder that will hold the output file.
IMG_DIR = "images"
OUT_JSON = "data/images_metadata.json"
os.makedirs("data", exist_ok=True)

# 1) File list: regular files of IMG_DIR only (sub-directories are ignored),
#    sorted by name so the processing order is stable.
files = sorted(
    entry
    for entry in os.listdir(IMG_DIR)
    if os.path.isfile(os.path.join(IMG_DIR, entry))
)

# 2) Commons API used to look up the licence
COMMONS_API = "https://commons.wikimedia.org/w/api.php"

def get_license_info(file_name: str) -> dict:
    """Fetch the licence of a file through the Commons API (extmetadata).

    Args:
        file_name: File name without the "File:" prefix.

    Returns:
        A dict with the keys "license_short", "license_url" and "usage_terms"
        (each value may be None), or an empty dict when the response carries
        no imageinfo for the file.

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """
    response = requests.get(
        COMMONS_API,
        params={
            "action": "query",
            "format": "json",
            "titles": f"File:{file_name}",
            "prop": "imageinfo",
            "iiprop": "extmetadata",
        },
        headers={"User-Agent": "ImageRecoStudentProject/1.0"},
        timeout=30,
    )
    response.raise_for_status()
    payload = response.json()

    # Only one title is requested, so the first page entry is the one we want.
    first_page = {}
    for first_page in payload.get("query", {}).get("pages", {}).values():
        break

    image_infos = first_page.get("imageinfo", [])
    if not image_infos:
        return {}

    # "or {}" also covers an explicit null/empty extmetadata value.
    ext = image_infos[0].get("extmetadata", {}) or {}
    wanted = (
        ("license_short", "LicenseShortName"),
        ("license_url", "LicenseUrl"),
        ("usage_terms", "UsageTerms"),
    )
    return {out_key: ext.get(api_key, {}).get("value") for out_key, api_key in wanted}

# 3) EXIF (when available)
KEEP_EXIF = {"Model", "Make", "DateTimeOriginal", "DateTime", "LensModel"}

def extract_exif_basic(img: "Image.Image") -> dict:
    """Return the EXIF tags listed in KEEP_EXIF for an opened image.

    Iterating ``img.getexif()`` only yields the tags of the primary image
    directory. Tags such as DateTimeOriginal and LensModel are stored in the
    Exif sub-directory, so that directory is read as well.

    Args:
        img: Opened PIL image.

    Returns:
        A dict mapping tag name to value, restricted to KEEP_EXIF and without
        raw byte values. Empty when the image has no EXIF data or the data
        cannot be read (best effort, never raises).
    """
    # Tag id of the pointer to the Exif sub-directory (0x8769).
    exif_ifd_pointer = 0x8769

    def _collect(entries, out):
        # Copy the wanted, non-binary tags; an already collected tag wins.
        for tag_id, value in entries.items():
            tag = ExifTags.TAGS.get(tag_id, str(tag_id))
            if tag in KEEP_EXIF and not isinstance(value, (bytes, bytearray)):
                out.setdefault(tag, value)

    try:
        exif = img.getexif()
        if not exif:
            return {}
        out = {}
        _collect(exif, out)
    except Exception:
        # Deliberate best effort: unreadable EXIF must not abort the caller.
        return {}

    try:
        _collect(exif.get_ifd(exif_ifd_pointer), out)
    except Exception:
        # Sub-directory missing, corrupt, or not supported by this Pillow
        # version: keep what was read from the primary directory.
        pass

    return out

# Needed below to undo percent-encoding in stored file names.
from urllib.parse import unquote

# 4) Build the JSON: one record per image, plus a list of files that failed.
metadata = []
skipped = []

for file_name in files:
    path = os.path.join(IMG_DIR, file_name)

    try:
        # Local properties, read while the file is open.
        with Image.open(path) as img:
            width, height = img.size
            fmt = img.format
            exif = extract_exif_basic(img)

        size_kb = os.path.getsize(path) / 1024

        # Names on disk may still be percent-encoded (they were taken from the
        # URL path at download time). Decode once, otherwise quote() below
        # would double-encode and the "File:" title lookup would not match.
        # For a name without escapes this is a no-op.
        commons_name = unquote(file_name)

        # Source URL (rebuilt from the file name)
        source_url = "https://commons.wikimedia.org/wiki/Special:FilePath/" + quote(commons_name)

        # Licence (API call)
        license_info = get_license_info(commons_name)

        metadata.append({
            "file_name": file_name,
            "width": width,
            "height": height,
            "format": fmt,
            "file_size_kb": round(size_kb, 2),

            "source_url": source_url,
            "license": license_info,
            "exif": exif
        })

        time.sleep(0.1)  # API courtesy pause (increase it if HTTP 429 shows up)

    except Exception as e:
        # Keep going, but remember which file failed and why.
        skipped.append((file_name, str(e)))

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

print("Entrées JSON:", len(metadata))
print("Ignorées:", len(skipped))

Entrées JSON: 100
Ignorées: 0


TÂCHE 2