TÂCHE 1

In [51]:
!pip -q install sparqlwrapper

In [52]:
import sys
import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# URL of the SPARQL endpoint the query below is sent to.
endpoint_url = "https://query.wikidata.org/sparql"

# Topic: mountains (Q8502).
# The query returns up to 300 distinct (item label, country label, image) rows:
#   - the item must be linked to Q8502 through P31 followed by any number of P279 hops,
#   - the country (P17) is optional, so ?paysLabel may be missing from a row,
#   - the image (P18) is mandatory and its URL must contain "commons.wikimedia.org",
#   - labels are requested in French through the label service.
# There is no ORDER BY, so which 300 rows come back is up to the endpoint.
query = """SELECT DISTINCT ?itemLabel ?paysLabel ?image WHERE {
  ?item wdt:P31/wdt:P279* wd:Q8502 .
  OPTIONAL { ?item wdt:P17 ?pays . }
  ?item wdt:P18 ?image .
  
  FILTER(CONTAINS(STR(?image), "commons.wikimedia.org"))

  SERVICE wikibase:label { bd:serviceParam wikibase:language "fr". }
}
LIMIT 300
"""

def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint and return the converted JSON result.

    The request carries a user agent that embeds the major and minor version
    of the running Python interpreter.
    """
    # Slicing version_info gives the (major, minor) pair consumed by the format string.
    agent = "WDQS-example Python/%s.%s" % sys.version_info[:2]

    client = SPARQLWrapper(endpoint_url, agent=agent)
    client.setQuery(query)
    client.setReturnFormat(JSON)

    response = client.query()
    return response.convert()

# Run the query, then flatten every binding into a (label, country, image URL)
# tuple; the country is None when the binding has no "paysLabel" entry.
results = get_results(endpoint_url, query)
array = [
    (
        binding["itemLabel"]["value"],
        binding.get("paysLabel", {}).get("value"),
        binding["image"]["value"],
    )
    for binding in results["results"]["bindings"]
]

# Keep a single row per image URL and renumber the index from zero.
dataframe = (
    pd.DataFrame(array, columns=["label", "pays", "image"])
    .drop_duplicates(subset=["image"])
    .reset_index(drop=True)
)

dataframe.head(10), len(dataframe)

(                label        pays  \
 0           Antofalla   Argentine   
 1              Aracar   Argentine   
 2       volcan Domuyo   Argentine   
 3          Antillanca       Chili   
 4          Acamarachi       Chili   
 5         mont Mageik  États-Unis   
 6                Solo       Chili   
 7             Copahue       Chili   
 8  volcan Antofagasta   Argentine   
 9    Volcán Momotombo   Nicaragua   
 
                                                image  
 0  http://commons.wikimedia.org/wiki/Special:File...  
 1  http://commons.wikimedia.org/wiki/Special:File...  
 2  http://commons.wikimedia.org/wiki/Special:File...  
 3  http://commons.wikimedia.org/wiki/Special:File...  
 4  http://commons.wikimedia.org/wiki/Special:File...  
 5  http://commons.wikimedia.org/wiki/Special:File...  
 6  http://commons.wikimedia.org/wiki/Special:File...  
 7  http://commons.wikimedia.org/wiki/Special:File...  
 8  http://commons.wikimedia.org/wiki/Special:File...  
 9  http://commons.w

In [53]:
import os

# Make sure both output folders exist; folders already present are left as they are.
for folder in ("images", "data"):
    os.makedirs(folder, exist_ok=True)

In [54]:
import requests
import shutil
from urllib.parse import urlparse
import time

def download_image(url, dest_dir="images", timeout=30, delay=3):
    """Download the file at ``url`` into ``dest_dir`` and return the HTTP status code.

    The local file name is the last path segment of ``url``, kept exactly as it
    appears in the URL (percent-escapes are not decoded). The file is written
    only when the server answers 200.

    Args:
        url: Address of the image to fetch; redirects are followed.
        dest_dir: Folder that receives the file; created when missing.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled server cannot block a whole batch of downloads forever.
        delay: Seconds to sleep after the request, to space out successive calls.

    Returns:
        The HTTP status code of the response.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    headers = {"User-Agent": "Mozilla/5.0"}

    # With stream=True the connection stays open until the response is closed;
    # the context manager guarantees it is released even if writing fails.
    with requests.get(
        url,
        allow_redirects=True,
        headers=headers,
        stream=True,
        timeout=timeout,
    ) as r:
        status = r.status_code

        if status == 200:
            filename = os.path.basename(urlparse(url).path)
            os.makedirs(dest_dir, exist_ok=True)
            path = os.path.join(dest_dir, filename)

            with open(path, "wb") as img:
                # Let urllib3 undo any gzip/deflate transfer encoding while copying.
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, img)

    # Pause after every request, successful or not.
    time.sleep(delay)
    return status

In [55]:
# Download of the first 100 images (disabled: the lines below are commented out; uncomment them to run the download again)
#dataframe = dataframe.head(100)
#dataframe["status"] = dataframe["image"].apply(download_image)

#dataframe["status"].value_counts()

In [56]:
# Number of entries currently stored in the download folder.
downloaded_entries = os.listdir("images")
len(downloaded_entries)

100

In [None]:
import os, json, time
from urllib.parse import unquote
from PIL import Image, ExifTags

IMG_DIR = "images"
OUT_JSON = "data/images_metadata.json"
os.makedirs("data", exist_ok=True)

# Regular files of the image folder, sorted so the JSON output has a stable order.
files = sorted(
    f for f in os.listdir(IMG_DIR)
    if os.path.isfile(os.path.join(IMG_DIR, f))
)

metadata = []  # one dict per image that could be read
skipped = []   # (file name, error) for every file that could not be processed

for local_fn in files:
    local_path = os.path.join(IMG_DIR, local_fn)

    try:
        # Only the dimensions and the format are needed; the context manager
        # closes the file as soon as they have been read.
        with Image.open(local_path) as img:
            width, height = img.size
            fmt = img.format

        size_kb = os.path.getsize(local_path) / 1024
    except Exception as exc:
        # Best effort: one unreadable file must not abort the whole run, but it
        # is recorded instead of being dropped silently.
        skipped.append((local_fn, repr(exc)))
        continue

    metadata.append({
        "file_name": local_fn,
        "width": width,
        "height": height,
        "format": fmt,
        "file_size_kb": round(size_kb, 2),
    })

with open(OUT_JSON, "w", encoding="utf-8") as f:
    json.dump(metadata, f, indent=2, ensure_ascii=False)

# Report every file that was left out of the JSON, with the reason.
for skipped_fn, reason in skipped:
    print(f"Image ignorée: {skipped_fn} ({reason})")

len(metadata)

Entrées JSON: 100
Images ignorées: 0
Exemple: {'file_name': '%D0%92%D0%B5%D0%BB%D0%B8%D0%B2%D0%B0%D1%80%20%28%D0%94%D0%B5%D1%88%D0%B0%D1%82%29%2004.jpg', 'width': 1280, 'height': 960, 'format': 'JPEG', 'file_size_kb': 592.88, 'source_url': None, 'label': None, 'pays': None, 'license': {}, 'exif': {'Make': 'SONY', 'Model': 'DSC-P120', 'DateTime': '2017:04:14 12:43:19'}}


TÂCHE 2