In [45]:
import requests
import pandas as pd
import time

API_TOKEN = "Token 47486a33ddb1d39bd6bfe84dd669baef"
url = "https://api.mindat.org/geomaterials"

headers = {
    "Authorization": API_TOKEN,
    "Accept": "application/json"
}

fields = (
    "id,name,ima_formula,colour,lustre,"
    "hmin,hmax,density_min,density_max,csystem,groupid,entrytype_text,shortcode_ima"
)

params = {
    "entrytype": 0,
    "ima": True,
    "fields": fields,
    "format": "json",
    "page-size": 100,
    "page": 1
}

all_results = []
existing_ids = set()

# === FETCH FROM PAGE 1 to PAGE 49 (or last successful)
while params["page"] < 50:
    print(f"Fetching page {params['page']}...")
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"❌ Error {response.status_code} on page {params['page']}")
            break

        data = response.json()
        results = data.get("results", [])
        if not results:
            print("✅ All data fetched.")
            break

        new_results = [r for r in results if r["id"] not in existing_ids]
        all_results.extend(new_results)
        existing_ids.update(r["id"] for r in new_results)

        params["page"] += 1
        time.sleep(1)

    except Exception as e:
        print(f"❌ Error on page {params['page']}: {e}")
        print(response.text[:500] if response else "No response content.")
        break

# === CONTINUE FROM PAGE 50
params["page"] = 50
while True:
    print(f"Fetching page {params['page']}...")
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"❌ Error {response.status_code} on page {params['page']}")
            break

        data = response.json()
        results = data.get("results", [])
        if not results:
            print("✅ All data fetched.")
            break

        new_results = [r for r in results if r["id"] not in existing_ids]
        all_results.extend(new_results)
        existing_ids.update(r["id"] for r in new_results)

        params["page"] += 1
        time.sleep(1)

    except Exception as e:
        print(f"❌ Error on page {params['page']}: {e}")
        print(response.text[:500] if response else "No response content.")
        break

print(f"\n✅ Finished. Total minerals in memory: {len(all_results)}")


Fetching page 1...
Fetching page 2...
Fetching page 3...
Fetching page 4...
Fetching page 5...
Fetching page 6...
Fetching page 7...
Fetching page 8...
Fetching page 9...
Fetching page 10...
Fetching page 11...
Fetching page 12...
Fetching page 13...
Fetching page 14...
Fetching page 15...
Fetching page 16...
Fetching page 17...
Fetching page 18...
Fetching page 19...
Fetching page 20...
Fetching page 21...
Fetching page 22...
Fetching page 23...
Fetching page 24...
Fetching page 25...
Fetching page 26...
Fetching page 27...
Fetching page 28...
Fetching page 29...
Fetching page 30...
❌ Error 500 on page 30
Fetching page 50...
Fetching page 51...
Fetching page 52...
Fetching page 53...
Fetching page 54...
Fetching page 55...
Fetching page 56...
Fetching page 57...
Fetching page 58...
Fetching page 59...
Fetching page 60...
Fetching page 61...
Fetching page 62...
Fetching page 63...
✅ All data fetched.

✅ Finished. Total minerals in memory: 4129


In [46]:
# === FILL GAP: Pages 30 to 49 ===
params["page"] = 30
while params["page"] < 50:
    print(f"Refetching page {params['page']}...")
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code != 200:
            print(f"❌ Error {response.status_code} on page {params['page']}")
            break

        data = response.json()
        results = data.get("results", [])
        if not results:
            print("✅ No more results.")
            break

        new_results = [r for r in results if r["id"] not in existing_ids]
        all_results.extend(new_results)
        existing_ids.update(r["id"] for r in new_results)

        params["page"] += 1
        time.sleep(1)

    except Exception as e:
        print(f"❌ Error on page {params['page']}: {e}")
        print(response.text[:500] if response else "No response content.")
        break

print(f"\n✅ Now total minerals in memory: {len(all_results)}")


Refetching page 30...
Refetching page 31...
Refetching page 32...
Refetching page 33...
Refetching page 34...
Refetching page 35...
Refetching page 36...
Refetching page 37...
Refetching page 38...
Refetching page 39...
Refetching page 40...
Refetching page 41...
Refetching page 42...
Refetching page 43...
Refetching page 44...
Refetching page 45...
Refetching page 46...
Refetching page 47...
Refetching page 48...
Refetching page 49...

✅ Now total minerals in memory: 6129


In [73]:
import pandas as pd
import re

# Create the DataFrame from all_results
df = pd.DataFrame(all_results)

# Add Mindat URL
df["url"] = "https://www.mindat.org/min-" + df["id"].astype(str) + ".html"

# Format chemical formulas: replace <sub> and <sup> with Unicode characters
def replace_html_tags(formula: str) -> str:
    sub_map = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
    sup_map = str.maketrans("0123456789+-=()", "⁰¹²³⁴⁵⁶⁷⁸⁹⁺⁻⁼⁽⁾")

    # Replace <sup>...</sup> blocks
    formula = re.sub(
        r"<sup>(.*?)</sup>",
        lambda m: m.group(1).translate(sup_map),
        formula
    )

    # Replace <sub>...</sub> blocks
    formula = re.sub(
        r"<sub>(.*?)</sub>",
        lambda m: m.group(1).translate(sub_map),
        formula
    )

    return formula

# Apply formatting to the ima_formula column
df["ima_formula"] = df["ima_formula"].astype(str).apply(replace_html_tags)

# fix unicode errors
df["ima_formula"] = df["ima_formula"].str.replace("&#9723;", "◻", regex=False)
df["ima_formula"] = df["ima_formula"].str.replace("&#8776;", "≈", regex=False)
df["ima_formula"] = df["ima_formula"].str.replace("&#60;", "<", regex=False)
df["ima_formula"] = df["ima_formula"].str.replace("&#8804;", "≤", regex=False)



# Display the cleaned DataFrame
display(df)


Unnamed: 0,id,name,ima_formula,groupid,entrytype_text,colour,hmin,hmax,lustre,csystem,shortcode_ima,url
0,1,Abelsonite,NiC₃₁H₃₂N₄,0,mineral,"Pink-purple, dark greyish purple, pale purplis...",2.0,3.0,Sub-metallic to adamantine,Triclinic,Abl,https://www.mindat.org/min-1.html
1,2,Abenakiite-(Ce),Na₂₆Ce₆(Si₆O₁₈)(PO₄)₆(CO₃)₆(SO₂)O,0,mineral,Pale brown,4.0,5.0,Vitreous,Trigonal,Abk-Ce,https://www.mindat.org/min-2.html
2,3,Abernathyite,K(UO₂)(AsO₄) &middot; 3H₂O,29311,mineral,yellow,2.5,3.0,Vitreous,Tetragonal,Abn,https://www.mindat.org/min-3.html
3,4,Abhurite,Sn²⁺₂₁O₆(OH)₁₄Cl₁₆,0,mineral,Colourless,2.0,2.0,,Trigonal,Abh,https://www.mindat.org/min-4.html
4,9,Abswurmbachite,Cu²⁺Mn³⁺₆O₈(SiO₄),43692,mineral,Black,6.5,6.5,,Tetragonal,Abs,https://www.mindat.org/min-9.html
...,...,...,...,...,...,...,...,...,...,...,...,...
6124,46049,Waimirite-(Y),YF₃,50244,mineral,"Pale pink, colorless",5.5,6.0,,Orthorhombic,Wai-Y,https://www.mindat.org/min-46049.html
6125,46052,Jahnsite-(CaFeMg),CaFe²⁺Mg₂Fe³⁺₂(PO₄)₄(OH)₂ &middot; 8H₂O,53039,mineral,Brownish orange,4.0,4.0,,Monoclinic,Jah-CaFeMg,https://www.mindat.org/min-46052.html
6126,46055,Torrecillasite,"Na(As,Sb)³⁺₄O₆Cl",471029,mineral,Colorless,2.5,2.5,,Orthorhombic,Trc,https://www.mindat.org/min-46055.html
6127,46057,Backite,Pb₂AlTeO₆Cl,0,mineral,Dark to pale blue-grey,2.0,3.0,,Trigonal,Bac,https://www.mindat.org/min-46057.html


In [75]:
# Save to CSV
df.to_csv("mindat_minerals.csv", index=False)