**Imports and Setup**

In [9]:
import os

import numpy as np
import pandas as pd
import requests
import dotenv
import json

import duckdb

# setup
# Read the Mindat API token stored under the "api_min" entry of the local .env file.
API_KEY = dotenv.get_key(".env", "api_min")
if not API_KEY:
  # get_key gives back None when the file or the entry is missing; stop here with a
  # readable message instead of a TypeError from the string concatenation below.
  raise RuntimeError('API key "api_min" not found in .env')

# API root entry point
MINDAT_API_URL = "https://api.mindat.org"

# authorization header that must be included with each request.
headers = {'Authorization': 'Token ' + API_KEY}

# directories (resolved against the current working directory)
DATA_DIR = os.path.join(os.path.abspath('./'), 'data')
QUERIES_DIR = os.path.join(os.path.abspath('./'), 'queries')

# fixing pandas analyzer: set DuckDB's global pandas_analyze_sample option to 100_000
duckdb.execute("SET GLOBAL pandas_analyze_sample = 100_000")

<duckdb.duckdb.DuckDBPyConnection at 0x22fc7a6dab0>

**Making a simple request**

In [None]:

# making a request
# Fetch the first page of /geomaterials/ and preview a few columns of interest.

end_point = "/geomaterials/"
filter_dict = {
  'page':1,
  'page_size':1000
}

# not used by this cell; kept so the name is still (re)initialised here as before
df_all = pd.DataFrame()
interest_cols = ['id','guid','name','colour','entrytype','mindat_formula','occurrence']

endpoint = MINDAT_API_URL+end_point
print("endpoint", endpoint)
# a timeout keeps a stalled connection from hanging the kernel indefinitely
response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=5)
# surface HTTP errors (bad token, rate limit, ...) as such, rather than as a
# KeyError on 'results' further down
response.raise_for_status()

mineral_list = response.json()['results']
df_page = pd.DataFrame(mineral_list)

# first ten rows, restricted to the columns of interest
df_page[interest_cols].head(10)

**EXTRACT: Making page requests**

In [3]:
# making a request
# Download pages 1 and 2 of /geomaterials/ (1000 records per page) and stack them
# into a single DataFrame, df_all. A page that fails is reported and skipped.

endpoint = MINDAT_API_URL + "/geomaterials/"
# interest_cols = ['id','guid','name','colour','entrytype','mindat_formula','occurrence','crystal_system']

# collect one frame per page and concatenate once at the end; concatenating inside
# the loop re-copies every previously fetched row on each iteration
page_frames = []
for page in range(1,3):
  filter_dict = {
    'page': page,
    'page_size':1000
  }

  try:
    # the HTTP call lives inside the try so that a timeout or connection error on
    # one page is reported like any other page failure instead of aborting the loop
    response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=5)
    response.raise_for_status()

    mineral_list = response.json()['results']
    print('page', page, response)

    # data frame handling
    page_frames.append(pd.DataFrame(mineral_list))

  except (requests.RequestException, KeyError, TypeError, ValueError) as e:
    # include the exception so the cause of the failure is visible in the output
    print('Erro ao tentar criar dataframe!', 'pagina', page, repr(e))

# ignore_index=True yields a fresh 0..n-1 index; fall back to an empty frame when
# every page failed (pd.concat raises on an empty list)
df_all = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
df_all

page 1 <Response [200]>
page 2 <Response [200]>


Unnamed: 0,id,longid,guid,name,updttime,mindat_formula,mindat_formula_note,ima_formula,ima_status,ima_notes,...,rock_parent,rock_parent2,rock_root,rock_bgs_code,meteoritical_code,key_elements,shortcode_ima,rimin,rimax,weighting
0,1,1:1:1:5,464e5cfa-be77-4568-a724-62137f35df18,Abelsonite,2022-08-03 21:27:09,Ni(C<sub>31</sub>H<sub>32</sub>N<sub>4</sub>),,NiC<sub>31</sub>H<sub>32</sub>N<sub>4</sub>,[APPROVED],[],...,0,0,0,,,"[N, Ni]",Abl,,,13
1,2,1:1:2:4,fb40f3e1-f058-46fe-9008-e0036aae3ec7,Abenakiite-(Ce),2023-09-11 22:14:49,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,[APPROVED],[],...,0,0,0,,,[Ce],Abk-Ce,,,19
2,3,1:1:3:3,7e270e8f-93b1-419c-b49e-217d547d3df7,Abernathyite,2024-01-11 10:28:12,K(UO<sub>2</sub>)(AsO<sub>4</sub>)&middot;3H<s...,,K(UO<sub>2</sub>)(AsO<sub>4</sub>) &middot; 3H...,"[APPROVED, GRANDFATHERED]",[],...,0,0,0,,,"[As, U]",Abn,1.608,1.608,103
3,4,1:1:4:2,7b925f9e-322d-4ad2-9689-03849f7ea289,Abhurite,2023-08-07 12:59:27,Sn<sub>21</sub>Cl<sub>16</sub>(OH)<sub>14</sub...,,Sn<sup>2+</sup><sub>21</sub>O<sub>6</sub>(OH)<...,[APPROVED],[],...,0,0,0,,,"[Cl, Sn]",Abh,,,134
4,5,1:1:5:1,8c40e602-73b0-49c8-b514-475c99ea5831,Ablykite,2008-05-08 18:21:13,,,,[],[],...,0,0,0,,,[],,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,2004,1:1:2004:2,1f5ffe35-d66e-4b43-a98c-150cc5e14cfa,Idrizite,2007-06-28 15:21:10,"(Mg,Fe)(Al,Fe)<sub>2</sub>(SO<sub>4</sub>)<sub...",,,[],[REJECTED],...,0,0,0,,,[],,,,1
1996,2005,1:1:2005:9,97baebde-5ceb-4dba-88e4-ca1e5be2ab36,Zinc-bearing Cerussite,2020-06-30 16:36:17,"(Pb,Zn)CO<sub>3</sub>",,,[],[],...,0,0,0,,,[Pb],,,,6
1997,2006,1:1:2006:6,b3bf5c01-bbe5-4707-8f2d-12f07ae2f2cd,Iimoriite-(Y),2024-01-26 09:30:31,Y<sub>2</sub>[SiO<sub>4</sub>][CO<sub>3</sub>],,Y<sub>2</sub>(SiO<sub>4</sub>)(CO<sub>3</sub>),[APPROVED],[RENAMED],...,0,0,0,,,[Y],Iim-Y,,,220
1998,2007,1:1:2007:3,bd9ac53f-cc3f-452a-be56-99d7079dd930,Ikaite,2022-11-11 09:32:12,CaCO<sub>3</sub>&middot;6H<sub>2</sub>O,,Ca(CO<sub>3</sub>) &middot; 6H<sub>2</sub>O,[APPROVED],[],...,0,0,0,,,,Ika,1.46,1.538,214


**LOAD**
* Loading the data into a CSV file using DuckDB

In [7]:
# creating a duckdb object
# Wrap the pandas frame df_all in a DuckDB relation and export it as the bronze CSV.

# the export cannot create missing parent folders, so make sure DATA_DIR exists
os.makedirs(DATA_DIR, exist_ok=True)

df_duck = duckdb.from_df(df_all)
df_duck.write_csv(os.path.join(DATA_DIR, "bronze_geomaterials.csv"), overwrite=True)

error


In [10]:
# Preview the DuckDB relation that was created from df_all in the LOAD step
df_duck

┌───────┬────────────┬──────────────────────┬──────────────────────┬───┬───────────────┬─────────┬─────────┬───────────┐
│  id   │   longid   │         guid         │         name         │ … │ shortcode_ima │  rimin  │  rimax  │ weighting │
│ int64 │  varchar   │       varchar        │       varchar        │   │    varchar    │ varchar │ varchar │   int64   │
├───────┼────────────┼──────────────────────┼──────────────────────┼───┼───────────────┼─────────┼─────────┼───────────┤
│     1 │ 1:1:1:5    │ 464e5cfa-be77-4568…  │ Abelsonite           │ … │ Abl           │         │         │        13 │
│     2 │ 1:1:2:4    │ fb40f3e1-f058-46fe…  │ Abenakiite-(Ce)      │ … │ Abk-Ce        │         │         │        19 │
│     3 │ 1:1:3:3    │ 7e270e8f-93b1-419c…  │ Abernathyite         │ … │ Abn           │ 1.608   │ 1.608   │       103 │
│     4 │ 1:1:4:2    │ 7b925f9e-322d-4ad2…  │ Abhurite             │ … │ Abh           │         │         │       134 │
│     5 │ 1:1:5:1    │ 8c40e602-

**bronze_geomaterials**

In [13]:
# Read the bronze CSV back from DATA_DIR through DuckDB's CSV reader and preview
# the resulting relation.
bronze_csv_path = os.path.join(DATA_DIR, 'bronze_geomaterials.csv')
duck_bronze_geomaterials = duckdb.read_csv(bronze_csv_path)
duck_bronze_geomaterials

┌───────┬─────────────┬──────────────────────┬──────────────────────┬───┬───────────────┬────────┬────────┬───────────┐
│  id   │   longid    │         guid         │         name         │ … │ shortcode_ima │ rimin  │ rimax  │ weighting │
│ int64 │   varchar   │       varchar        │       varchar        │   │    varchar    │ double │ double │   int64   │
├───────┼─────────────┼──────────────────────┼──────────────────────┼───┼───────────────┼────────┼────────┼───────────┤
│     1 │ 1:1:1:5     │ 464e5cfa-be77-4568…  │ Abelsonite           │ … │ Abl           │   NULL │   NULL │        13 │
│     2 │ 1:1:2:4     │ fb40f3e1-f058-46fe…  │ Abenakiite-(Ce)      │ … │ Abk-Ce        │   NULL │   NULL │        19 │
│     3 │ 1:1:3:3     │ 7e270e8f-93b1-419c…  │ Abernathyite         │ … │ Abn           │  1.608 │  1.608 │       103 │
│     4 │ 1:1:4:2     │ 7b925f9e-322d-4ad2…  │ Abhurite             │ … │ Abh           │   NULL │   NULL │       134 │
│     5 │ 1:1:5:1     │ 8c40e602-73b0-49

In [None]:
# Read the SQL text from geomaterials_silver.sql in QUERIES_DIR and run it with DuckDB.
# The `with` block closes the file on exit, so no explicit close() call is needed.
with open(os.path.join(QUERIES_DIR,'geomaterials_silver.sql')) as f:
  query = f.read()

duckdb.sql(query)

**TRASH**

In [None]:
# making a request
# Scratch cell: fetch one small page (page 49, 10 records) and inspect the first
# raw record returned by the API.

endpoint = MINDAT_API_URL + "/geomaterials/"
# interest_cols = ['id','guid','name','colour','entrytype','mindat_formula','occurrence','crystal_system']

df_teste = pd.DataFrame()
# reset before the request so that a failure cannot leave a list from an earlier
# cell in place and have it displayed as if it were this page's data
mineral_list = []

filter_dict = {
  'page': 49,
  'page_size':10
}

try:
  # timeout keeps a stalled connection from hanging the kernel; the call is inside
  # the try so that network errors are reported by the handler below
  response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=5)
  response.raise_for_status()

  mineral_list = response.json()['results']
  print('page', filter_dict['page'], response)

  # data frame handling
  print(type(mineral_list))
  df_teste = pd.DataFrame(mineral_list)

except (requests.RequestException, KeyError, TypeError, ValueError) as e:
  # include the exception so the cause of the failure is visible in the output
  print('Erro ao tentar criar dataframe!', 'pagina', filter_dict['page'], repr(e))

# first raw record of the page, or None when nothing was retrieved
mineral_list[0] if mineral_list else None