**Imports and Setup**

In [2]:
import os

import numpy as np
import pandas as pd
import requests
import dotenv
import json
import time

import duckdb

# setup
API_KEY = dotenv.get_key(".env", "api_min")

# API root entry point
MINDAT_API_URL = "https://api.mindat.org"

# authorization header that must be included with each request.
headers = {'Authorization': 'Token ' + API_KEY}

# directories
DATA_DIR = os.path.join(os.path.abspath('./'), 'data')
QUERIES_DIR = os.path.join(os.path.abspath('./'), 'queries')

# fixing pandas analyzer
duckdb.execute("SET GLOBAL pandas_analyze_sample = 100_000")

<duckdb.duckdb.DuckDBPyConnection at 0x274decc4d30>

**Making a simple request**

In [11]:
# making a request

end_point = "/geomaterials/"
filter_dict = {
  'page':57,
  'page_size':1000
}

# df_all = pd.DataFrame()
# interest_cols = ['id','guid','name','colour','entrytype','mindat_formula','occurrence']

endpoint = MINDAT_API_URL+end_point
print("endpoint", endpoint)
response = requests.get(endpoint,params=filter_dict,headers=headers)
print(response)
# mineral_list = response.json()['results']
# df_page = pd.DataFrame.from_dict(mineral_list)
# df_page[interest_cols][:10]

endpoint https://api.mindat.org/geomaterials/
<Response [404]>


**EXTRACT: Making page requests**

In [34]:
# making a request

endpoint = MINDAT_API_URL + "/geomaterials/"

df_all = pd.DataFrame()
for page in range(1,60):
  filter_dict = {
    'page': page,
    'page_size':1000
  }
  
  code = 0
  while (code != 200 and code != 404):
    time.sleep(1)
    print('Trying', endpoint, page)
    response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=30)
    code = response.status_code
  
  try:
    mineral_list = response.json()['results']
    print('page', page, response)

    # data frame handling
    df_page = pd.DataFrame.from_dict(mineral_list)
    df_all = pd.concat([df_all, df_page])
  except Exception as e:
    print('Erro ao tentar criar dataframe!', 'pagina', page)

df_all.reset_index(inplace=True, drop=True)
df_all

Trying https://api.mindat.org/geomaterials/ 1
page 1 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 2
page 2 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 3
page 3 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 4
page 4 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 5
page 5 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 6
page 6 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 7
page 7 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 8
page 8 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 9
page 9 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 10
page 10 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 11
page 11 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 12
page 12 <Response [200]>
Trying https://api.mindat.org/geomaterials/ 13
Trying https://api.mindat.org/geomaterials/ 13
page 13 <Response [200]>
Trying https://api.mindat.org/geoma

Unnamed: 0,id,longid,guid,name,updttime,mindat_formula,mindat_formula_note,ima_formula,ima_status,ima_notes,...,rock_parent,rock_parent2,rock_root,rock_bgs_code,meteoritical_code,key_elements,shortcode_ima,rimin,rimax,weighting
0,1,1:1:1:5,464e5cfa-be77-4568-a724-62137f35df18,Abelsonite,2022-08-03 21:27:09,Ni(C<sub>31</sub>H<sub>32</sub>N<sub>4</sub>),,NiC<sub>31</sub>H<sub>32</sub>N<sub>4</sub>,[APPROVED],[],...,0,0,0,,,"[N, Ni]",Abl,,,13
1,2,1:1:2:4,fb40f3e1-f058-46fe-9008-e0036aae3ec7,Abenakiite-(Ce),2023-09-11 22:14:49,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,[APPROVED],[],...,0,0,0,,,[Ce],Abk-Ce,,,19
2,3,1:1:3:3,7e270e8f-93b1-419c-b49e-217d547d3df7,Abernathyite,2024-01-11 10:28:12,K(UO<sub>2</sub>)(AsO<sub>4</sub>)&middot;3H<s...,,K(UO<sub>2</sub>)(AsO<sub>4</sub>) &middot; 3H...,"[APPROVED, GRANDFATHERED]",[],...,0,0,0,,,"[As, U]",Abn,1.608,1.608,103
3,4,1:1:4:2,7b925f9e-322d-4ad2-9689-03849f7ea289,Abhurite,2023-08-07 12:59:27,Sn<sub>21</sub>Cl<sub>16</sub>(OH)<sub>14</sub...,,Sn<sup>2+</sup><sub>21</sub>O<sub>6</sub>(OH)<...,[APPROVED],[],...,0,0,0,,,"[Cl, Sn]",Abh,,,134
4,5,1:1:5:1,8c40e602-73b0-49c8-b514-475c99ea5831,Ablykite,2008-05-08 18:21:13,,,,[],[],...,0,0,0,,,[],,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55070,471265,1:1:471265:3,,Sooty chalcocite,2024-09-04 21:34:23,,,,[],[],...,0,0,0,,,,,,,0
55071,471266,1:1:471266:0,,sordavalite,2024-09-04 21:42:46,,,,[],[],...,0,0,0,,,,,,,0
55072,471267,1:1:471267:7,,Monazite,,,,,[],[],...,0,0,0,,,[],,,,0
55073,471268,1:1:471268:4,,Gasparite Group,2024-09-07 21:46:30,REE(AsO<sub>4</sub>),,,[],[],...,0,0,0,,,[As],,,,0


In [36]:
# df_all.to_csv(os.path.join(DATA_DIR, "raw_geomaterials.csv"))

In [95]:
df_all

Unnamed: 0,id,longid,guid,name,updttime,mindat_formula,mindat_formula_note,ima_formula,ima_status,ima_notes,...,rock_parent,rock_parent2,rock_root,rock_bgs_code,meteoritical_code,key_elements,shortcode_ima,rimin,rimax,weighting
0,1,1:1:1:5,464e5cfa-be77-4568-a724-62137f35df18,Abelsonite,2022-08-03 21:27:09,Ni(C<sub>31</sub>H<sub>32</sub>N<sub>4</sub>),,NiC<sub>31</sub>H<sub>32</sub>N<sub>4</sub>,[APPROVED],[],...,0,0,0,,,"[N, Ni]",Abl,,,13
1,2,1:1:2:4,fb40f3e1-f058-46fe-9008-e0036aae3ec7,Abenakiite-(Ce),2023-09-11 22:14:49,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,,Na<sub>26</sub>Ce<sub>6</sub>(Si<sub>6</sub>O<...,[APPROVED],[],...,0,0,0,,,[Ce],Abk-Ce,,,19
2,3,1:1:3:3,7e270e8f-93b1-419c-b49e-217d547d3df7,Abernathyite,2024-01-11 10:28:12,K(UO<sub>2</sub>)(AsO<sub>4</sub>)&middot;3H<s...,,K(UO<sub>2</sub>)(AsO<sub>4</sub>) &middot; 3H...,"[APPROVED, GRANDFATHERED]",[],...,0,0,0,,,"[As, U]",Abn,1.608,1.608,103
3,4,1:1:4:2,7b925f9e-322d-4ad2-9689-03849f7ea289,Abhurite,2023-08-07 12:59:27,Sn<sub>21</sub>Cl<sub>16</sub>(OH)<sub>14</sub...,,Sn<sup>2+</sup><sub>21</sub>O<sub>6</sub>(OH)<...,[APPROVED],[],...,0,0,0,,,"[Cl, Sn]",Abh,,,134
4,5,1:1:5:1,8c40e602-73b0-49c8-b514-475c99ea5831,Ablykite,2008-05-08 18:21:13,,,,[],[],...,0,0,0,,,[],,,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55070,471265,1:1:471265:3,,Sooty chalcocite,2024-09-04 21:34:23,,,,[],[],...,0,0,0,,,,,,,0
55071,471266,1:1:471266:0,,sordavalite,2024-09-04 21:42:46,,,,[],[],...,0,0,0,,,,,,,0
55072,471267,1:1:471267:7,,Monazite,,,,,[],[],...,0,0,0,,,[],,,,0
55073,471268,1:1:471268:4,,Gasparite Group,2024-09-07 21:46:30,REE(AsO<sub>4</sub>),,,[],[],...,0,0,0,,,[As],,,,0


In [29]:
df_all['vhnmax'] = df_all['vhnmax'].apply(lambda x: str.split(x,sep='')[0])
df_all[''] = df_all['vhnmax'].apply(lambda x: str.split(x,sep='')[0])

ValueError: empty separator

**LOAD - raw**
* loading data into csv file using duckdb

In [127]:
# creating a duckdb object
raw = duckdb.read_csv(os.path.join(DATA_DIR, "raw_geomaterials.csv"), sample_size=-1)
duckdb.sql("select * from raw").show(max_rows=200, max_width=10000)

# df_duck.write_csv(os.path.join(DATA_DIR, "bronze_geomaterials.csv"), overwrite=True)

┌───────────┬───────┬─────────────┬──────────────────────────────────────┬──────────────────────────────────────────┬─────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────────┬────────────────────────────────┬───────────┬───────┬────────────┬─────────┬───────────┬────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────

**bronze_geomaterials**

In [97]:
bronze_geomaterials = duckdb.read_csv(os.path.join(DATA_DIR,'bronze_geomaterials.csv'), sample_size=100_000)
bronze_geomaterials.show()

┌───────┬─────────────┬──────────────────────┬──────────────────────┬───┬───────────────┬────────┬────────┬───────────┐
│  id   │   longid    │         guid         │         name         │ … │ shortcode_ima │ rimin  │ rimax  │ weighting │
│ int64 │   varchar   │       varchar        │       varchar        │   │    varchar    │ double │ double │   int64   │
├───────┼─────────────┼──────────────────────┼──────────────────────┼───┼───────────────┼────────┼────────┼───────────┤
│     1 │ 1:1:1:5     │ 464e5cfa-be77-4568…  │ Abelsonite           │ … │ Abl           │   NULL │   NULL │        13 │
│     2 │ 1:1:2:4     │ fb40f3e1-f058-46fe…  │ Abenakiite-(Ce)      │ … │ Abk-Ce        │   NULL │   NULL │        19 │
│     3 │ 1:1:3:3     │ 7e270e8f-93b1-419c…  │ Abernathyite         │ … │ Abn           │  1.608 │  1.608 │       103 │
│     4 │ 1:1:4:2     │ 7b925f9e-322d-4ad2…  │ Abhurite             │ … │ Abh           │   NULL │   NULL │       134 │
│     5 │ 1:1:5:1     │ 8c40e602-73b0-49

In [98]:
duckdb.describe(bronze_geomaterials)

AttributeError: module 'duckdb' has no attribute 'describe'

**silver_geomaterials**

* unsing ``silver_geomaterials.sql`` query to get a treated analytical table.

In [42]:
with open(os.path.join(QUERIES_DIR,'silver_geomaterials.sql')) as f:
  query = f.read()
  f.close()

silver_geomaterials = duckdb.sql(query)
# silver_geomaterials.write_csv(os.path.join(DATA_DIR, "silver_geomaterials.csv"), overwrite=True)
silver_geomaterials.show(max_width=10000)

┌───────────┬─────────────────────────────┬───────────────┬───────────────────────────────────┬──────────────────────────┬─────────────────┬───────────────────┬────────────────────────────────────────────────────────────────────┬───────────────────────────┬─────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────┬─────────────────────┐
│ idMineral │          descName           │ descEntryType │         descMindatFormula         │       descElements       │ descKeyElements │ descCrystalSystem │                             descColour                             │       descImaStatus       │ idVarietyOf │                                                                      