**Imports and Setup**

In [2]:
import os
import dotenv
import json
import time

import numpy as np
import pandas as pd

import requests
import duckdb

# setup
# Read the Mindat API token stored under the "api_min" key of the local .env file.
API_KEY = dotenv.get_key(".env", "api_min")
if not API_KEY:
  # Without a token the header concatenation below would fail with an opaque
  # TypeError; stop here with an actionable message instead.
  raise RuntimeError('Missing Mindat API token: define "api_min" in the .env file.')

# API root entry point
URL = "https://api.mindat.org"

# authorization header that must be included with each request.
headers = {'Authorization': 'Token ' + API_KEY}

# directories (resolved relative to the current working directory)
DATA_DIR = os.path.join(os.path.abspath('./'), 'data')
QUERIES_DIR = os.path.join(os.path.abspath('./'), 'queries')

# fixing pandas analyzer: set DuckDB's global pandas_analyze_sample option to 100_000
duckdb.execute("SET GLOBAL pandas_analyze_sample = 100_000")

<duckdb.duckdb.DuckDBPyConnection at 0x1cb59cea4b0>

**Making a simple request**

In [None]:
# Single-page smoke test against the /geomaterials/ endpoint.

end_point = "/geomaterials/"
filter_dict = {
  'page':57,
  'page_size':1000
}

# df_all = pd.DataFrame()
# interest_cols = ['id','guid','name','colour','entrytype','mindat_formula','occurrence']

endpoint = URL+end_point
print("endpoint", endpoint)
# Bound the wait so a stalled connection cannot hang the kernel indefinitely.
response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=30)
print(response)
# mineral_list = response.json()['results']
# df_page = pd.DataFrame.from_dict(mineral_list)
# df_page[interest_cols][:10]

**EXTRACT: Making page requests**

In [None]:
# Paginated extraction: fetch /geomaterials/ page by page and stack the
# results into a single DataFrame (df_all).

endpoint = URL + "/geomaterials/"

# Cap on attempts per page so a persistent error status (e.g. 401 or 500)
# or a dead connection cannot keep the loop spinning forever.
max_attempts = 5

page_frames = []
for page in range(1,60):
  filter_dict = {
    'page': page,
    'page_size':1000
  }

  response = None
  code = 0
  for attempt in range(max_attempts):
    time.sleep(1)  # pause before every call, including retries
    print('Trying', endpoint, page)
    try:
      response = requests.get(endpoint,params=filter_dict,headers=headers,timeout=30)
    except requests.RequestException as e:
      # Timeouts / connection errors: report and retry instead of aborting the run.
      print('Request failed', 'page', page, e)
      continue
    code = response.status_code
    if code in (200, 404):
      break

  if code != 200:
    # 404 or attempts exhausted: nothing usable for this page.
    print('Skipping page', page, 'status', code)
    continue

  try:
    mineral_list = response.json()['results']
    print('page', page, response)

    # data frame handling: keep each page and concatenate once after the loop
    df_page = pd.DataFrame.from_dict(mineral_list)
    page_frames.append(df_page)
  except Exception as e:
    # Best-effort: report the failure (with its cause) and move on to the next page.
    print('Erro ao tentar criar dataframe!', 'pagina', page, e)

# A single concat avoids re-copying the accumulated frame on every page;
# ignore_index=True yields a fresh 0..n-1 index.
df_all = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()
# df_all.to_csv(os.path.join(DATA_DIR, "raw_geomaterials.csv"))
df_all

In [53]:
# Load the bronze parquet file as a DuckDB relation. Keep the variable name:
# the query below references `bronze_geomaterials` by name.
bronze_parquet_path = os.path.join(DATA_DIR, "bronze_geomaterials.parquet")
bronze_geomaterials = duckdb.read_parquet(bronze_parquet_path)

# Preview the distinct non-null values of the opticaldispersion column.
dispersion_values = duckdb.sql(
  "select distinct opticaldispersion from bronze_geomaterials where opticaldispersion is not null"
)
dispersion_values.show(max_width=10000, max_rows=10)

┌───────────────────────────────────────────────────────────┐
│                     opticaldispersion                     │
│                          varchar                          │
├───────────────────────────────────────────────────────────┤
│ r > v, strong                                             │
│ not observed                                              │
│ r<< v, extreme. Extremely inclined monoclinic dispersion. │
│ medium                                                    │
│ r<v weak                                                  │
│ ·                                                         │
│ ·                                                         │
│ ·                                                         │
│ no                                                        │
│ strong, r > v; orientation is Y = b; X ≈ a, Z ≈ c.        │
│ Medium, r < v.                                            │
│ very weak with r > v                                      │
│ medium

In [55]:
# Run the silver-layer SQL file and preview the result.
with open(os.path.join(QUERIES_DIR, 'silver_geomaterials.sql')) as q:
  # The with-block closes the file on exit, so no explicit close is needed.
  silver_query = q.read()

duckdb.sql(silver_query).show(max_rows=100, max_width=10000)

┌─────────┬───────────────┬──────────────────────┬─────────────────┬───────────────┬─────────────────┬───────────────┬─────────────────────┬──────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬───────────────────────────────┬────────────────────────────────────────────────────────────────────────────────────────────────┬───────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬──────────┬──────────┬──────────┬──────────┬──────────────────┬───────────────────────────────────────┬──────────────┬──────────────┬───────────────────────────────────────────────────────────────────┬──────────────┬────────────┬───────────┬──────────┬───────────┬───────────────┬───

In [21]:
# Lift pandas' row-display cap so the whole summary is rendered
# (the option stays changed for the rest of the session).
pd.set_option('display.max_rows', None)

# NOTE(review): `raw` is not defined in the cells visible here — confirm which
# relation this is meant to summarise.
# Transposed so each described column becomes one row.
raw.describe().to_df().T

Unnamed: 0,0,1,2,3,4,5
aggr,count,mean,stddev,min,max,median
column000,55075.0,27537.0,15898.927375,0.0,55074.0,27537.0
id,55075.0,35573.365883,58568.030147,1.0,471269.0,29250.0
longid,55075,,,1:1:10000:5,1:1:9:7,
guid,54558,,,0000f368-1a21-4019-98c4-5d3c725ab2b5,ffff6d75-220c-456d-bb84-ab4d72bcb04a,
name,55075,,,"""Cu-excess"" tennantite",ｷﾑﾘ石,
updttime,30590,,,2006-11-13 10:20:12,2024-09-09 21:43:10,
mindat_formula,25275,,,&#9723;{CaNa}{Fe<sup>2+</sup><sub>4</sub>Al}(...,☐{X<sup>2+</sup><sub>2</sub>}{Z<sup>2+</sup><s...,
mindat_formula_note,1541,,,,δN,
ima_formula,5974,,,&#9723;(Al<sub>2</sub>Li)Al<sub>6</sub>(Si<sub...,{Cu<sub>9</sub>Al[SiO<sub>3</sub>(OH)]<sub>2</...,


In [None]:
# Column groups, stored as column-name strings.
# Bare identifiers here would raise NameError, since no such variables exist.
meta_cols = ['longid', 'guid', 'updttime']

# TODO(review): `chemical_cols` had no definition; seeded with the formula
# columns visible in the describe() output — extend or adjust as needed.
chemical_cols = ['mindat_formula', 'mindat_formula_note', 'ima_formula']
chemical_cols


**bronze_geomaterials**

In [None]:
# Load the bronze CSV export as a DuckDB relation and preview it.
# Note: this rebinds `bronze_geomaterials`, the same name the parquet cell uses.
bronze_csv_path = os.path.join(DATA_DIR, 'bronze_geomaterials.csv')
bronze_geomaterials = duckdb.read_csv(bronze_csv_path, sample_size=100_000)
bronze_geomaterials.show()

In [None]:
# Summarise the bronze relation loaded above; the result is shown as the cell output.
# NOTE(review): an earlier cell uses the relation method (`raw.describe()`) — confirm
# that the module-level `duckdb.describe` accepts a relation in the installed version.
duckdb.describe(bronze_geomaterials)

**silver_geomaterials**

* using the ``silver_geomaterials.sql`` query to get a treated analytical table.

In [None]:
# Read the silver-layer SQL text. The with-block closes the file on exit,
# so an explicit close() call is unnecessary.
with open(os.path.join(QUERIES_DIR,'silver_geomaterials.sql')) as f:
  query = f.read()

# Build the silver relation from the query and preview it at full width.
silver_geomaterials = duckdb.sql(query)
# silver_geomaterials.write_csv(os.path.join(DATA_DIR, "silver_geomaterials.csv"), overwrite=True)
silver_geomaterials.show(max_width=10000)