In [None]:
import os,sys
from clickhouse_driver import connect,Client
from pprint import pprint

# Optionally load the ClickHouse credentials from the NSDF vault. This only
# works when the nsdf package is importable from two directories up.
try:
	sys.path.append("../../")
	from nsdf.kernel import NormalizeEnv, SetEnv
	env=NormalizeEnv({"include-vault": ["clickhouse-doublecloud"]})
	print("Loaded env")
	SetEnv(env)
except Exception as ex:
	# Best effort: without the vault we rely on whatever is already exported in
	# the process environment. `Exception` (rather than a bare except) lets
	# KeyboardInterrupt / SystemExit propagate.
	print("NSDF vault not loaded, using CLICKHOUSE_* environment variables:", repr(ex))

# Bind the connection settings in BOTH cases, so Connect() never hits a NameError
# after a successful vault load.
# NOTE(review): presumably SetEnv exports the vault values into os.environ — confirm.
CLICKHOUSE_HOST       = str(os.environ.get("CLICKHOUSE_HOST","127.0.0.1")).strip()
CLICKHOUSE_PORT       = str(os.environ.get("CLICKHOUSE_PORT","9000")).strip()
CLICKHOUSE_USER       = str(os.environ.get("CLICKHOUSE_USER","default")).strip()
CLICKHOUSE_PASSWORD   = str(os.environ.get("CLICKHOUSE_PASSWORD","")).strip()
# Parse the flag explicitly instead of eval()-ing an environment variable.
CLICKHOUSE_SECURE     = str(os.environ.get("CLICKHOUSE_SECURE","True")).strip().lower() in ("1","true","yes","on")

def Connect():
	"""Build a clickhouse_driver Client from the module-level CLICKHOUSE_* settings."""
	settings = dict(
		host=CLICKHOUSE_HOST,
		port=str(CLICKHOUSE_PORT),
		user=CLICKHOUSE_USER,
		password=CLICKHOUSE_PASSWORD,
		secure=CLICKHOUSE_SECURE,
	)
	return Client(**settings)


# Shared client object used by the cells below.
client=Connect()
# Ask the server for the CREATE TABLE statement of nsdf.catalog; the result is the cell output.
client.execute("SHOW CREATE TABLE nsdf.catalog")

In [None]:
def AddSuffix(number,type="", base=1024):
	"""Format `number` with a magnitude prefix (none, K, M, G, T, P) followed by `type`.

	The value is integer-divided by the largest power of `base` that does not
	exceed it, e.g. AddSuffix(2048, type="B") == "2KB" and
	AddSuffix(1500, base=1000) == "1K".

	Args:
		number: the quantity to format; values below `base` are printed unscaled.
		type: unit text appended after the prefix (for example "B" for bytes).
		base: scaling step between prefixes (1024 for binary, 1000 for decimal).

	Returns:
		The formatted string. Anything at or above base**5 is expressed in "P".
	"""
	# Prefix at position i covers values below base**(i+1). The thresholds start
	# at base**1 so that the "K" range is reachable.
	for exponent, prefix in enumerate(("", "K", "M", "G", "T")):
		if number < base**(exponent+1):
			return str(number//(base**exponent))+prefix+type
	return str(number//(base**5))+"P"+type

def PrintInfo(name,count,size):
	"""Print one padded summary line: repository name, file count and total size.

	The count is formatted with AddSuffix in base 1000, the size in base 1024
	with a "B" unit; each field is left-justified to a fixed width.
	"""
	fields = [
		"repository =",
		name.ljust(19),
		" - number of files =",
		AddSuffix(count,base=1000).ljust(7),
		" - total file size =",
		AddSuffix(size,base=1024,type="B").ljust(7),
	]
	print(*fields)

# File count and summed size per catalog, as (catalog, count, size) rows.
my_table = client.execute(
	"\nSELECT catalog,COUNT(size),SUM(size)\n"
	"from nsdf.catalog\n"
	"group by catalog;\n\t"
)

# Grand totals over every catalog, added as an extra "all repositories" row.
tot_count = sum(entry[1] for entry in my_table)
tot_size = sum(entry[2] for entry in my_table)
my_table = [["all repositories",tot_count,tot_size]] + my_table

# Largest total size first; a reversed sort keeps ties in their original order.
my_table.sort(key=lambda entry: entry[2], reverse=True)

# Keep an independent copy for the cells further down, then print the summary.
repository_table = list(my_table)
for name,count,size in repository_table:
	PrintInfo(name,count,size)

In [None]:
import os,sys
from pprint import pprint

# Collect every distinct (catalog, bucket) pair and preview the first ten.
buckets=list(client.execute("SELECT DISTINCT catalog,bucket FROM nsdf.catalog"))
print(buckets[:10])

In [None]:
def GetFileNames (catalog = None, group_buckets = False, group_suffix = False):
	"""Count files and sum their sizes in nsdf.catalog per group.

	Only rows whose name matches one of the LIKE patterns in the query are
	considered: at least one character, then a dot followed by one to four
	characters at the end of the name. The suffix column is '.' plus the text
	after the last dot.

	Args:
		catalog: when not None, only rows with this catalog value are counted.
		group_buckets: group by (catalog, bucket); takes precedence over group_suffix.
		group_suffix: group by (catalog, suffix).
		With both flags False the grouping is (catalog, bucket, suffix).

	Returns:
		[header, rows]. header is the list of column titles. rows is the value
		returned by client.execute: one row per group, made of the grouping columns
		followed by the file count and the summed size, ordered by file count
		descending.
	"""
	if group_buckets:
		columns = ["catalog", "bucket"]
		header=["repository", "bucket", "file count", "files sizes"]
	elif group_suffix:
		columns = ["catalog", "suffix"]
		header=["repository", "suffix","file count", "files sizes"]
	else:
		columns = ["catalog", "bucket", "suffix"]
		header=["repository", "bucket", "suffix","file count", "files sizes"]
	grouping = ", ".join(columns)

	if catalog is None:
		# No parameters: the driver sends the text unchanged, so '%' stays single.
		catalog_restriction, params, pct = "1==1", None, "%"
	else:
		# Let the driver quote and escape the value instead of splicing it into
		# the SQL text. With parameters the query goes through Python
		# %-formatting, so every literal '%' in the LIKE patterns must be doubled.
		catalog_restriction, params, pct = "catalog=%(catalog)s", {"catalog": catalog}, "%%"

	query = f"""
		SELECT {grouping},count(size) as NumSize,SUM(size) as TotSize
		FROM (
			SELECT
				arrayStringConcat(['.',splitByChar('.',name)[-1]]) as suffix,
				size as size ,
				catalog,
				bucket
			FROM
				nsdf.catalog
			WHERE {catalog_restriction} AND (
				name like '_{pct}._'   OR
				name like '_{pct}.__'  OR
				name like '_{pct}.___' OR
				name like '_{pct}.____'   )
			) derived_table
			GROUP BY {grouping}
			ORDER BY NumSize DESC;
	"""

	ret = client.execute(query, params)

	return [header, ret]

# Run the ungrouped query and show the header plus the first eleven rows.
header,rows = GetFileNames()
print (header)
for row in rows[:11]:
	print(row)

In [None]:
# example of plot of filesize inside a dataset
import pandas as pd
import matplotlib.pyplot as plt

def PlotSizes(filename,sizes):
	"""Plot the sorted values of `sizes` against their rank, save the figure and show it.

	Args:
		filename: output image path; its parent directory is created when missing.
			A bare file name (no directory part) is written to the current directory.
		sizes: non-empty iterable of sortable values. The minimum and maximum are
			read for the title, so an empty input raises IndexError.
	"""
	sizes=sorted(sizes)
	# Inches per dot, so the figure size below can be expressed in pixels.
	ipd = 1/plt.rcParams['figure.dpi']
	fig = plt.figure(figsize=(1024*ipd,768*ipd))
	plt.title(f"{filename} #({len(sizes)}) m({sizes[0]}) M({sizes[-1]})")
	# `sizes` is already sorted above; no second sort is needed.
	plt.plot(range(len(sizes)), sizes)
	# os.makedirs("") raises, so only create the directory when there is one.
	folder = os.path.dirname(filename)
	if folder:
		os.makedirs(folder,exist_ok=True)
	plt.savefig(filename)
	plt.show()
	# Release the figure so repeated calls in a loop do not accumulate open figures.
	plt.close(fig)

# Plot the size distribution of the first (catalog, bucket) pair that has rows.
for catalog, bucket in buckets:
	# Pass the values as driver parameters so that names containing quotes
	# cannot break (or alter) the SQL statement.
	sizes=[it[0] for it in client.execute(
		"SELECT size FROM nsdf.catalog WHERE catalog=%(catalog)s and bucket=%(bucket)s",
		{"catalog": catalog, "bucket": bucket})]
	if not sizes: continue
	PlotSizes(filename=f"/tmp/plots/{catalog}/{bucket}.png",sizes=sizes)
	# remove the `break` if you want all the plots
	break

In [None]:
# total records 56M files 44PB
# Reconnect, then read the overall file count and summed size in one query.
client = Connect()
# NOTE(review): despite its name, TOT_BYTES holds SUM(size) divided by 1024**3, not a raw byte count.
TOT_FILES,TOT_BYTES=client.execute(f"SELECT count(size),SUM(size)/(1024*1024*1024) FROM nsdf.catalog;")[0]
print(TOT_FILES,TOT_BYTES)

In [None]:
# Total size per (catalog, bucket), largest first.
client.execute("""
	SELECT catalog,bucket, SUM(size) as TotSize
	FROM nsdf.catalog
	GROUP BY catalog,bucket
	ORDER BY TotSize DESC;
	""")

In [None]:
# Total number of objects per (catalog, bucket), most objects first.
client.execute("""
	SELECT catalog,bucket, COUNT(size) As NumObjects
	FROM nsdf.catalog
	group by catalog,bucket
	ORDER BY NumObjects DESC;
""")

In [None]:
# Example: summed size of one specific (catalog, bucket) pair.
client.execute("""
	SELECT SUM(size) from nsdf.catalog 
	WHERE catalog='aws-open-data' and bucket='noaa-cors-pds';
""")


In [None]:
# LIKE query on file names: count the rows whose name contains the letter 'a'.
client.execute("""
	SELECT count(*) from nsdf.catalog
	where name like '%a%'
""")

In [None]:
# Summed size per catalog (no explicit ordering).
client.execute("""
	SELECT catalog,SUM(size)
	FROM nsdf.catalog
	GROUP BY catalog;
""")

In [None]:
# SUM per bucket
# NOTE(review): rows are ordered by file COUNT, not by the displayed SUM; this matches
# the row order of the COUNT-per-bucket query below — confirm that this is intended.
client.execute("""
	SELECT catalog,bucket,SUM(size)
	FROM nsdf.catalog
	GROUP BY catalog,bucket
	ORDER BY COUNT(size) DESC;
""")

In [None]:
# COUNT per bucket, most files first.
client.execute("""
	SELECT catalog,bucket,COUNT(size)
	FROM nsdf.catalog
	GROUP BY catalog,bucket
	ORDER BY COUNT(size) DESC;
""")

In [None]:
# File size distribution: every size in catalog 'mc', bucket '102', in ascending order.
client.execute("""
	SELECT size
	FROM nsdf.catalog
	WHERE catalog='mc' and bucket='102'
	ORDER BY size ASC;
""")

In [None]:
# delete ALL records from nsdf.catalog (destructive: the statement below is kept commented out)
# ALTER TABLE nsdf.catalog DELETE WHERE 1=1;

In [None]:
# Get the extension: splitByChar('.', ...)[-1] yields the text after the last dot.
# NOTE(review): the expression is applied to a fixed string literal rather than to the
# `name` column, so each of the 10 returned rows holds the same value — confirm whether
# `name` was intended here.
client.execute("""
	SELECT splitByChar('.','giorgio.scorzelli.h5')[-1]
	FROM nsdf.catalog             
	ORDER BY size DESC
	LIMIT 10
""")

In [None]:
import yaml

from bokeh.layouts import row,column
from bokeh.models.widgets import Div
from bokeh.models import ColumnDataSource, Slider , Dropdown, Select, DataTable,TableColumn, Button, Dropdown
from bokeh.plotting import figure, curdoc
from bokeh.themes import Theme
from bokeh.io import show, output_notebook, curdoc
from bokeh.sampledata.sea_surface_temperature import sea_surface_temperature

from IPython.display import clear_output
from IPython.display import IFrame

import requests
from requests.exceptions import HTTPError

def in_notebook():
	"""Return True when IPython's get_ipython() yields a truthy shell object, else False."""
	from IPython import get_ipython
	shell = get_ipython()
	return bool(shell)

In [None]:
# Only call bokeh's output_notebook() when an IPython shell is present.
if in_notebook():
	output_notebook()

In [None]:
# Display copy of repository_table: the counts and sizes are rendered as text,
# with thousands separators, '_' padding and the abbreviated value in parentheses.
repository_table2=[]
for name,count,size in repository_table:
	repository_table2.append([
		name,
		format(count, ",").rjust(11,'_') + " ("+AddSuffix(count, type="", base=1000).ljust(7)+")",
		format(size , ",").rjust(23,'_') + " ("+AddSuffix (size ,type="B",base=1024).ljust(7)+")"])

# Initial rows of the detail table built in modify_doc. That table reads five
# columns per row (repository, bucket, suffix, file count, size), which only the
# ungrouped query returns; with group_buckets=True each row has four columns and
# indexing the fifth raises IndexError.
query_result = GetFileNames(catalog = None,group_buckets = False, group_suffix = False)

# Current UI selection, shared with the callbacks defined inside modify_doc.
CurrentValues     = {
	"filter": "File_type",
	"repository": "all repositories"
}

# /////////////////////////////////////////////////////////////////////
def modify_doc(doc):
	"""Populate a Bokeh document with the repository browser.

	The layout is a column made of: the repository summary table (built from
	repository_table2), a button whose label shows the selected repository, a
	"Grouping" dropdown, and a detail table with one row per
	(repository, bucket, suffix) as returned by GetFileNames.

	Selecting a row in the summary table re-queries the detail table for that
	repository ("all repositories" removes the restriction).

	Args:
		doc: the Bokeh document that receives the layout via doc.add_root.
	"""

	def _detail_columns(rows):
		# Turn 5-column result rows into the column dict used by the detail table.
		return {
			"repository": [s[0] for s in rows],
			"project"   : [s[1] for s in rows],
			"file type" : [s[2] for s in rows],
			"file count": [s[3] for s in rows],
			"storage"   : [s[4] for s in rows]}

	source_table = ColumnDataSource(data = {
		"Repository"   : [s[0] for s in repository_table2],
		"file count"   : [s[1] for s in repository_table2],
		"total storage": [s[2] for s in repository_table2],
		})

	button = Button(label=" ", button_type="success", height = 30)

	dropdown = Dropdown(label="Grouping", button_type="warning", menu=[
		("None", "None"),
		("Bucket", "Bucket"),
		("File_type", "File_type")
	])

	# Initial content comes from the module-level query_result.
	source_table2 = ColumnDataSource(data = _detail_columns(query_result[1]))

	table = DataTable(
		source = source_table,
		height=220,
		columns = [
			TableColumn(field = "Repository", title = "Repository"),
			TableColumn(field = "file count", title = "file count"),
			TableColumn(field = "total storage", title = "total storage")
		])

	table2 = DataTable(
		source = source_table2,
		columns = [
			TableColumn(field = "repository" , title = "repository"),
			TableColumn(field = "project"	, title = "project"   ),
			TableColumn(field = "file type"  , title = "file type"   ),
			TableColumn(field = "file count" , title = "file count"   ),
			TableColumn(field = "storage"	, title = "storage"   )
		])

	def UpdateTable():
		# Re-run the ungrouped query for the repository stored in CurrentValues.
		# NOTE(review): CurrentValues["filter"] is recorded by the dropdown handler
		# but is not applied here; the detail table always shows all five columns.
		catalog=None if CurrentValues["repository"]== "all repositories" else CurrentValues["repository"]
		query_result = GetFileNames(catalog = catalog, group_buckets = False, group_suffix  = False)
		source_table2.data = _detail_columns(query_result[1])

	def callback(attr, old, new, button = button):
		# `new` is the list of selected row indices; it is empty when the
		# selection is cleared, in which case there is nothing to look up.
		if not new:
			return
		# Row order of the summary table matches repository_table.
		selected = repository_table[new[0]][0]
		button.label = selected
		CurrentValues["repository"] = selected
		UpdateTable()

	source_table.selected.on_change('indices', callback)

	def handler(event):
		# Record the chosen grouping (printed before and after) and refresh.
		print (CurrentValues["filter"])
		print(event.item)
		CurrentValues["filter"]	 = event.item
		print (CurrentValues["filter"])
		UpdateTable()

	dropdown.on_click(handler)

	doc.add_root(column(table,button, dropdown,table2))

In [None]:
# Hand the document-building function to bokeh's show() to display the application.
show(modify_doc)