In [1]:
import math
import matplotlib.pyplot as plt
import os
import subprocess
import sys
import pandas as pd

from matplotlib.ticker import FuncFormatter

In [2]:
# pull in our gquery module and import the config
# The scripts directory is appended to sys.path so that `import gquery` below can be resolved.
sys.path.append('../../scripts')
# NOTE(review): GUFI_CONFIG is set before gquery is imported and queried; presumably
# gquery reads it to locate its configuration file -- confirm before reordering these lines.
os.environ['GUFI_CONFIG']='/home/jbent/GUFI/configs/anony'
#os.environ['GUFI_DEBUG']='TRUE'
import gquery as gq

config=gq.query_gconfig()
# nthreads is captured as the default thread count by the query helpers defined in the next cell,
# so this cell has to run before those definitions.
nthreads=config.config['Threads']
indexroot=config.config['IndexRoot']
print(f"Using {nthreads} threads on {indexroot}")

Using 224 threads on /mnt/nvme1n1/jbent/anony


In [3]:
# function to find the largest file in an index
def largest_file_size(indexroot,nthreads=nthreads):
    """Return the maximum `size` value found in the `vrpentries` table of an index.

    Parameters
    ----------
    indexroot : root of the index to query (forwarded to gq.get_gufi_command).
    nthreads  : thread count forwarded to gq.get_gufi_command; defaults to the
                value of the module-level `nthreads` at definition time.

    Returns
    -------
    Whatever gq.execute_command yields when reducing the query output with
    aggregate_function='max' (presumably the size in bytes -- verify against gquery).
    """
    query = {
        'select': 'size',
        'tables': 'vrpentries',
        'where': None,  # no filter: every entry takes part in the max
        'nthreads': nthreads,
        'indexroot': indexroot,
    }
    command, cstr = gq.get_gufi_command(**query)
    return gq.execute_command(command=command, command_string=cstr, aggregate_function='max')

# function to count the number of files within a where clause within an index
def get_file_count(where,indexroot,nthreads=nthreads):
    """Count the `vrpentries` rows of an index that satisfy a where clause.

    Parameters
    ----------
    where     : SQL condition text forwarded to gq.get_gufi_command.
    indexroot : root of the index to query.
    nthreads  : thread count forwarded to gq.get_gufi_command; defaults to the
                value of the module-level `nthreads` at definition time.

    Returns
    -------
    The result of gq.execute_command reducing the `count(*)` output with
    aggregate_function='sum' (run with Verbose=False).
    """
    command, cstr = gq.get_gufi_command(
        select='count(*)',
        tables='vrpentries',
        where=where,
        nthreads=nthreads,
        indexroot=indexroot,
    )
    return gq.execute_command(
        command=command,
        command_string=cstr,
        aggregate_function='sum',
        Verbose=False,
    )

# can use this interactively for debugging
#largest_file = largest_file_size(config.config['IndexRoot'])
#largest_file

In [17]:
def get_filesize_distro_as_dataframe(indexroot,Verbose=False):
    """Build a power-of-two histogram of the file sizes stored in one index.

    Bucket i counts the files with ``lower < size <= 2**i`` where ``lower`` is
    ``2**(i-1)``, except for bucket 0 whose lower bound is -1 so that it holds
    the files of size 0 and 1.  Buckets are produced up to and including the
    first power of two that is >= the largest file, so every file is counted.

    Parameters
    ----------
    indexroot : root of the index to query (passed to largest_file_size and
                get_file_count).
    Verbose   : when true, print the largest file size and one line per bucket.

    Returns
    -------
    (DataFrame, largest_file) : a pandas DataFrame with the columns
    'Upper_Bound' and 'Num_Files' (one row per bucket), and the value returned
    by largest_file_size for this index.
    """
    data = { 'Upper_Bound' : [], 'Num_Files' : []}
    largest_file = largest_file_size(indexroot)
    if Verbose:
        print(f"Largest file in {indexroot} is {largest_file}")
    # now let's build a histogram of all the file counts up to the largest file using powers of 2
    if not largest_file or largest_file <= 1:
        # Nothing larger than one byte (or no size at all): bucket 0 is enough,
        # and math.log2 would have raised on a size of 0.
        max_pow = 0
    else:
        # ceil(log2(n)) computed on integers, which avoids float rounding for
        # sizes close to a large power of two.
        max_pow = (int(math.ceil(largest_file)) - 1).bit_length()
    # The range is inclusive of max_pow: the bucket ending at 2**max_pow is the
    # one that contains the largest file, so it must not be skipped.
    for i in range(max_pow + 1):
        lower = 2**(i-1) if i > 0 else -1
        upper = 2**i
        where="size > %d and size <= %d" % (lower,upper)
        c = get_file_count(where,indexroot)
        data['Upper_Bound'].append(upper)
        data['Num_Files'].append(c)
        if Verbose:
            print("Bucket %d:%d has %d files (Bucket %d / %d)" % (lower,upper, c, i, max_pow))
    return (pd.DataFrame(data), largest_file)

In [32]:
def add_dataframe(dataframes, indexroot, Verbose=True):
    """Collect the file-size distribution of `indexroot` and store it in `dataframes`.

    The result is stored under the key `indexroot` as a dict with the entries
    'DataFrame' and 'LargestFile'.  An index that already has an entry is left
    untouched and a message is printed instead, so existing data is never
    recomputed or overwritten.
    """
    if indexroot not in dataframes:
        distro, biggest = get_filesize_distro_as_dataframe(indexroot=indexroot, Verbose=Verbose)
        dataframes[indexroot] = {'DataFrame': distro, 'LargestFile': biggest}
        return
    print(f"Cowardly refusing to overwrite an existing entry for {indexroot}")

In [30]:
#(df,largest_file) = get_filesize_distro_as_dataframe(indexroot=config.config['IndexRoot'],Verbose=True)
# build a dataframes structure to hold the data
# Create the dict only when it does not exist yet: a fresh kernel gets an empty
# structure (instead of a NameError in the cells that use it), while re-running
# this cell keeps the distributions that were already collected.
if 'dataframes' not in globals():
    dataframes = {}

In [42]:
# Collect the distribution of every index of interest; roots that already have
# an entry are skipped by add_dataframe.
#add_dataframe(dataframes, '/mnt/nvme3n1/jbent/jbent_home/', Verbose=True)
for _root in (
    '/mnt/nvme1n1/jbent/scr4/',
    '/mnt/nvme1n1/jbent/yellprojs/',
    '/mnt/nvme1n1/jbent/ttscratch/',
    '/mnt/nvme3n1/jbent/yellusers',
    '/mnt/nvme1n1/jbent/anony',
):
    add_dataframe(dataframes, _root, Verbose=True)

Cowardly refusing to overwrite an existing entry for /mnt/nvme1n1/jbent/scr4/
Cowardly refusing to overwrite an existing entry for /mnt/nvme1n1/jbent/yellprojs/
Cowardly refusing to overwrite an existing entry for /mnt/nvme1n1/jbent/ttscratch/
Largest file in /mnt/nvme3n1/jbent/yellusers is 17846906880
Bucket -1:1 has 503253 files (Bucket 0 / 35)
Bucket 1:2 has 18430 files (Bucket 1 / 35)
Bucket 2:4 has 13406 files (Bucket 2 / 35)
Bucket 4:8 has 130293 files (Bucket 3 / 35)
Bucket 8:16 has 138170 files (Bucket 4 / 35)
Bucket 16:32 has 344539 files (Bucket 5 / 35)
Bucket 32:64 has 443898 files (Bucket 6 / 35)
Bucket 64:128 has 390266 files (Bucket 7 / 35)
Bucket 128:256 has 681071 files (Bucket 8 / 35)
Bucket 256:512 has 994173 files (Bucket 9 / 35)
Bucket 512:1024 has 1160034 files (Bucket 10 / 35)
Bucket 1024:2048 has 1506747 files (Bucket 11 / 35)
Bucket 2048:4096 has 1716278 files (Bucket 12 / 35)
Bucket 4096:8192 has 1584484 files (Bucket 13 / 35)
Bucket 8192:16384 has 1193313 file

In [39]:
# show which index roots currently have an entry in the dataframes dict
dataframes.keys()

dict_keys(['/mnt/nvme1n1/jbent/anony', '/mnt/nvme1n1/jbent/scr4/', '/mnt/nvme1n1/jbent/yellprojs/', '/mnt/nvme1n1/jbent/ttscratch/'])

In [38]:
# Drop the jbent_home entry when it is present.  The membership test keeps this
# cell re-runnable: a bare del raises KeyError when the entry was never collected
# or has already been removed.
if '/mnt/nvme3n1/jbent/jbent_home/' in dataframes:
    del dataframes['/mnt/nvme3n1/jbent/jbent_home/']

In [7]:
# a function to make nice tic marks on the graph
def generate_custom_ticks_labels(max_size):
    """Return power-of-two tick positions and matching human-readable labels.

    Ticks are 1, 2, 4, ... up to the largest power of two that is <= max_size.
    Each label is the tick divided by the most recent unit boundary reached
    (1024 -> K, 1024**2 -> M, 1024**3 -> G, 1024**4 -> T, 1024**5 -> P),
    followed by that unit letter; ticks below 1024 are labelled in "B".

    Returns a tuple (ticks, labels) of two equally long lists; both are empty
    when max_size is smaller than 1.
    """
    # size at which each unit letter takes over
    unit_starts = {
        2**10: "K",
        2**20: "M",
        2**30: "G",
        2**40: "T",
        2**50: "P",
    }
    ticks, labels = [], []
    divisor, suffix = 1, "B"
    size = 1
    while size <= max_size:
        if size in unit_starts:
            # crossing a unit boundary: from here on express sizes in the new unit
            divisor, suffix = size, unit_starts[size]
        ticks.append(size)
        labels.append(f"{size//divisor}{suffix}")
        size *= 2
    return ticks, labels

In [4]:
# The dataframes dict holds one file size distribution per file system.  To plot
# them on a shared x-axis we need the axis maximum, which is the size of the
# largest file stored in any of those file systems.
largest_file = max(entry['LargestFile'] for entry in dataframes.values())
print(largest_file)

NameError: name 'dataframes' is not defined

In [3]:
plt.figure(figsize=(15, 8))

# tick positions/labels covering every power of two up to the largest file
custom_ticks, custom_labels = generate_custom_ticks_labels(largest_file)

# Plot the data as a line graph, one line per index root
markers = ['o', 'x', 's', 'D', '^', 'v']
for idx, (indexroot, values) in enumerate(dataframes.items()):
    df = values['DataFrame']
    # legend label: last path component of the index root, ignoring a trailing slash
    shortname = indexroot.rstrip('/').split("/")[-1]
    # wrap around the marker list so that more file systems than markers can be
    # plotted without an IndexError
    plt.plot(df['Upper_Bound'], df['Num_Files'], marker=markers[idx % len(markers)], label=shortname)

plt.xscale('log', base=2)

# y-tics are ugly, change them to be in values of millions
# Get the current axis
ax = plt.gca()

def millions(x, pos):
    """Tick formatter: express the tick value `x` in millions.

    `pos` is the tick position supplied by matplotlib's FuncFormatter and is
    not used.  The value is formatted with the general ('g') presentation so
    that fractional ticks keep their fraction (500000 -> '0.5') instead of
    being truncated to a repeated '0' or '1', while whole millions are still
    printed without a decimal point (2000000 -> '2').
    """
    return f'{x/1e6:g}'

# Install the millions formatter on the y axis
formatter = FuncFormatter(millions)
ax.yaxis.set_major_formatter(formatter)

# Axis labels and title, set through the axes object
ax.set_xlabel('Upper Bound of Bucket Size (Bytes)')
ax.set_ylabel('Millions of Files')
ax.set_title('Distribution of File Sizes')

# Legend, grid and the power-of-two tick labels
ax.legend()
ax.grid(True)
plt.xticks(custom_ticks, custom_labels, rotation=45)

# Render the figure
plt.show()


NameError: name 'generate_custom_ticks_labels' is not defined

<Figure size 1080x576 with 0 Axes>