In [None]:
from copy import deepcopy
import os
from os.path import join, getsize

import numpy as np
import pandas as pd
import plotly.graph_objects as go


class Directory:
    """Class to analyse the structure of a directory.

    The directory tree is walked once when the object is created. The
    collected sizes can then be plotted as a treemap, plotted as a bar
    chart of file types, or returned as a data frame of all files.

    Attributes
        path - path of the directory to analyse
        levels - the number of subdirectory levels to break the
            directory down by. All files contained within a directory
            will be included, however files a greater number of
            subdirectories from the path than this level will be shown
            in the subdirectory at this level.
        files - dict mapping a running index to
            {'filepath': ..., 'size': ...} for every file whose size
            could be read.
        directories - dict mapping a running index to
            {'dirpath': ..., 'size': ...}, one record per file for each
            directory level the file is counted in. Replaced by the
            aggregated treemap data frame once prepare_treemap has run.
        skipped_files - paths of the files whose size could not be read.
        bar_template - layout template used by the bar chart.
        config - display config passed to plotly when showing or
            saving a figure.
    """
    def __init__(self, path, levels):
        self.path = path
        self.levels = levels
        self.set_template()
        self.walk_directory()

    def set_template(self):
        """Set the template for bar charts."""
        # Both axes share the same styling.
        axis = dict(
            showline=True,
            linewidth=1.5,
            linecolor='rgb(200,200,200)',
            gridwidth=1,
            gridcolor='whitesmoke',
        )
        self.bar_template = dict(
            layout=go.Layout(
                title=dict(
                    x=0,
                    xref='paper',
                ),
                xaxis=axis,
                yaxis=dict(axis),
                font=dict(color='rgb(220,220,220)'),
                plot_bgcolor='rgba(0,0,0,0)',
                paper_bgcolor='rgba(0,0,0,0)',
                margin=dict(t=100, r=20, b=20, l=50),
            )
        )
        self.config = {'displayModeBar': False}

    def format_size(self, folder_bytes):
        """Format bytes to be displayed with KB, MB, or GB

        Arguments
            folder_bytes - the folder size in bytes.

        Returns
            String with the size of the folder in bytes with an
            appropriate label for the folder size (Bytes, KB, MB or GB).
            Decimal units are used (1 KB = 1000 Bytes) and values are
            rounded to two decimal places.
        """
        if folder_bytes < 10**3:
            return str(folder_bytes) + ' Bytes'
        if folder_bytes < 10**6:
            return str(round(folder_bytes / 10**3, 2)) + ' KB'
        if folder_bytes < 10**9:
            return str(round(folder_bytes / 10**6, 2)) + ' MB'
        return str(round(folder_bytes / 10**9, 2)) + ' GB'

    @staticmethod
    def _file_type(filename):
        """Return the text after the last '.' of filename, or 'Unknown'
        when the name contains no '.'.
        """
        return filename.rsplit('.', 1)[1] if '.' in filename else 'Unknown'

    def _ancestor_directories(self, root, base):
        """Return the directories a file located in root is counted in.

        Arguments
            root - directory currently visited by os.walk.
            base - normalised path of the directory being analysed.

        Returns
            List of directory paths, deepest first, running from root
            (or its ancestor sitting self.levels below base when root is
            deeper than that) up to and including base.
        """
        relative = os.path.relpath(root, base)
        depth = 0 if relative == os.curdir else relative.count(os.sep) + 1

        # Directories deeper than the requested level are folded into
        # their ancestor at that level.
        current = root
        for _ in range(depth - self.levels):
            current = os.path.dirname(current)

        ancestors = []
        for _ in range(min(depth, self.levels) + 1):
            ancestors.append(current)
            current = os.path.dirname(current)
        return ancestors

    def walk_directory(self):
        """Walk the directory to find its files and subdirectories.

        Fills self.files with one record per readable file and
        self.directories with one record per file for every directory
        level (down to self.levels below the path) that contains it.
        Files whose size cannot be read are skipped and their paths are
        listed in self.skipped_files.

        The path is normalised with os.path.normpath first so that a
        trailing separator does not shift the level arithmetic.
        """
        base = os.path.normpath(self.path)

        self.files = {}
        self.directories = {}
        self.skipped_files = []
        # Keep a handle on the raw records: prepare_treemap rebinds
        # self.directories to a data frame.
        self._directory_records = self.directories
        file_count = 0
        directory_count = 0

        for root, _dirs, files in os.walk(base):
            # Identical for every file in this directory, so computed once.
            ancestors = self._ancestor_directories(root, base)

            for file in files:
                filepath = os.path.join(root, file)
                try:
                    size = os.path.getsize(filepath)
                except OSError:
                    # Unreadable or vanished file (e.g. a dangling
                    # symlink): leave it out of every total.
                    self.skipped_files.append(filepath)
                    continue

                self.files[file_count] = {'filepath': filepath, 'size': size}
                file_count += 1

                for dirpath in ancestors:
                    self.directories[directory_count] = {
                        'dirpath': dirpath,
                        'size': size
                    }
                    directory_count += 1

    def prepare_treemap(self):
        """Prepare data frame to plot treemap.

        Aggregates the per-file directory records into one row per
        directory and stores the result in self.directories with the
        columns dirpath, size, parents and labels. The analysed
        directory itself gets an empty parent. May be called repeatedly.
        """
        # After the first call self.directories is a data frame, so the
        # raw records are read from the private handle instead.
        if isinstance(self.directories, dict):
            self._directory_records = self.directories
        records = self._directory_records

        if not records:
            self.directories = pd.DataFrame(
                columns=['dirpath', 'size', 'parents', 'labels'])
            return

        df = (pd.DataFrame(list(records.values()),
                           columns=['dirpath', 'size'])
              .groupby('dirpath', as_index=False)['size']
              .sum()
             )

        base = os.path.normpath(self.path)
        # Set the top parent to empty.
        df['parents'] = ["" if i == base else os.path.dirname(i)
                         for i in df['dirpath']]
        # A path without a final component (the filesystem root) is
        # labelled with the path itself.
        df['labels'] = [os.path.basename(i) or i for i in df['dirpath']]

        self.directories = df

    def _files_frame(self):
        """Return a data frame of all files with the columns filepath,
        size, file (the file name) and filetype.
        """
        df = pd.DataFrame(list(self.files.values()),
                          columns=['filepath', 'size'])
        df['file'] = [os.path.basename(i) for i in df['filepath']]
        df['filetype'] = [self._file_type(i) for i in df['file']]
        return df

    def prepare_file_types(self, top_n=15):
        """Prepare data frame to plot file type barchart.

        Arguments
            top_n - the number of file types to keep.

        Returns
            Data frame with the columns filetype, size (total bytes) and
            description (formatted size), holding the top_n file types
            with the largest total size, largest first.
        """
        files = self._files_frame()
        if files.empty:
            return pd.DataFrame(columns=['filetype', 'size', 'description'])

        df = files.groupby('filetype', as_index=False)['size'].sum()
        df['description'] = [self.format_size(i) for i in df['size']]
        df = df.sort_values('size', ascending=False)[:top_n]
        return df

    def plot_file_types(self, top_n=15):
        """Plot a bar chart of the total file size (MB) per file type.

        Arguments
            top_n - the number of file types to show.
        """
        df = self.prepare_file_types(top_n)

        fig = go.Figure()

        fig.add_trace(
            go.Bar(
                x=list(df['filetype']),
                y=list(df['size'] / 1000000),
                marker=dict(
                    color='rgba(50,100,200,0.8)',
                    line=dict(
                        color='white',
                        width=2,
                    )
                ),
                text=list(df['description']),
                textfont=dict(color='rgb(220,220,220)'),
                hovertemplate=
                '<extra></extra>'+
                '<b>%{x}</b><br>'+
                '%{text}'
            )
        )

        fig.update_layout(
            title=f"Total Size of Files (MB) by File Type<br>{self.path}",
            yaxis_title="Size (MB)",
            template=self.bar_template,
            hoverlabel=dict(
                bgcolor='rgb(25,25,25)',
                font=dict(color='rgb(220,220,220)'
                         )
            ),
        )

        fig.show(config=self.config)

    def plot_treemap(self, filename='treemap.html'):
        """Plot a treemap summarising the directory and save as HTML
        file.

        Arguments
            filename - path of the HTML file to write.
        """
        self.prepare_treemap()
        df = self.directories
        text = [self.format_size(i) for i in df['size']]

        colors = ['#003f5c', '#2f4b7c', '#665191', '#a05195', '#d45087',
                  '#f95d6a', '#ff7c43', '#ffa600',]

        fig = go.Figure()

        fig.add_trace(
            go.Treemap(
                ids=df['dirpath'],
                labels=df['labels'],
                parents=df['parents'],
                values=df['size'],
                text=text,
                branchvalues="total",
                hovertemplate=
                '<extra></extra>'+
                '<b>%{label}</b><br>'+
                '%{text}'
            )
        )

        fig.update_layout(
            title=dict(
                text=f"Treemap Showing Folder Sizes in {self.path}",
                x=0.005
            ),
            treemapcolorway=colors,
            width=1200,
            height=700,
            margin=dict(t=40, r=0, b=0, l=0),
        )

        fig.write_html(filename, config=self.config)

    def return_all_files(self):
        """Return a data frame containing all files in the directory.

        Returns
            Data frame with the columns file, size, formatted, filetype
            and filepath, sorted by size with the largest file first.
        """
        df = self._files_frame()
        df['formatted'] = [self.format_size(i) for i in df['size']]
        df = df[['file', 'size', 'formatted', 'filetype', 'filepath']]
        df = df.sort_values('size', ascending=False).reset_index(drop=True)
        return df

    

In [None]:
# Walk the directory on construction, breaking it down by up to 10
# subdirectory levels.
directory = Directory(path='/Users/xx/Github', levels=10)

## Treemap

In [None]:
# Build the treemap of folder sizes and save it to treemap.html.
directory.plot_treemap()

## File Types

In [None]:
# Show a bar chart of total size (MB) for the 15 largest file types.
directory.plot_file_types()

## All Files

In [None]:
# Data frame of every file found, sorted by size with the largest first.
df = directory.return_all_files()