In [None]:
from copy import deepcopy
import os
from os.path import join, getsize

import numpy as np
import pandas as pd
import plotly.graph_objects as go


class Directory:
    """
    Class to analyse the structure of a specified directory.

    The tree below ``path`` is walked once when the object is created.
    The size of every readable file is recorded in ``self.files`` and the
    size contributed to each ancestor directory in ``self.directories``.
    The remaining methods turn those records into pandas Data Frames and
    plotly figures.

    Paths are split on '/', so POSIX-style separators are assumed.

    Arguments
        path - Root directory to analyse.
        levels - Maximum depth below ``path`` at which a directory gets
            its own entry in the treemap data. Files that sit deeper
            still count towards their ancestors within that depth.
    """
    def __init__(self, path, levels):
        self.path = path
        self.levels = levels
        self.set_template()
        self.walk_directory()

    def set_template(self):
        """Define the plotly layout template used by the file type bar
        chart (``self.bar_template``) and the display config passed to
        ``fig.show`` by both plots (``self.config``).
        """
        self.bar_template = dict(
            layout=go.Layout(
                title=dict(
                    x=0,
                    xref='paper',
                ),
                xaxis=dict(
                    showline=True,
                    linewidth=1.5,
                    linecolor='black',
                    gridwidth=1,
                    gridcolor='whitesmoke'
                ),
                yaxis=dict(
                    showline=True,
                    linewidth=1.5,
                    linecolor='black',
                    gridwidth=1,
                    gridcolor='whitesmoke'
                ),
                plot_bgcolor='white',
                paper_bgcolor='white',
            )
        )
        self.config = {'displayModeBar': False}

    def format_size(self, folder_bytes):
        """Return the size of the folder, given in bytes, in a readable 
        format with an appropriate label for the folder size 
        (Bytes, KB, MB or GB).

        Decimal units are used (1 KB = 1000 Bytes) and values are rounded
        to two decimal places.
        """
        quotient = folder_bytes // 1000

        if quotient < 1:
            return str(folder_bytes) + ' Bytes'
        elif quotient < 10**3:
            return str(round(folder_bytes / 10**3, 2)) + ' KB'
        elif quotient < 10**6:
            return str(round(folder_bytes / 10**6, 2)) + ' MB'
        else:
            return str(round(folder_bytes / 10**9, 2)) + ' GB'

    def _root(self):
        """Return ``self.path`` without trailing '/' characters.

        A path consisting only of '/' is returned unchanged.
        """
        return self.path.rstrip('/') or self.path

    def _is_hidden(self, filepath):
        """Return True if any component of ``filepath`` below the analysed
        root starts with a dot.
        """
        root = self._root()
        # Only look below the root: a hidden folder above it must not
        # cause every file to be treated as hidden.
        relative = filepath[len(root):] if filepath.startswith(root) else filepath
        return '/.' in '/' + relative.lstrip('/')

    def _files_frame(self):
        """Return a Data Frame with one row per recorded file and the
        columns filepath, size, filename and filetype.

        The file type is the text after the last '.' in the file name, or
        'Unknown' when the name contains no '.'.
        """
        df = pd.DataFrame(list(self.files.values()), columns=['filepath', 'size'])
        df['filename'] = [i.rsplit('/', 1)[-1] for i in df['filepath']]
        df['filetype'] = [i.rsplit('.', 1)[1] if '.' in i else 'Unknown'
                          for i in df['filename']]
        return df

    def walk_directory(self):
        """Walk the tree below ``self.path`` and record file sizes.

        Populates two dictionaries keyed by a running integer:
            self.files - {'filepath', 'size'} for every file whose size
                could be read.
            self.directories - {'dirpath', 'size'}, one record per file
                and ancestor directory, for ancestors at most
                ``self.levels`` below the root.

        Files whose size cannot be read (e.g. broken links or missing
        permissions) are skipped entirely.
        """
        # Number of levels in the directory path. Trailing separators are
        # ignored so that 'dir/' and 'dir' are analysed identically.
        path_levels = self._root().count('/')

        self.files = {}
        self.directories = {}
        file_count = 0
        directory_count = 0

        for root, _dirs, files in os.walk(self.path):
            for file in files:
                filepath = join(root, file)
                try:
                    size = os.path.getsize(filepath)
                except OSError:
                    # Skip the file completely, otherwise its ancestors
                    # would be credited with an undefined or stale size.
                    continue

                self.files[file_count] = {'filepath': filepath, 'size': size}
                file_count += 1

                # Number of path components between the root and the file,
                # the file name included.
                depth = filepath.count('/') - path_levels
                start = 1 if depth <= self.levels else depth - self.levels
                # Stripping i trailing components yields one ancestor per
                # step, ending at the root itself when i == depth.
                for i in range(start, depth + 1):
                    self.directories[directory_count] = {
                        'dirpath': filepath.rsplit('/', i)[0], 
                        'size': size
                    }
                    directory_count += 1

    def prepare_treemap(self):
        """Prepare Data Frame to plot treemap.

        Replaces ``self.directories`` with a Data Frame holding one row
        per directory and the columns dirpath, size (total bytes),
        parents and labels. Calling the method again is a no-op, so the
        treemap can be plotted more than once.
        """
        if isinstance(self.directories, pd.DataFrame):
            return

        if not self.directories:
            # Nothing was recorded: keep the expected columns available.
            self.directories = pd.DataFrame(
                columns=['dirpath', 'size', 'parents', 'labels'])
            return

        df = (pd.DataFrame
              .from_dict(self.directories, orient='index')
              .groupby('dirpath', as_index=False)['size']
              .sum()
             )

        df['parents'] = [i.rsplit('/', 1)[0] for i in df['dirpath']]
        # [-1] rather than [1]: a relative root without '/' has one part.
        df['labels'] = [i.rsplit('/', 1)[-1] for i in df['dirpath']]

        # Set the top parent to empty.
        df.loc[df['dirpath'] == self._root(), 'parents'] = ""
        self.directories = df

    def prepare_file_types(self):
        """Prepare Data Frame to plot file type barchart.

        Returns the 15 file types with the largest total size, largest
        first, with the columns filetype, size and description (the size
        in readable form).
        """
        df = self._files_frame()
        if df.empty:
            return pd.DataFrame(columns=['filetype', 'size', 'description'])

        df = df.groupby('filetype', as_index=False)['size'].sum()
        df['description'] = [self.format_size(i) for i in df['size']]
        df = df.sort_values('size', ascending=False)[:15]
        return df

    def plot_file_types(self):
        """Show a bar chart of the total file size (MB) by file type."""
        df = self.prepare_file_types()

        fig = go.Figure()
        fig.add_trace(
            go.Bar(
                x=list(df['filetype']),
                y=list(df['size'] / 1000000),
                marker=dict(
                    color='rgb(100,100,200)',
                    line=dict(
                        color='white',
                        width=2,
                    )
                ),
                text=list(df['description']),
                hovertemplate=
                '<extra></extra>'+
                '<b>%{x}</b><br>'+
                '%{text}'
            )
        )
        fig.update_layout(
            title="Total Size of Files (MB) by File Type<br>" + self.path,
            yaxis_title="Size (MB)",
            template=self.bar_template,
        )
        fig.show(config=self.config)

    def plot_treemap(self):
        """Show a treemap of the directory sizes below ``self.path``."""
        self.prepare_treemap()
        df = self.directories
        text = [self.format_size(i) for i in df['size']]

        colors = ['steelblue', 'skyblue', 'orange', 'purple', 'darkred', 
                  'slateblue', 'olive']

        fig = go.Figure()
        fig.add_trace(
            go.Treemap(
                ids=df['dirpath'],
                labels=df['labels'],
                parents=df['parents'],
                values=df['size'],
                text=text,
                branchvalues="total",
                hovertemplate=
                '<extra></extra>'+
                '<b>%{label}</b><br>'+
                '%{text}'
            )
        )
        fig.update_layout(
            title=dict(
                text="Treemap Showing Folder Sizes in " + self.path,
                x=0.005
            ),
            treemapcolorway=colors,
            width=1200,
            height=700,
            margin=dict(t=40, r=0, b=0, l=0),
        )
        fig.show(config=self.config)

    def find_duplicates(self):
        """Group the non-hidden files by file name.

        Returns a Data Frame with one row per file name and the columns
        filename, count (number of files with that name), total_size
        (bytes) and paths (the file paths joined by ' \\n '), sorted by
        count, largest first. Files are matched on name only; their
        contents are not compared.
        """
        df = self._files_frame()

        # Do not include hidden files.
        visible = ~df['filepath'].map(self._is_hidden).astype(bool)
        df = df[visible]
        if df.empty:
            return pd.DataFrame(
                columns=['filename', 'count', 'total_size', 'paths'])

        grouped = df.groupby('filename', as_index=False).agg(
            count=('filepath', 'count'),
            total_size=('size', 'sum'),
            paths=('filepath', lambda paths: " \n ".join(paths)),
        )
        grouped = grouped.sort_values('count', ascending=False)
        return grouped

    def return_all_files(self, include_hidden=True):
        """Return a Data Frame listing every recorded file, largest first.

        The columns are file, size (bytes), formatted (readable size),
        filetype and filepath.

        Arguments
            include_hidden - When False, files with a dot-prefixed
                component below the analysed root are left out.
        """
        df = self._files_frame().rename(columns={'filename': 'file'})

        if not include_hidden:
            hidden = df['filepath'].map(self._is_hidden).astype(bool)
            df = df[~hidden].copy()

        df['formatted'] = [self.format_size(i) for i in df['size']]
        df = df[['file', 'size', 'formatted', 'filetype', 'filepath']]
        df = df.sort_values('size', ascending=False).reset_index(drop=True)
        return df

    

In [None]:
# Walk the tree below the given path once (done by the constructor); only
# directories up to 6 levels below it get their own treemap entry.
directory = Directory(path='/System/Library/AssetsV2', levels=6)

## Directory Treemap

In [None]:
# Render the treemap of directory sizes gathered above.
directory.plot_treemap()

## File Types

In [None]:
# Render the bar chart of total size (MB) for the 15 largest file types.
directory.plot_file_types()

## All Files

In [None]:
# Table of every recorded file, sorted by size (largest first); preview
# the first 20 rows.
df = directory.return_all_files()
df.head(20)