# file server age report

This Jupyter notebook takes CSV data describing the files on a network share and generates graphs based on file size and age.


# 1 - import required libraries

In [None]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt      #for creating graphs and charts
import chardet                       #for file character set encoding detection (ascii, utf-16, utf-8, etc)
import os                            #for deleting temporary files
import shutil                        #for copying  temporary files
from IPython.display import display  #for opening *.png files
from PIL import Image                #for opening *.png files



# 2 - define CSV source data


In [None]:
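# Uncomment ONE of the lines below (or add your own) so that CSV_source_file is defined before running the remaining cells.
# NOTE: later cells read this file with open(), so a URL has to be downloaded to a local path first.
#CSV_source_file = 'https://raw.githubusercontent.com/nickjeffrey/GetFileDetails/main/filenames.csv'
#CSV_source_file = 'c:/temp/filenames.csv'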



In [None]:
# report which character set this Python session uses for its standard output
import sys
print("The default system character set is:", sys.stdout.encoding)

In [None]:
# Detect the character set encoding of the source file from a sample of its raw bytes
with open(CSV_source_file, "rb") as raw_file:
    result = chardet.detect(raw_file.read(100000))  # a 100000-byte sample is used for detection

# chardet reports "encoding": None when it cannot decide, so substitute an empty
# string before lowercasing instead of calling .lower() on None
source_encoding = (result.get("encoding") or "").lower()

# Debug output
if source_encoding:
    print(f"Source file is encoded as: {source_encoding}")
else:
    print("WARNING: Unable to detect source file encoding.")

# Temporary file that receives the UTF-8 copy before it replaces the original
CSV_source_file_temp = f"{CSV_source_file}.tmp"

# Later cells open the file as UTF-8, so anything else is converted in place
if source_encoding and source_encoding != "utf-8":
    print(f"Source file is encoded as {source_encoding}, attempting conversion to UTF-8")

    try:
        # Read file using detected encoding; errors="replace" substitutes a
        # placeholder for undecodable bytes instead of aborting the conversion
        print(f"Reading source file {CSV_source_file}")
        with open(CSV_source_file, "r", encoding=source_encoding, errors="replace") as infile:
            content = infile.read()

        # Write to a temporary UTF-8 encoded file
        print(f"Writing temporary file {CSV_source_file_temp} encoded as UTF-8")
        with open(CSV_source_file_temp, "w", encoding="utf-8") as outfile:
            outfile.write(content)

        print(f"File converted from {source_encoding} to UTF-8 successfully.")

        # Replace original file with the UTF-8 encoded temp file
        shutil.move(CSV_source_file_temp, CSV_source_file)
        print(f"Replaced original file {CSV_source_file} with UTF-8 encoded version.")

    except Exception as e:
        # best effort: report the problem and carry on with the unconverted file
        print(f"ERROR: Failed to convert file due to: {e}")

else:
    print("File is already UTF-8 encoded or encoding could not be detected.")


In [None]:
#import chardet

# Function to count lines in a CSV file
def count_lines(file_path):
    """Count the lines in a text file, decoding it with its detected encoding.

    The encoding is guessed by chardet from the first 100000 bytes. When chardet
    cannot decide (it reports None), UTF-8 is used instead. Undecodable bytes are
    replaced rather than raising, so the count is not interrupted by bad data.

    Args:
        file_path: path of the file to count.

    Returns:
        The number of lines, or 0 if the file is missing or cannot be read
        (the problem is reported with print()).
    """
    try:
        # Detect file encoding to handle non-UTF-8 cases
        with open(file_path, "rb") as raw_file:
            result = chardet.detect(raw_file.read(100000))  # Read a sample for detection

        # "or" covers both a missing key and an explicit None from chardet
        detected_encoding = result.get("encoding") or "utf-8"
        print(f"Detected encoding: {detected_encoding}")

        # Read file with detected encoding
        line_count = 0
        with open(file_path, 'r', encoding=detected_encoding, errors="replace") as file:
            for line_count, _line in enumerate(file, start=1):
                if line_count % 1000000 == 0:
                    print(f"Processed {line_count} lines")

        return line_count

    except FileNotFoundError:
        print(f"ERROR: File not found: {file_path}")
        return 0

    except Exception as e:
        print(f"ERROR: Unexpected issue while reading the file: {e}")
        return 0

# Count the lines in the source CSV file and report the total
file_path = CSV_source_file          # path of the source CSV file defined near the top of the notebook
num_lines = count_lines(file_path)   #call the function
print(f"Total number of lines in the file: {num_lines}")


In [None]:
# extract all the lines from the source file that contain at least one comma, save to new temporary file
# this is how we eliminate any bogus lines in the source file (ie blank lines, headers, etc)

def find_lines_with_comma(input_file_path, output_file_path):
    """Copy every line of a UTF-8 text file that contains a comma into a new file.

    The whole input is read first. Each kept line has its leading and trailing
    whitespace stripped and is written followed by a single newline. A missing
    file or a decoding failure is reported with print() rather than raised.

    Args:
        input_file_path: path of the UTF-8 encoded file to read.
        output_file_path: path of the UTF-8 file to create or overwrite.
    """
    try:
        with open(input_file_path, 'r', encoding='utf-8') as source:
            all_lines = list(source)

        kept = []
        for raw_line in all_lines:
            if ',' in raw_line:
                kept.append(raw_line.strip())

        with open(output_file_path, 'w', encoding='utf-8') as target:
            target.writelines(f"{text}\n" for text in kept)

        print(f"Matching lines have been written to a working copy of the file at '{output_file_path}'.")

    except FileNotFoundError:
        print(f"File not found: {input_file_path}")
    except UnicodeDecodeError:
        # tell the user how to strip the file down from a Windows command prompt
        for message in (
            f"Unable to decode the file with 'utf-8' encoding: {input_file_path}",
            "The source CSV file may have a weird character set encoding",
            "Try to fix with these commands from the Windows command prompt",
            "type sourcefilename.csv | findstr , > sourcefilename.csv.tmp",
            "copy sourcefilename.csv.tmp sourcefilename.csv",
        ):
            print(message)


# Example usage:
# Filter the source file into a working copy stored next to it with a ".tmp" suffix
input_file_path = CSV_source_file
output_file_path = input_file_path + ".tmp"
find_lines_with_comma(input_file_path, output_file_path)

# All subsequent analysis is performed on this cleaned-up working copy
CSV_working_file = output_file_path

In [None]:
# BUG ALERT: some weird filenames that contain oddball characters like
#    embedded quotation marks or multiple commas in the filenames may
#    not be correctly detected by the pd.read_csv directive.
#    Figure out which files were not matched, and come up with an "exceptions list" for further investigation

In [None]:
# figure out the character set encoding of the source file


import chardet
from chardet.universaldetector import UniversalDetector

# Feed the file to chardet's incremental detector line by line until it reports
# that it is done. The file is opened in a with-block so the handle is closed
# even when the loop stops early.
detector = UniversalDetector()
with open(CSV_source_file, 'rb') as raw_file:
    for line in raw_file:
        detector.feed(line)
        if detector.done:
            break
detector.close()
print(detector.result)

# 2 - Load dataset into a Pandas DataFrame

In [None]:
# load the temporary working copy of the file 
# on_bad_lines='skip' makes pandas drop rows it cannot parse instead of raising,
# so rows may be missing from df without any error (see the BUG ALERT cell above)
df = pd.read_csv(CSV_working_file, on_bad_lines='skip',skip_blank_lines=True)



In [None]:
# look at the top few rows of the data to confirm the labels are correct
df.head()

In [None]:
# look at the bottom few rows of the data
df.tail()

In [None]:
# show number of rows in dataset
print ("Rows in dataset:", len(df))

In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
# check to see if there are any missing values from the dataset

# all of the results should be zero, which would indicate there are not any null values in the dataset
# if there are any results greater than zero, it would indicate that some pieces of data are missing and should be cleaned up.
df.isnull().sum()

In [None]:
# Find rows with null values
# collect every row that has a missing value in at least one column
rows_with_nulls = df.loc[df.isna().any(axis="columns")]

if len(rows_with_nulls) > 0:
    print("WARNING: Found ", len(rows_with_nulls), " rows with null values")



In [None]:
# Show the offending rows, followed by a one-line summary giving how many there are
if len(rows_with_nulls) > 0:
    print(rows_with_nulls)
    # interpolate the row COUNT; the DataFrame itself was already printed above
    print(f"WARNING: {len(rows_with_nulls)} rows containing null values were found, indicating problems with the source file.  Please investigate.")
else:
    print("Looking good, there are no rows with null values, indicating a nice and clean input file.")

In [None]:
# drop any rows containing null (missing) values, modifying df in place
if len(rows_with_nulls) > 0:
    print("Dropping ", len(rows_with_nulls), " rows with null values")
    df.dropna(inplace=True)

In [None]:
# visualize any missing values from the dataset in a histogram
# you want all the bars in the graph to be empty, which would indicate zero missing values

df.isnull().sum().plot.bar()
plt.show()

In [None]:
# another method to visualize missing values from dataset

print ("Checking for missing values in data set")

import matplotlib.pyplot as plt
def plot_nas(df: pd.DataFrame):
    """Plot the percentage of missing values per column as a horizontal bar chart.

    Only columns that have at least one missing value are drawn, sorted by
    percentage in descending order. When the frame has no missing values at
    all, 'No NAs found' is printed and nothing is plotted.
    """
    null_counts = df.isnull().sum()
    if null_counts.sum() == 0:
        print('No NAs found')
        return

    # share of rows with a missing value, per column, as a percentage
    missing_pct = (null_counts / len(df)) * 100
    complete_columns = missing_pct[missing_pct == 0].index
    missing_pct = missing_pct.drop(complete_columns).sort_values(ascending=False)

    pd.DataFrame({'Missing Ratio %': missing_pct}).plot(kind="barh")
    plt.show()
plot_nas(df)


In [None]:
#show the names of the columns (also called feature names)
df.columns

In [None]:
#show summary info about dataset
df.info()

In [None]:
# show data types
df.dtypes

## 2.1 - rename features

In [None]:
# rename some features that have incorrect names
# (rename() leaves the frame untouched for any old name that is not present)
df.rename(
    columns={
        'DaysSinceCreation':     'CreationTimeDays',
        'DaysSinceAccess':       'AccessTimeDays',
        'DaysSinceModification': 'ModificationTimeDays',
        'LastWriteTimeEpoch':    'ModificationTimeEpoch',
    },
    inplace=True,
)

In [None]:
# look at the top few rows of the data to confirm the labels are correct
df.head()

## 2.2 - dimensionality reduction by removing features

In this example, we are dropping some of the columns from the dataset that are not useful.
Please note that this is more of a "data science" exercise than a machine learning exercise, so it isn't that these features have no predictive value for a ML algorithm.


In [None]:
# drop the columns that are not needed for the file age report
# errors="ignore" skips any listed column that this particular CSV does not contain
df.drop(
    columns=[
        # Bytes, MegaBytes and GigaBytes hold the same information, so only Bytes is kept
        'MegaBytes',
        'GigaBytes',
        # only ModificationTimeDays and AccessTimeDays are of interest, so drop the other timestamps
        'CreationTimeEpoch',
        'CreationTimeDays',
        'AccessTimeEpoch',
        'ModificationTimeEpoch',
        # the MD5 checksum of each file plays no part in a file age report
        'MD5sum',
    ],
    errors="ignore",
    inplace=True,
)



In [None]:
#Look at the dataset again, you should see several columns have been dropped
df.info()

In [None]:
# Convert the 'Bytes' and 'ModificationTimeDays' columns to integers
# Note that we set bytes to a 64-bit integer to avoid overflow
# import numpy as np

df['Bytes'] = df['Bytes'].astype(np.int64)
# NOTE(review): plain int is presumably the platform default integer width (the next cell accepts int32 or int64) - confirm
df['ModificationTimeDays'] = df['ModificationTimeDays'].astype(int)

In [None]:
# the Bytes and ModificationTimeDays should be of type int32 or int64
df.info()

In [None]:
# At this point, we have 4 columns: Filename, Bytes, AccessTimeDays, ModificationTimeDays
# look at the top few rows of the data to confirm the labels are correct
df.head()

In [None]:
# drop rows to make the dataset a bit faster during testing
# Let's say we want to drop all rows between indices 10000 and 999999 inclusive.
# We can do this with the drop() function and the range() function to generate the indices:
# df.drop(range(10000, 1000000), inplace=True)

In [None]:
#view dimensions of dataset (rows and columns)
print ("Rows,columns in dataset:", df.shape)

In [None]:
# At this point, we have 4 columns: Filename, Bytes, AccessTimeDays, ModificationTimeDays
# look at the top few rows of the data to confirm the labels are correct
df.head()

In [None]:
# At this point, we have 4 columns: Filename, Bytes, AccessTimeDays, ModificationTimeDays
# look at the bottom few rows of the data to confirm the labels are correct
df.tail()

## 2.3 - Cleanup temporary files

In [None]:
# Delete temporary working copy of the CSV file
# (its contents were already loaded into the df DataFrame by an earlier cell)
import os
from pathlib import Path
# only attempt the removal when the working copy actually exists
if os.path.isfile(CSV_working_file):
    os.remove(CSV_working_file)
    # note: this message is printed after the file has been removed
    print ("Deleting temporary working copy ",CSV_working_file)

# 3 - Categorize files by Last Modification Time

In [None]:
# Split the files into age categories based on ModificationTimeDays.
# Every category includes its lower bound and excludes its upper bound.
def _modified_between(min_days, max_days):
    """Return the rows of df with min_days <= ModificationTimeDays < max_days."""
    age_in_days = df['ModificationTimeDays']
    return df[(age_in_days >= min_days) & (age_in_days < max_days)]


# 0 to 90 days
LastModified_age0to90days = _modified_between(0, 90)
print("Number of files with last modification date   0 to  90  days: ", len(LastModified_age0to90days))

# 90 to 180 days
LastModified_age90to180days = _modified_between(90, 180)
print("Number of files with last modification date  90 to 180  days: ", len(LastModified_age90to180days))

# 180 to 365 days
LastModified_age180to365days = _modified_between(180, 365)
print("Number of files with last modification date 180 to 365  days: ", len(LastModified_age180to365days))

# 1 to 2 years
LastModified_age1to2years = _modified_between(365*1, 365*2)
print("Number of files with last modification date   1 to   2 years: ", len(LastModified_age1to2years))

# 2 to 3 years
LastModified_age2to3years = _modified_between(365*2, 365*3)
print("Number of files with last modification date   2 to   3 years: ", len(LastModified_age2to3years))

# 3 to 5 years
LastModified_age3to5years = _modified_between(365*3, 365*5)
print("Number of files with last modification date   3 to   5 years: ", len(LastModified_age3to5years))

# 5 to 99 years
LastModified_age5to99years = _modified_between(365*5, 365*99)
print("Number of files with last modification date   5 to  99 years: ", len(LastModified_age5to99years))



## 3.1 - Create graphs for file counts

In [None]:
# create a bar chart 

# function to add value labels
def addlabels(x,y):
    """Write each y value as horizontally centred text at (its index, its value) on the plot."""
    for position, _category in enumerate(x):
        value = y[position]
        plt.text(position, value, value, ha = 'center')

if __name__ == '__main__':

    # bar categories (x) and the number of files that fell into each one (y)
    x = ("0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years")
    y = [
        len(bucket)
        for bucket in (
            LastModified_age0to90days,
            LastModified_age90to180days,
            LastModified_age180to365days,
            LastModified_age1to2years,
            LastModified_age2to3years,
            LastModified_age3to5years,
            LastModified_age5to99years,
        )
    ]

    # 10x5 figure holding the bar chart, with each count written at the top of its bar
    plt.figure(figsize=(10, 5))
    plt.bar(x, y)
    addlabels(x, y)

    # title and axis captions
    plt.title("Filename counts by Last Modification Date")
    plt.xlabel("Last Modification Date")
    plt.ylabel("Number of files")

    # Save the graph as a PNG file for future reference, named after the source file:
    # <source name without .csv>_<chart type>_<FileCount|ByteCount>_<date kind>.png
    graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
    graph_filename = f"{graph_filename}_BarChart_FileCount_LastModificationDate.png"
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")

    # visualizing the plot
    plt.show()

In [None]:
# create a pie chart showing percentages

#import matplotlib.pyplot as plt

# Data for the pie chart
labels = ["0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years"]
sizes = [len(LastModified_age0to90days), len(LastModified_age90to180days), len(LastModified_age180to365days), len(LastModified_age1to2years), len(LastModified_age2to3years), len(LastModified_age3to5years), len(LastModified_age5to99years)]

# The chart gets cluttered and hard to read if there are labels on very small pie slices,
# so only slices larger than 5% get a label.
def label_format(label, pct):
    """Return the slice caption: label, a space, a line break and pct to one decimal place
    followed by a percent sign when pct is greater than 5; otherwise an empty string."""
    if pct > 5:
        return f"{label} \n{pct:.1f}%"
    return ""

# Captions drawn on the slices: label plus percentage for slices >5%, empty otherwise
# (the division raises ZeroDivisionError when every category is empty)
total_size = sum(sizes)
final_labels = [label_format(name, count / total_size * 100) for name, count in zip(labels, sizes)]

# 8x6 figure holding the pie chart; labeldistance=0.5 moves the captions inside the slices
plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=final_labels, labeldistance=0.5)

# Set title
plt.title("Percentage Filename Counts by Last Modification Date")

# The legend lists every category with its percentage, including the unlabeled small slices
legend_labels = [f"{name} ({count / total_size * 100:.1f}%)" for name, count in zip(labels, sizes)]
plt.legend(legend_labels, loc="best", bbox_to_anchor=(1, 0.5), title="")

# Save the graph as a PNG file for future reference, named after the source file:
# <source name without .csv>_<chart type>_<FileCount|ByteCount>_<date kind>.png
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename = f"{graph_filename}_PieChart_FileCount_LastModificationDate.png"
plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
print(f"Graph saved as {graph_filename}")

# Show the chart
plt.show()


## 3.2 - Create graphs for byte counts
(this section is time consuming if you have a lot of rows in the dataset)

In [None]:
## figure out the total number of bytes for each file category

# If df.Bytes is int32 or uint32, large totals may overflow. Convert it to int64:
df["Bytes"] = df["Bytes"].astype("int64")

print("Starting processing of", len(df), "files")


def _bytes_modified_between(min_days, max_days):
    """Return the total Bytes (as np.int64) of files with min_days <= ModificationTimeDays < max_days.

    The matching rows are selected with a boolean mask and summed in a single
    vectorized operation, instead of visiting the DataFrame row by row in a
    Python loop, which is very slow for files with millions of rows.
    """
    age_in_days = df["ModificationTimeDays"]
    in_range = (age_in_days >= min_days) & (age_in_days < max_days)
    return np.int64(df.loc[in_range, "Bytes"].sum())


# Every category includes its lower bound and excludes its upper bound
LastModified_bytes0to90days    = _bytes_modified_between(0,     90)
LastModified_bytes90to180days  = _bytes_modified_between(90,    180)
LastModified_bytes180to365days = _bytes_modified_between(180,   365)
LastModified_bytes1to2years    = _bytes_modified_between(365*1, 365*2)
LastModified_bytes2to3years    = _bytes_modified_between(365*2, 365*3)
LastModified_bytes3to5years    = _bytes_modified_between(365*3, 365*5)
LastModified_bytes5to99years   = _bytes_modified_between(365*5, 365*99)

print("Processed", len(df), "of", len(df), "files")

In [None]:
print("Bytes with last modification date   0 to  90  days: ", LastModified_bytes0to90days    )
print("Bytes with last modification date  90 to 180  days: ", LastModified_bytes90to180days  )
print("Bytes with last modification date 180 to 365  days: ", LastModified_bytes180to365days )
print("Bytes with last modification date   1 to   2 years: ", LastModified_bytes1to2years    )
print("Bytes with last modification date   2 to   3 years: ", LastModified_bytes2to3years    )
print("Bytes with last modification date   3 to   5 years: ", LastModified_bytes3to5years    )
print("Bytes with last modification date   5 to  99 years: ", LastModified_bytes5to99years   )


In [None]:
# confirm the datatype is int64 (avoids overflow for very large numbers)
print(f"Data type for df.Bytes is:", df.Bytes.dtype)

In [None]:
# Convert the byte totals of every age category into larger, more human-readable
# units for graphing. Each unit is 1024 times the previous one and the fractional
# part is discarded by int() (whole units are close enough for a graph).
def _whole_units(byte_total, power):
    """Return byte_total divided by 1024**power, with the fractional part discarded."""
    return int(byte_total / 1024 ** power)


# captions of the seven age categories, in the order the totals are printed
_AGE_CAPTIONS = (
    "  0 to  90  days",
    " 90 to 180  days",
    "180 to 365  days",
    "  1 to   2 years",
    "  2 to   3 years",
    "  3 to   5 years",
    "  5 to  99 years",
)
_SEPARATOR = "-----------------------------------------------------------------"


def _print_unit_totals(unit_name, totals):
    """Print one line per age category giving its total expressed in unit_name."""
    for caption, total in zip(_AGE_CAPTIONS, totals):
        print(f"{unit_name} with last modification date {caption}: ", total)


# kilobytes (bytes / 1024)
LastModified_Kbytes0to90days    = _whole_units(LastModified_bytes0to90days,    1)
LastModified_Kbytes90to180days  = _whole_units(LastModified_bytes90to180days,  1)
LastModified_Kbytes180to365days = _whole_units(LastModified_bytes180to365days, 1)
LastModified_Kbytes1to2years    = _whole_units(LastModified_bytes1to2years,    1)
LastModified_Kbytes2to3years    = _whole_units(LastModified_bytes2to3years,    1)
LastModified_Kbytes3to5years    = _whole_units(LastModified_bytes3to5years,    1)
LastModified_Kbytes5to99years   = _whole_units(LastModified_bytes5to99years,   1)
_print_unit_totals("KiloBytes", (
    LastModified_Kbytes0to90days, LastModified_Kbytes90to180days, LastModified_Kbytes180to365days,
    LastModified_Kbytes1to2years, LastModified_Kbytes2to3years, LastModified_Kbytes3to5years,
    LastModified_Kbytes5to99years,
))
print(_SEPARATOR)

# megabytes (bytes / 1024^2)
LastModified_Mbytes0to90days    = _whole_units(LastModified_bytes0to90days,    2)
LastModified_Mbytes90to180days  = _whole_units(LastModified_bytes90to180days,  2)
LastModified_Mbytes180to365days = _whole_units(LastModified_bytes180to365days, 2)
LastModified_Mbytes1to2years    = _whole_units(LastModified_bytes1to2years,    2)
LastModified_Mbytes2to3years    = _whole_units(LastModified_bytes2to3years,    2)
LastModified_Mbytes3to5years    = _whole_units(LastModified_bytes3to5years,    2)
LastModified_Mbytes5to99years   = _whole_units(LastModified_bytes5to99years,   2)
_print_unit_totals("MegaBytes", (
    LastModified_Mbytes0to90days, LastModified_Mbytes90to180days, LastModified_Mbytes180to365days,
    LastModified_Mbytes1to2years, LastModified_Mbytes2to3years, LastModified_Mbytes3to5years,
    LastModified_Mbytes5to99years,
))
print(_SEPARATOR)

# gigabytes (bytes / 1024^3)
LastModified_Gbytes0to90days    = _whole_units(LastModified_bytes0to90days,    3)
LastModified_Gbytes90to180days  = _whole_units(LastModified_bytes90to180days,  3)
LastModified_Gbytes180to365days = _whole_units(LastModified_bytes180to365days, 3)
LastModified_Gbytes1to2years    = _whole_units(LastModified_bytes1to2years,    3)
LastModified_Gbytes2to3years    = _whole_units(LastModified_bytes2to3years,    3)
LastModified_Gbytes3to5years    = _whole_units(LastModified_bytes3to5years,    3)
LastModified_Gbytes5to99years   = _whole_units(LastModified_bytes5to99years,   3)
_print_unit_totals("GigaBytes", (
    LastModified_Gbytes0to90days, LastModified_Gbytes90to180days, LastModified_Gbytes180to365days,
    LastModified_Gbytes1to2years, LastModified_Gbytes2to3years, LastModified_Gbytes3to5years,
    LastModified_Gbytes5to99years,
))
print(_SEPARATOR)

# terabytes (bytes / 1024^4)
LastModified_Tbytes0to90days    = _whole_units(LastModified_bytes0to90days,    4)
LastModified_Tbytes90to180days  = _whole_units(LastModified_bytes90to180days,  4)
LastModified_Tbytes180to365days = _whole_units(LastModified_bytes180to365days, 4)
LastModified_Tbytes1to2years    = _whole_units(LastModified_bytes1to2years,    4)
LastModified_Tbytes2to3years    = _whole_units(LastModified_bytes2to3years,    4)
LastModified_Tbytes3to5years    = _whole_units(LastModified_bytes3to5years,    4)
LastModified_Tbytes5to99years   = _whole_units(LastModified_bytes5to99years,   4)
_print_unit_totals("TeraBytes", (
    LastModified_Tbytes0to90days, LastModified_Tbytes90to180days, LastModified_Tbytes180to365days,
    LastModified_Tbytes1to2years, LastModified_Tbytes2to3years, LastModified_Tbytes3to5years,
    LastModified_Tbytes5to99years,
))
print(_SEPARATOR)

# petabytes (bytes / 1024^5)
LastModified_Pbytes0to90days    = _whole_units(LastModified_bytes0to90days,    5)
LastModified_Pbytes90to180days  = _whole_units(LastModified_bytes90to180days,  5)
LastModified_Pbytes180to365days = _whole_units(LastModified_bytes180to365days, 5)
LastModified_Pbytes1to2years    = _whole_units(LastModified_bytes1to2years,    5)
LastModified_Pbytes2to3years    = _whole_units(LastModified_bytes2to3years,    5)
LastModified_Pbytes3to5years    = _whole_units(LastModified_bytes3to5years,    5)
LastModified_Pbytes5to99years   = _whole_units(LastModified_bytes5to99years,   5)
_print_unit_totals("PetaBytes", (
    LastModified_Pbytes0to90days, LastModified_Pbytes90to180days, LastModified_Pbytes180to365days,
    LastModified_Pbytes1to2years, LastModified_Pbytes2to3years, LastModified_Pbytes3to5years,
    LastModified_Pbytes5to99years,
))

In [None]:
# Pick the unit for the vertical axis of the byte-count graphs: the largest unit in
# which the 0-to-90-days total is at least 10, starting from a default of Bytes.
# Only the 0-to-90-days category is consulted, not the other age categories.
# NOTE(review): the bar-chart cell that follows has branches for Bytes, MegaBytes,
# GigaBytes, TeraBytes and PetaBytes but none for KiloBytes - confirm how a
# KiloBytes result is meant to be plotted.
if LastModified_bytes0to90days >= 0:
    y_axis_units = "Bytes"  #start with default value of bytes
for _unit_name, _unit_total in (
    ("KiloBytes", LastModified_Kbytes0to90days),
    ("MegaBytes", LastModified_Mbytes0to90days),
    ("GigaBytes", LastModified_Gbytes0to90days),
    ("TeraBytes", LastModified_Tbytes0to90days),
    ("PetaBytes", LastModified_Pbytes0to90days),
):
    # units are checked from smallest to largest, so the last match wins
    if _unit_total >= 10:
        y_axis_units = _unit_name
print ("Based on file sizes, the vertical y-axis units will be shown in ", y_axis_units)

## manually edit y_axis_units if desired
##y_axis_units = "GigaBytes"

In [None]:
# create a bar graph with these vertical columns:
# bytes0to90days
# bytes90to180days
# bytes180to365days
# bytes1to2years
# bytes2to3years
# bytes3to5years
# bytes5to99years



# function to add value labels
def addlabels(x,y):
    """Write each y value as horizontally centred text at (its index, its value) on the plot."""
    for position, _category in enumerate(x):
        value = y[position]
        plt.text(position, value, value, ha = 'center')

if __name__ == '__main__':

    # creating data on which bar chart will be plot
    x = "0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years"
    #
    # Based on how big the numbers are, figure out if the vertical y-axis should be in bytes, Mbytes, Gbytes, Tbytes, Pbytes
    if (y_axis_units == "Bytes"):     y = [ LastModified_bytes0to90days,  LastModified_bytes90to180days,  LastModified_bytes180to365days,  LastModified_bytes1to2years,  LastModified_bytes2to3years,  LastModified_bytes3to5years,  LastModified_bytes5to99years]
    if (y_axis_units == "MegaBytes"): y = [LastModified_Mbytes0to90days, LastModified_Mbytes90to180days, LastModified_Mbytes180to365days, LastModified_Mbytes1to2years, LastModified_Mbytes2to3years, LastModified_Mbytes3to5years, LastModified_Mbytes5to99years]
    if (y_axis_units == "GigaBytes"): y = [LastModified_Gbytes0to90days, LastModified_Gbytes90to180days, LastModified_Gbytes180to365days, LastModified_Gbytes1to2years, LastModified_Gbytes2to3years, LastModified_Gbytes3to5years, LastModified_Gbytes5to99years]
    if (y_axis_units == "TeraBytes"): y = [LastModified_Tbytes0to90days, LastModified_Tbytes90to180days, LastModified_Tbytes180to365days, LastModified_Tbytes1to2years, LastModified_Tbytes2to3years, LastModified_Tbytes3to5years, LastModified_Tbytes5to99years]
    if (y_axis_units == "PetaBytes"): y = [LastModified_Pbytes0to90days, LastModified_Pbytes90to180days, LastModified_Pbytes180to365days, LastModified_Pbytes1to2years, LastModified_Pbytes2to3years, LastModified_Pbytes3to5years, LastModified_Pbytes5to99years]


    # setting figure size by using figure() function
    plt.figure(figsize = (10, 5))

    # making the bar chart on the data
    plt.bar(x, y)

    # calling the function to add value labels
    addlabels(x, y)

    # giving title to the plot
    plt.title("Byte Counts by Last Modification Date")

    # giving X and Y labels
    plt.xlabel("Last Modification Date")
    plt.ylabel("Number of bytes")
    if (y_axis_units == "Bytes"):     plt.ylabel("Bytes")
    if (y_axis_units == "KiloBytes"): plt.ylabel("Kilobytes")
    if (y_axis_units == "MegaBytes"): plt.ylabel("MegaBbytes")
    if (y_axis_units == "GigaBytes"): plt.ylabel("GigaBytes")
    if (y_axis_units == "TeraBytes"): plt.ylabel("TeraBytes")
    if (y_axis_units == "PetaBytes"): plt.ylabel("PetaBytes")

    # Save the graph as a PNG file for future reference
    graph_filename = CSV_source_file                                                    #start with name of source file
    if CSV_source_file.lower().endswith(".csv"): graph_filename = CSV_source_file[:-4]  # Remove the last 4 characters (.csv)
    graph_filename += "_BarChart"                                                       #append the type of chart (bar, pie, etc) to the filename
    graph_filename += "_ByteCount"                                                      #append FileCount|ByteCount to the filename    
    graph_filename += "_LastModificationDate"                                           #append LastModificationDate or LastAccessDate to the filename
    graph_filename += ".png"                                                            #append .png extension to the filename
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")


    # visualizing the plot
    plt.show()

In [None]:
# create pie chart showing the percentage of bytes in each last-modification age bucket

# Data for the pie chart
labels = ["0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years"]
sizes  = [LastModified_bytes0to90days, LastModified_bytes90to180days, LastModified_bytes180to365days, LastModified_bytes1to2years, LastModified_bytes2to3years, LastModified_bytes3to5years, LastModified_bytes5to99years]

# The chart gets cluttered and hard to read if there are labels on very small pie slices,
# so only slices larger than 5% get a label inside the pie (the legend still lists every slice).
def label_format(label, pct):
    """Return the slice label followed by its percentage on a second line, or "" when pct is 5 or less.

    label -- name of the age bucket
    pct   -- share of the total, as a percentage (0-100)
    """
    return f"{label} \n{pct:.1f}%" if pct > 5 else ""

total_size = sum(sizes)

if total_size == 0:
    # every bucket is empty, so the percentages below would be 0/0; there is nothing to chart
    print("No bytes found in any last modification date range, skipping pie chart")
else:
    # Compute final labels with percentages for slices >5%
    final_labels = [label_format(label, (size / total_size) * 100) for label, size in zip(labels, sizes)]

    # Set up figure size
    plt.figure(figsize=(8, 6))

    # Create pie chart
    plt.pie(
        sizes,
        labels=final_labels,  # Labels include percentage only if >5%
        labeldistance=0.5,    # Moves labels inside slices
    )

    # Set title
    plt.title("Percentage Byte Counts by Last Modification Date")

    # Generate legend labels with percentages
    legend_labels = [f"{label} ({(size / total_size) * 100:.1f}%)" for label, size in zip(labels, sizes)]

    # Add legend with percentage values
    plt.legend(legend_labels, loc="best", bbox_to_anchor=(1, 0.5), title="")

    # Save the graph as a PNG file for future reference
    graph_filename = CSV_source_file                                                    #start with name of source file
    if CSV_source_file.lower().endswith(".csv"): graph_filename = CSV_source_file[:-4]  #remove the last 4 characters (.csv)
    graph_filename += "_PieChart"                                                       #append the type of chart (bar, pie, etc) to the filename
    graph_filename += "_ByteCount"                                                      #append FileCount|ByteCount to the filename
    graph_filename += "_LastModificationDate"                                           #append LastModificationDate or LastAccessDate to the filename
    graph_filename += ".png"                                                            #append .png extension to the filename
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")

    # Show the chart
    plt.show()


## 3.3 - Create CSV reports for each category of files


In [None]:

# Write one CSV report per last-modification age range.
# Each report holds the rows of df whose ModificationTimeDays falls inside the range
# (lower bound included, upper bound excluded) and is named after the source file.

# name of the source file without a trailing .csv extension (if it has one)
_report_stem = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file

# (range name used in the filename, minimum age in days, maximum age in days)
_modified_age_ranges = (
    ("0to90days",    0,       90),
    ("90to180days",  90,      180),
    ("180to365days", 180,     365),
    ("1to2years",    365 * 1, 365 * 2),
    ("2to3years",    365 * 2, 365 * 3),
    ("3to5years",    365 * 3, 365 * 5),
    ("5to99years",   365 * 5, 365 * 99),
)

for _range_name, _min_days, _max_days in _modified_age_ranges:
    CSV_output_file = f"{_report_stem}_LastModified_{_range_name}.csv"
    _old_enough = df['ModificationTimeDays'] >= _min_days
    _young_enough = df['ModificationTimeDays'] < _max_days
    df_output_file = df[_old_enough & _young_enough]
    print("Creating CSV output file showing all filenames with ages at:", CSV_output_file)
    df_output_file.to_csv(CSV_output_file)

print("Finished creating CSV output files")





# 4 - Categorize files by Last Access Time

In [None]:
# Split the files into age buckets based on the AccessTimeDays column and report how many files are in each bucket.
# Every bucket includes its lower bound and excludes its upper bound.

def _accessed_between(min_days, max_days):
    """Return the rows of df with min_days <= AccessTimeDays < max_days."""
    return df[(df['AccessTimeDays'] >= min_days) & (df['AccessTimeDays'] < max_days)]

LastAccessed_age0to90days    = _accessed_between(0,       90)
LastAccessed_age90to180days  = _accessed_between(90,      180)
LastAccessed_age180to365days = _accessed_between(180,     365)
LastAccessed_age1to2years    = _accessed_between(365 * 1, 365 * 2)
LastAccessed_age2to3years    = _accessed_between(365 * 2, 365 * 3)
LastAccessed_age3to5years    = _accessed_between(365 * 3, 365 * 5)
LastAccessed_age5to99years   = _accessed_between(365 * 5, 365 * 99)

# every count is taken from the LastAccessed_* bucket that matches its caption
print("Number of files with last access date   0 to  90  days: ", len(LastAccessed_age0to90days))
print("Number of files with last access date  90 to 180  days: ", len(LastAccessed_age90to180days))
print("Number of files with last access date 180 to 365  days: ", len(LastAccessed_age180to365days))
print("Number of files with last access date   1 to   2 years: ", len(LastAccessed_age1to2years))
print("Number of files with last access date   2 to   3 years: ", len(LastAccessed_age2to3years))
print("Number of files with last access date   3 to   5 years: ", len(LastAccessed_age3to5years))
print("Number of files with last access date   5 to  99 years: ", len(LastAccessed_age5to99years))



## 4.1 - Create graphs for file counts

In [None]:
# Bar chart of the number of files in each last-access age bucket

def addlabels(x,y):
    """Draw each bar's value as a centred text label at the top of the bar.

    x -- bar category names; one label is drawn per entry
    y -- bar heights, indexed in step with x
    """
    for position, _ in enumerate(x):
        height = y[position]
        plt.text(position, height, height, ha = 'center')

if __name__ == '__main__':

    # one bar per age bucket; the height of a bar is the number of rows in that bucket
    x = ("0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years")
    y = [
        len(bucket)
        for bucket in (
            LastAccessed_age0to90days,
            LastAccessed_age90to180days,
            LastAccessed_age180to365days,
            LastAccessed_age1to2years,
            LastAccessed_age2to3years,
            LastAccessed_age3to5years,
            LastAccessed_age5to99years,
        )
    ]

    # draw the bars on a 10x5 figure and label each one with its value
    plt.figure(figsize = (10, 5))
    plt.bar(x, y)
    addlabels(x, y)

    # main title (suptitle) plus a small-print subtitle (title)
    plt.suptitle("Filename counts by Last Access Date", fontsize=14, fontweight="bold", y=0.95)
    plt.title("(not all filesystems record Last Access Date, may be less reliable than Last Modification Date)", fontsize=8, pad=2)

    # axis captions
    plt.xlabel("Last Access Date")
    plt.ylabel("Number of files")

    # Save the graph as a PNG next to the source file:
    # <source name without .csv>_<chart type>_<FileCount|ByteCount>_<date type>.png
    if CSV_source_file.lower().endswith(".csv"):
        graph_filename = CSV_source_file[:-4]
    else:
        graph_filename = CSV_source_file
    graph_filename = graph_filename + "_BarChart" + "_FileCount" + "_LastAccessDate" + ".png"
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")

    # display the finished chart
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Pie chart showing the percentage of files in each last-access age bucket

# Data for the pie chart
labels = ["0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years"]
sizes = [len(LastAccessed_age0to90days), len(LastAccessed_age90to180days), len(LastAccessed_age180to365days), len(LastAccessed_age1to2years), len(LastAccessed_age2to3years), len(LastAccessed_age3to5years), len(LastAccessed_age5to99years)]

# The chart gets cluttered and hard to read if there are labels on very small pie slices,
# so only slices larger than 5% get a label inside the pie (the legend still lists every slice).
def label_format(label, pct):
    """Return the slice label followed by its percentage on a second line, or "" when pct is 5 or less.

    label -- name of the age bucket
    pct   -- share of the total, as a percentage (0-100)
    """
    return f"{label} \n{pct:.1f}%" if pct > 5 else ""

total_size = sum(sizes)

if total_size == 0:
    # no file fell into any bucket; computing percentages would divide by zero
    print("No files found in any last access date range, skipping pie chart")
else:
    # Compute final labels with percentages for slices >5%
    final_labels = [label_format(label, (size / total_size) * 100) for label, size in zip(labels, sizes)]

    # Set up figure size
    plt.figure(figsize=(8, 6))

    # Create pie chart
    plt.pie(
        sizes,
        labels=final_labels,  # Labels include percentage only if >5%
        labeldistance=0.5,    # Moves labels inside slices
    )

    # Add main title (suptitle) and subtitle (title)
    plt.suptitle("Percentage Filename counts by Last Access Date", fontsize=14, fontweight="bold", y=0.95)
    plt.title("(not all filesystems record Last Access Date, may be less reliable than Last Modification Date)", fontsize=8, pad=2)  # Subtitle with small padding


    # Generate legend labels with percentages
    legend_labels = [f"{label} ({(size / total_size) * 100:.1f}%)" for label, size in zip(labels, sizes)]

    # Add legend with percentage values
    plt.legend(legend_labels, loc="best", bbox_to_anchor=(1, 0.5), title="")

    # Save the graph as a PNG file for future reference
    graph_filename = CSV_source_file                                                    #start with name of source file
    if CSV_source_file.lower().endswith(".csv"): graph_filename = CSV_source_file[:-4]  #remove the last 4 characters (.csv)
    graph_filename += "_PieChart"                                                       #append the type of chart (bar, pie, etc) to the filename
    graph_filename += "_FileCount"                                                      #append FileCount|ByteCount to the filename
    graph_filename += "_LastAccessDate"                                                 #append LastModificationDate or LastAccessDate to the filename
    graph_filename += ".png"                                                            #append .png extension to the filename
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")

    # Show the chart
    plt.show()


## 4.2 - Create graphs for byte counts
(this section is time consuming if you have a lot of rows in the dataset)

In [None]:
## figure out the total number of bytes for each last-access age category

# If df.Bytes is int32 or uint32, it may overflow when adding large values. Convert it to int64:
df["Bytes"] = df["Bytes"].astype("int64")

print("Starting processing of", len(df), "files")

def _accessed_bytes_between(min_days, max_days):
    """Return, as np.int64, the sum of Bytes over the rows with min_days <= AccessTimeDays < max_days.

    The rows are selected with a boolean mask and summed in one step, so the result
    does not depend on the index labels of df.
    """
    in_range = (df["AccessTimeDays"] >= min_days) & (df["AccessTimeDays"] < max_days)
    return np.int64(df.loc[in_range, "Bytes"].sum())

# Every bucket includes its lower bound and excludes its upper bound
LastAccessed_bytes0to90days    = _accessed_bytes_between(0,       90)
LastAccessed_bytes90to180days  = _accessed_bytes_between(90,      180)
LastAccessed_bytes180to365days = _accessed_bytes_between(180,     365)
LastAccessed_bytes1to2years    = _accessed_bytes_between(365 * 1, 365 * 2)
LastAccessed_bytes2to3years    = _accessed_bytes_between(365 * 2, 365 * 3)
LastAccessed_bytes3to5years    = _accessed_bytes_between(365 * 3, 365 * 5)
LastAccessed_bytes5to99years   = _accessed_bytes_between(365 * 5, 365 * 99)

print("Finished processing of", len(df), "files")

In [None]:
# Report the raw byte total of every last-access age bucket, one line per bucket
_access_byte_report = (
    ("Bytes with last access date   0 to  90  days: ", LastAccessed_bytes0to90days),
    ("Bytes with last access date  90 to 180  days: ", LastAccessed_bytes90to180days),
    ("Bytes with last access date 180 to 365  days: ", LastAccessed_bytes180to365days),
    ("Bytes with last access date   1 to   2 years: ", LastAccessed_bytes1to2years),
    ("Bytes with last access date   2 to   3 years: ", LastAccessed_bytes2to3years),
    ("Bytes with last access date   3 to   5 years: ", LastAccessed_bytes3to5years),
    ("Bytes with last access date   5 to  99 years: ", LastAccessed_bytes5to99years),
)
for _caption, _byte_total in _access_byte_report:
    print(_caption, _byte_total)


In [None]:
# Show the dtype of the Bytes column; int64 is the expected value (avoids overflow for very large numbers)
print("Data type for df.Bytes is:", df["Bytes"].dtype)

In [None]:
# Express every last-access byte total in KiloBytes, MegaBytes, GigaBytes, TeraBytes and PetaBytes
# (each unit is 1024 times the previous one) so the graphs can use more human-readable numbers.
# int() drops the fractional part, so each value is a whole number of units.

def _to_whole_units(byte_total, power):
    """Return byte_total divided by 1024**power, with the fractional part dropped."""
    return int(byte_total / 1024**power)

# captions of the seven age buckets, in the order the totals are reported
_ACCESS_AGE_CAPTIONS = (
    "  0 to  90  days",
    " 90 to 180  days",
    "180 to 365  days",
    "  1 to   2 years",
    "  2 to   3 years",
    "  3 to   5 years",
    "  5 to  99 years",
)

def _print_access_totals(unit_name, totals):
    """Print one line per age bucket showing its total in the given unit."""
    for caption, total in zip(_ACCESS_AGE_CAPTIONS, totals):
        print(f"{unit_name} with last access date {caption}: ", total)

_SEPARATOR = "-----------------------------------------------------------------"

# kilobytes
LastAccessed_Kbytes0to90days    = _to_whole_units(LastAccessed_bytes0to90days,    1)
LastAccessed_Kbytes90to180days  = _to_whole_units(LastAccessed_bytes90to180days,  1)
LastAccessed_Kbytes180to365days = _to_whole_units(LastAccessed_bytes180to365days, 1)
LastAccessed_Kbytes1to2years    = _to_whole_units(LastAccessed_bytes1to2years,    1)
LastAccessed_Kbytes2to3years    = _to_whole_units(LastAccessed_bytes2to3years,    1)
LastAccessed_Kbytes3to5years    = _to_whole_units(LastAccessed_bytes3to5years,    1)
LastAccessed_Kbytes5to99years   = _to_whole_units(LastAccessed_bytes5to99years,   1)
_print_access_totals("KiloBytes", [LastAccessed_Kbytes0to90days, LastAccessed_Kbytes90to180days, LastAccessed_Kbytes180to365days, LastAccessed_Kbytes1to2years, LastAccessed_Kbytes2to3years, LastAccessed_Kbytes3to5years, LastAccessed_Kbytes5to99years])
print(_SEPARATOR)

# megabytes
LastAccessed_Mbytes0to90days    = _to_whole_units(LastAccessed_bytes0to90days,    2)
LastAccessed_Mbytes90to180days  = _to_whole_units(LastAccessed_bytes90to180days,  2)
LastAccessed_Mbytes180to365days = _to_whole_units(LastAccessed_bytes180to365days, 2)
LastAccessed_Mbytes1to2years    = _to_whole_units(LastAccessed_bytes1to2years,    2)
LastAccessed_Mbytes2to3years    = _to_whole_units(LastAccessed_bytes2to3years,    2)
LastAccessed_Mbytes3to5years    = _to_whole_units(LastAccessed_bytes3to5years,    2)
LastAccessed_Mbytes5to99years   = _to_whole_units(LastAccessed_bytes5to99years,   2)
_print_access_totals("MegaBytes", [LastAccessed_Mbytes0to90days, LastAccessed_Mbytes90to180days, LastAccessed_Mbytes180to365days, LastAccessed_Mbytes1to2years, LastAccessed_Mbytes2to3years, LastAccessed_Mbytes3to5years, LastAccessed_Mbytes5to99years])
print(_SEPARATOR)

# gigabytes
LastAccessed_Gbytes0to90days    = _to_whole_units(LastAccessed_bytes0to90days,    3)
LastAccessed_Gbytes90to180days  = _to_whole_units(LastAccessed_bytes90to180days,  3)
LastAccessed_Gbytes180to365days = _to_whole_units(LastAccessed_bytes180to365days, 3)
LastAccessed_Gbytes1to2years    = _to_whole_units(LastAccessed_bytes1to2years,    3)
LastAccessed_Gbytes2to3years    = _to_whole_units(LastAccessed_bytes2to3years,    3)
LastAccessed_Gbytes3to5years    = _to_whole_units(LastAccessed_bytes3to5years,    3)
LastAccessed_Gbytes5to99years   = _to_whole_units(LastAccessed_bytes5to99years,   3)
_print_access_totals("GigaBytes", [LastAccessed_Gbytes0to90days, LastAccessed_Gbytes90to180days, LastAccessed_Gbytes180to365days, LastAccessed_Gbytes1to2years, LastAccessed_Gbytes2to3years, LastAccessed_Gbytes3to5years, LastAccessed_Gbytes5to99years])
print(_SEPARATOR)

# terabytes
LastAccessed_Tbytes0to90days    = _to_whole_units(LastAccessed_bytes0to90days,    4)
LastAccessed_Tbytes90to180days  = _to_whole_units(LastAccessed_bytes90to180days,  4)
LastAccessed_Tbytes180to365days = _to_whole_units(LastAccessed_bytes180to365days, 4)
LastAccessed_Tbytes1to2years    = _to_whole_units(LastAccessed_bytes1to2years,    4)
LastAccessed_Tbytes2to3years    = _to_whole_units(LastAccessed_bytes2to3years,    4)
LastAccessed_Tbytes3to5years    = _to_whole_units(LastAccessed_bytes3to5years,    4)
LastAccessed_Tbytes5to99years   = _to_whole_units(LastAccessed_bytes5to99years,   4)
_print_access_totals("TeraBytes", [LastAccessed_Tbytes0to90days, LastAccessed_Tbytes90to180days, LastAccessed_Tbytes180to365days, LastAccessed_Tbytes1to2years, LastAccessed_Tbytes2to3years, LastAccessed_Tbytes3to5years, LastAccessed_Tbytes5to99years])
print(_SEPARATOR)

# petabytes (no separator line after the last group)
LastAccessed_Pbytes0to90days    = _to_whole_units(LastAccessed_bytes0to90days,    5)
LastAccessed_Pbytes90to180days  = _to_whole_units(LastAccessed_bytes90to180days,  5)
LastAccessed_Pbytes180to365days = _to_whole_units(LastAccessed_bytes180to365days, 5)
LastAccessed_Pbytes1to2years    = _to_whole_units(LastAccessed_bytes1to2years,    5)
LastAccessed_Pbytes2to3years    = _to_whole_units(LastAccessed_bytes2to3years,    5)
LastAccessed_Pbytes3to5years    = _to_whole_units(LastAccessed_bytes3to5years,    5)
LastAccessed_Pbytes5to99years   = _to_whole_units(LastAccessed_bytes5to99years,   5)
_print_access_totals("PetaBytes", [LastAccessed_Pbytes0to90days, LastAccessed_Pbytes90to180days, LastAccessed_Pbytes180to365days, LastAccessed_Pbytes1to2years, LastAccessed_Pbytes2to3years, LastAccessed_Pbytes3to5years, LastAccessed_Pbytes5to99years])

In [None]:
# Choose the unit (Bytes, KiloBytes, MegaBytes, GigaBytes, TeraBytes, PetaBytes) for the y-axis of the byte-count graphs.
# Only the 0-to-90-days bucket is inspected. Candidates are listed from the smallest unit to the largest,
# and the last candidate whose threshold is met wins.
_unit_candidates = (
    ("Bytes",     LastAccessed_bytes0to90days,   0),   # default for any non-negative byte count
    ("KiloBytes", LastAccessed_Kbytes0to90days, 10),
    ("MegaBytes", LastAccessed_Mbytes0to90days, 10),
    ("GigaBytes", LastAccessed_Gbytes0to90days, 10),
    ("TeraBytes", LastAccessed_Tbytes0to90days, 10),
    ("PetaBytes", LastAccessed_Pbytes0to90days, 10),
)
for _unit_label, _bucket_total, _minimum in _unit_candidates:
    if _bucket_total >= _minimum:
        y_axis_units = _unit_label
print("Based on file sizes, the vertical y-axis units will be shown in ", y_axis_units)

## manually edit y_axis_units if desired
##y_axis_units = "GigaBytes"

In [None]:
# Create a bar graph of byte totals by last access date, with one vertical column per age bucket:
# bytes0to90days
# bytes90to180days
# bytes180to365days
# bytes1to2years
# bytes2to3years
# bytes3to5years
# bytes5to99years


# function to add value labels
def addlabels(x,y):
    """Draw each bar's value as a centred text label at the top of the bar.

    x -- sequence of bar category names (one label is drawn per entry)
    y -- sequence of bar heights, in the same order as x
    """
    # the label for the bar at list position N is drawn at x coordinate N
    for position, (_, value) in enumerate(zip(x, y)):
        plt.text(position, value, value, ha = 'center')

if __name__ == '__main__':

    # creating data on which bar chart will be plot
    x = "0to90days", "90to180days", "180to365days", "1to2years", "2to3years", "3to5years", "5to99years"

    # Bucket totals for every unit that y_axis_units may hold.
    # KiloBytes must be listed too, otherwise y would be left undefined when that unit is selected.
    y_values_by_unit = {
        "Bytes":     [LastAccessed_bytes0to90days,  LastAccessed_bytes90to180days,  LastAccessed_bytes180to365days,  LastAccessed_bytes1to2years,  LastAccessed_bytes2to3years,  LastAccessed_bytes3to5years,  LastAccessed_bytes5to99years],
        "KiloBytes": [LastAccessed_Kbytes0to90days, LastAccessed_Kbytes90to180days, LastAccessed_Kbytes180to365days, LastAccessed_Kbytes1to2years, LastAccessed_Kbytes2to3years, LastAccessed_Kbytes3to5years, LastAccessed_Kbytes5to99years],
        "MegaBytes": [LastAccessed_Mbytes0to90days, LastAccessed_Mbytes90to180days, LastAccessed_Mbytes180to365days, LastAccessed_Mbytes1to2years, LastAccessed_Mbytes2to3years, LastAccessed_Mbytes3to5years, LastAccessed_Mbytes5to99years],
        "GigaBytes": [LastAccessed_Gbytes0to90days, LastAccessed_Gbytes90to180days, LastAccessed_Gbytes180to365days, LastAccessed_Gbytes1to2years, LastAccessed_Gbytes2to3years, LastAccessed_Gbytes3to5years, LastAccessed_Gbytes5to99years],
        "TeraBytes": [LastAccessed_Tbytes0to90days, LastAccessed_Tbytes90to180days, LastAccessed_Tbytes180to365days, LastAccessed_Tbytes1to2years, LastAccessed_Tbytes2to3years, LastAccessed_Tbytes3to5years, LastAccessed_Tbytes5to99years],
        "PetaBytes": [LastAccessed_Pbytes0to90days, LastAccessed_Pbytes90to180days, LastAccessed_Pbytes180to365days, LastAccessed_Pbytes1to2years, LastAccessed_Pbytes2to3years, LastAccessed_Pbytes3to5years, LastAccessed_Pbytes5to99years],
    }
    if y_axis_units not in y_values_by_unit:
        # fail with a clear message instead of plotting stale or undefined data
        raise ValueError(f"Unsupported y_axis_units value: {y_axis_units!r}")
    y = y_values_by_unit[y_axis_units]

    # setting figure size by using figure() function
    plt.figure(figsize = (10, 5))

    # making the bar chart on the data
    plt.bar(x, y)

    # calling the function to add value labels
    addlabels(x, y)

    # Add main title (suptitle) and subtitle (title)
    plt.suptitle("Byte Counts by Last Access Date", fontsize=14, fontweight="bold", y=0.95)
    plt.title("(not all filesystems record Last Access Date, may be less reliable than Last Modification Date)", fontsize=8, pad=2)  # Subtitle with small padding

    # giving X and Y labels; this chart is bucketed by access date and the y label is the selected unit name
    plt.xlabel("Last Access Date")
    plt.ylabel(y_axis_units)

    # Save the graph as a PNG file for future reference
    graph_filename = CSV_source_file                                                    #start with name of source file
    if CSV_source_file.lower().endswith(".csv"): graph_filename = CSV_source_file[:-4]  #remove the last 4 characters (.csv)
    graph_filename += "_BarChart"                                                       #append the type of chart (bar, pie, etc) to the filename
    graph_filename += "_ByteCount"                                                      #append FileCount|ByteCount to the filename
    graph_filename += "_LastAccessDate"                                                 #append LastModificationDate or LastAccessDate to the filename
    graph_filename += ".png"                                                            #append .png extension to the filename
    plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
    print(f"Graph saved as {graph_filename}")

    # visualizing the plot
    plt.show()

In [None]:
import matplotlib.pyplot as plt

# Pie chart input: one label per age range, and in the same order the
# LastAccessed_bytes* value that belongs to that range
labels = [
    "0to90days",
    "90to180days",
    "180to365days",
    "1to2years",
    "2to3years",
    "3to5years",
    "5to99years",
]
sizes = [
    LastAccessed_bytes0to90days,
    LastAccessed_bytes90to180days,
    LastAccessed_bytes180to365days,
    LastAccessed_bytes1to2years,
    LastAccessed_bytes2to3years,
    LastAccessed_bytes3to5years,
    LastAccessed_bytes5to99years,
]

# The chart gets cluttered and hard to read if there are labels on very small pie slices,
# so a slice only gets an on-slice label when its share is above a threshold (5% by default).
def label_format(label, pct, min_pct=5):
    """Return the text to draw on one pie slice.

    Args:
        label: Name of the slice (for example "0to90days").
        pct: The slice's share of the whole pie, as a percentage.
        min_pct: A slice is labelled only when pct is strictly greater
            than this value. Defaults to 5.

    Returns:
        The label, a space, a line break, then pct formatted to one
        decimal place followed by "%" when pct > min_pct; otherwise an
        empty string so the slice is left unlabelled.
    """
    if pct > min_pct:
        return f"{label} \n{pct:.1f}%"
    return ""

# Work out each slice's share of the total and let label_format() decide which slices
# get an on-slice label (it returns "" for the slices it leaves blank)
total_size = sum(sizes)
final_labels = [
    label_format(bucket, (amount / total_size) * 100)
    for bucket, amount in zip(labels, sizes)
]

# Draw the pie on an 8x6 figure; labeldistance=0.5 places the labels inside the slices
plt.figure(figsize=(8, 6))
plt.pie(sizes, labels=final_labels, labeldistance=0.5)

# Main title (suptitle) plus a small-print subtitle (title)
plt.suptitle("Percentage Byte Counts by Last Access Date", fontsize=14, fontweight="bold", y=0.95)
plt.title("(not all filesystems record Last Access Date, may be less reliable than Last Modification Date)", fontsize=8, pad=2)

# The legend lists every age range with its percentage, including slices that carry no label
legend_labels = [
    f"{bucket} ({(amount / total_size) * 100:.1f}%)"
    for bucket, amount in zip(labels, sizes)
]
plt.legend(legend_labels, loc="best", bbox_to_anchor=(1, 0.5), title="")

# Save the graph as a PNG file for future reference.
# File name = source file name without a trailing ".csv" (any case), followed by
# chart type, FileCount|ByteCount, the date field, and the .png extension.
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_PieChart", "_ByteCount", "_LastAccessDate", ".png"])
plt.savefig(graph_filename, format="png", dpi=300, bbox_inches="tight")
print(f"Graph saved as {graph_filename}")

# Show the chart
plt.show()


## 4.3 - Create CSV reports for each category of files

In [None]:

# Write one CSV report per last-access age range.
# Each report holds the rows of df whose AccessTimeDays is >= the lower bound and < the upper bound.

# (file name suffix, lower bound in days inclusive, upper bound in days exclusive)
_access_age_ranges = [
    ("0to90days",    0,     90),
    ("90to180days",  90,    180),
    ("180to365days", 180,   365),
    ("1to2years",    365*1, 365*2),
    ("2to3years",    365*2, 365*3),
    ("3to5years",    365*3, 365*5),
    ("5to99years",   365*5, 365*99),
]

# Every output name starts with the source file name minus a trailing ".csv" (any case)
_csv_stem = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file

for _range_name, _min_days, _max_days in _access_age_ranges:
    CSV_output_file = f"{_csv_stem}_LastAccessed_{_range_name}.csv"
    df_output_file = df[(df['AccessTimeDays'] >= _min_days) & (df['AccessTimeDays'] < _max_days)]
    print ("Creating CSV output file showing all filenames with ages at:", CSV_output_file)
    df_output_file.to_csv(CSV_output_file)

print ("Finished creating CSV output files")





# 5 - Create HTML report for presentation to stakeholders

In [None]:
# Derive the HTML report name from the CSV source name:
# drop a trailing ".csv" (matched case-insensitively) if there is one, then append ".html"
HTML_file = (CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file) + ".html"
print(f"Setting HTML output file to {HTML_file}")

In [None]:
# set the current working directory to the folder containing the *.html and *.png files
# we want to be in the current directory so the HTML <img src=filename.png> tags do not contain directory paths

# figure out the name of the directory containing the HTML_file from the previous step
directory_name = os.path.dirname(HTML_file)

# os.path.dirname() returns "" when HTML_file is a bare file name, and os.chdir("") raises
# FileNotFoundError. In that case the file is already relative to the current directory,
# so there is nothing to change.
if directory_name:
    # set the current directory to the location of the files we are working with
    os.chdir(directory_name)
    print(f"Setting current working directory to {directory_name}")
else:
    print(f"HTML file name has no directory component, staying in current working directory {os.getcwd()}")

In [None]:
# create HTML header: doctype, <head> metadata and the opening <body> tag
# (the trailing empty item makes the text end with a newline after <body>)
html_header = "\n".join([
    '<!DOCTYPE html>',
    '<html lang="en">',
    '<head>',
    '    <meta charset="UTF-8">',
    '    <meta name="viewport" content="width=device-width, initial-scale=1.0">',
    '    <title>File Age Report</title>',
    '</head>',
    '<body>',
    '',
])



In [None]:
# Express the total number of files in a human-readable form

# NOTE: file counts are scaled in multiples of 1000 (base 10), while byte counts are scaled in multiples of 1024 (base 2)
total_files  = num_lines
total_Kfiles = round(total_files/1000, 1)
total_Mfiles = round(total_files/1000/1000, 1)
total_Gfiles = round(total_files/1000/1000/1000, 1)
total_Tfiles = round(total_files/1000/1000/1000/1000, 1)
total_Pfiles = round(total_files/1000/1000/1000/1000/1000, 1)

# Pick the unit for the report: start with no unit name, then move up to each larger unit
# whose scaled (rounded) count is at least 1, so the largest such unit wins
if total_files >= 0:
    file_units = ""
for _unit_name, _scaled_count in (
    ("Thousand",    total_Kfiles),
    ("Million",     total_Mfiles),
    ("Billion",     total_Gfiles),
    ("Trillion",    total_Tfiles),
    ("Quadrillion", total_Pfiles),
):
    if _scaled_count >= 1:
        file_units = _unit_name
print (f"Based on file counts, the numbers will be reported in {file_units}")

# Text shown in the report for each possible unit; the plain count is used when no unit applies
_readable_file_counts = {
    "":            f"{total_files}",
    "Thousand":    f"{total_Kfiles} Thousand",
    "Million":     f"{total_Mfiles} Million",
    "Billion":     f"{total_Gfiles} Billion",
    "Trillion":    f"{total_Tfiles} Trillion",
    "Quadrillion": f"{total_Pfiles} Quadrillion",
}
if file_units in _readable_file_counts:
    total_files_human_readable = _readable_file_counts[file_units]
print (f"Total file count is {total_files_human_readable}")


In [None]:
# Express the total disk space consumed in a human-readable form

# NOTE: byte counts are scaled in multiples of 1024 (base 2), while file counts are scaled in multiples of 1000 (base 10)
# Raw byte totals are hard to read, so each larger unit is precomputed, rounded to 1 decimal place
total_bytes  = sum(df.Bytes)
total_Kbytes = round(total_bytes/1024, 1)
total_Mbytes = round(total_bytes/1024/1024, 1)
total_Gbytes = round(total_bytes/1024/1024/1024, 1)
total_Tbytes = round(total_bytes/1024/1024/1024/1024, 1)
total_Pbytes = round(total_bytes/1024/1024/1024/1024/1024, 1)

# Amount expressed in each unit, from smallest unit to largest
_amount_by_unit = {
    "KiloBytes": total_Kbytes,
    "MegaBytes": total_Mbytes,
    "GigaBytes": total_Gbytes,
    "TeraBytes": total_Tbytes,
    "PetaBytes": total_Pbytes,
}

# Pick the unit for the report: start with plain bytes, then move up to each larger unit
# whose scaled (rounded) amount is at least 1, so the largest such unit wins
if total_bytes >= 0:
    byte_units = "Bytes"
for _unit_name, _scaled_amount in _amount_by_unit.items():
    if _scaled_amount >= 1:
        byte_units = _unit_name
print ("Based on file sizes, the space consumption will be reported in ", byte_units)

# Text shown in the report: the amount in the chosen unit followed by the unit name
_amount_by_unit["Bytes"] = total_bytes
if byte_units in _amount_by_unit:
    total_bytes_human_readable = f"{_amount_by_unit[byte_units]} {byte_units}"
print (f"Total space utilization is {total_bytes_human_readable}")


In [None]:
#get some detail about the share name, total files, total bytes
import html  # standard library; escapes file names before they are embedded in the HTML report

# First file name in the data, shown in the report as an example of what the share contains.
# - No braces around the expression: {value} outside an f-string builds a set, which would be
#   rendered in the report as {'name'} instead of name.
# - .iloc[0] is positional, so it also works when the DataFrame index does not contain the label 0.
# - An empty DataFrame yields an empty sample instead of raising.
sample_filename = df.Filename.iloc[0] if len(df) > 0 else ""
total_bytes = sum(df.Bytes)

# html.escape() keeps characters such as & and < in file names from being read as HTML markup
html_body1 = f"""
<br>Total number of bytes in this share: {total_bytes} <b> ({total_bytes_human_readable}) </b>
<br>Total number of files in this share: {total_files} <b> ({total_files_human_readable} files) </b>
<br>Input source file is {html.escape(str(CSV_source_file))}
<br>Sample filename: {html.escape(str(sample_filename))}
<p><hr>
"""

In [None]:
## debugging output: a bare expression as the last statement of a notebook cell is echoed as the cell result
html_body1

In [None]:
# display the graphs for the LastModifiedDate

# Rebuild the names of the four PNG charts that earlier cells are expected to have saved.
# Only the base name of the CSV file is used (no directory path), so the <img> tags
# point at files in the same folder as the HTML report.
graph_basename = os.path.basename(CSV_source_file)                                 #extract only the filename (remove directory path)
if graph_basename.lower().endswith(".csv"): graph_basename = graph_basename[:-4]   #remove the last 4 characters (.csv)

# Name pattern: <base name>_<BarChart|PieChart>_<FileCount|ByteCount>_<date field>.png
graph_filename1 = graph_basename + "_BarChart_FileCount_LastModificationDate.png"
graph_filename2 = graph_basename + "_PieChart_FileCount_LastModificationDate.png"
graph_filename3 = graph_basename + "_BarChart_ByteCount_LastModificationDate.png"
graph_filename4 = graph_basename + "_PieChart_ByteCount_LastModificationDate.png"

# The src values are quoted: an unquoted HTML attribute value ends at the first space,
# so a source file name containing spaces would otherwise give broken image links.
html_body2 = f"""
<table border=1>
<tr><td colspan=2 bgcolor=lightgray><center><h2>File counts by Last Modification Date</h2></center>
<tr><td><img width="80%" src="{graph_filename1}">
    <td><img width="80%" src="{graph_filename2}">
<tr><td colspan=2 bgcolor=lightgray><center><h2>Byte counts by Last Modification Date</h2></center>
<tr><td><img width="80%" src="{graph_filename3}">
    <td><img width="80%" src="{graph_filename4}">
</table>
<p><hr>
"""


In [None]:
## debugging output: a bare expression as the last statement of a notebook cell is echoed as the cell result
html_body2

In [None]:
# display the graphs for the LastAccessDate

# Rebuild the names of the four PNG charts that earlier cells are expected to have saved.
# Only the base name of the CSV file is used (no directory path), so the <img> tags
# point at files in the same folder as the HTML report.
graph_basename = os.path.basename(CSV_source_file)                                 #extract only the filename (remove directory path)
if graph_basename.lower().endswith(".csv"): graph_basename = graph_basename[:-4]   #remove the last 4 characters (.csv)

# Name pattern: <base name>_<BarChart|PieChart>_<FileCount|ByteCount>_<date field>.png
graph_filename5 = graph_basename + "_BarChart_FileCount_LastAccessDate.png"
graph_filename6 = graph_basename + "_PieChart_FileCount_LastAccessDate.png"
graph_filename7 = graph_basename + "_BarChart_ByteCount_LastAccessDate.png"
graph_filename8 = graph_basename + "_PieChart_ByteCount_LastAccessDate.png"

# The src values are quoted: an unquoted HTML attribute value ends at the first space,
# so a source file name containing spaces would otherwise give broken image links.
# A space also separates the width and src attributes, as HTML requires between attributes.
html_body3 = f"""
<table border=1>
<tr><td colspan=2 bgcolor=lightgray><center><h2>File counts by Last Access Date</h2></center>
<tr><td><img width="80%" src="{graph_filename5}">
    <td><img width="80%" src="{graph_filename6}">
<tr><td colspan=2 bgcolor=lightgray><center><h2>Byte counts by Last Access Date</h2></center>
<tr><td><img width="80%" src="{graph_filename7}">
    <td><img width="80%" src="{graph_filename8}">
</table>
<p><hr>
"""


In [None]:
## debugging output: a bare expression as the last statement of a notebook cell is echoed as the cell result
html_body3

In [None]:
# create HTML footer: closes the <body> and <html> elements
# (the empty first and last items give the text its leading and trailing newline)
html_footer = "\n".join(["", "</body>", "</html>", ""])

In [None]:
# debugging output: print the five parts of the report in document order so the HTML can be checked by eye
# (sep="\n" puts each part on its own line, the same as printing them one at a time)
print(html_header, html_body1, html_body2, html_body3, html_footer, sep="\n")



In [None]:
# save the HTML file: the five parts are written back to back, in document order, as UTF-8 text
with open(HTML_file, "w", encoding="utf-8") as file:
    file.writelines([html_header, html_body1, html_body2, html_body3, html_footer])

print(f"HTML file '{HTML_file}' has been created successfully.")


In [None]:
# List the HTML reports that sit in the same folder as HTML_file

# figure out the name of the directory containing the HTML_file from the previous step
# os.path.dirname() returns "" for a bare file name and os.listdir("") raises FileNotFoundError,
# so fall back to "." (the current directory) in that case
directory_name = os.path.dirname(HTML_file) or "."
print(f"Searching for *.html files in {directory_name} directory")

# Get a list of all HTML files in that directory (excluding index.html)
# os.listdir() returns names in arbitrary order, so sort them to get a stable listing
html_files = sorted(f for f in os.listdir(directory_name) if f.endswith(".html") and f != "index.html")
print(html_files)


In [None]:
# At this point, we have an HTML report for this particular share, but it is common to have lots of shares.
# So we will create an index.html file with hyperlinks to all the *.html files in this folder.
# The idea here is to make it easy for the end user to just click an index.html file and see reports for all the shares.

import html  # standard library; escapes file names before they are embedded in the HTML index

# figure out the name of the directory containing the HTML_file from the previous step
# os.path.dirname() returns "" for a bare file name and os.chdir("") raises FileNotFoundError,
# so fall back to "." (the current directory) in that case
directory_name = os.path.dirname(HTML_file) or "."
print(f"Searching for *.html files in {directory_name} directory")

# set the current directory to the location of the files we are working with
os.chdir(directory_name)

# Get a list of all HTML files in the current directory (excluding index.html)
# - "." is listed because we have just changed into directory_name; listing directory_name
#   again would look for a nested folder whenever it is a relative path
# - os.listdir() returns names in arbitrary order, so sort them to keep the index stable
html_files = sorted(f for f in os.listdir(".") if f.endswith(".html") and f != "index.html")
print(html_files)

# One <li> per report (a <ul> may only contain <li> children).
# html.escape() keeps characters such as & < " in file names from breaking the markup.
link_lines = [
    f'        <li><a href="{html.escape(name, quote=True)}">{html.escape(name)}</a></li>\n'
    for name in html_files
]

# Generate the HTML content: page header, the list of links, then the closing tags
html_content = """<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>File age reports</title>
</head>
<body>
    <h2>File Age Reports for NFS / CIFS shares</h2>
    <ul>
""" + "".join(link_lines) + """    </ul>
</body>
</html>
"""

# Save to index.html (relative name, so it lands in the directory we changed into above)
index_file = "index.html"
with open(index_file, "w", encoding="utf-8") as file:
    file.write(html_content)

print(f"'{index_file}' has been created successfully with links to {len(html_files)} HTML files.")




# 6 - Summary

In [None]:
# to make things easy for the user, display the charts that were created earlier
# this makes it easy for the user to see all the graphs without having to scroll up and down through the previous cells

In [None]:
# display the FileCount bar chart for LastModificationDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_BarChart", "_FileCount", "_LastModificationDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)
    


In [None]:
# display the FileCount pie chart for LastModificationDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_PieChart", "_FileCount", "_LastModificationDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)


In [None]:
# display the ByteCount bar chart for LastModificationDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_BarChart", "_ByteCount", "_LastModificationDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)
    


In [None]:
# display the ByteCount pie chart for LastModificationDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_PieChart", "_ByteCount", "_LastModificationDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)


In [None]:
# display the FileCount bar chart for LastAccessDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_BarChart", "_FileCount", "_LastAccessDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)


In [None]:
# display the FileCount pie chart for LastAccessDate

# Rebuild the name of the PNG: the CSV source file name without a trailing ".csv" (any case),
# followed by chart type, FileCount|ByteCount, the date field, and the .png extension
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename += "".join(["_PieChart", "_FileCount", "_LastAccessDate", ".png"])

# Report a missing image instead of failing; otherwise show a reduced copy of it
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)

    # Shrink each dimension to one third of the original (integer division)
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))

    # Show the reduced image in the notebook
    display(resized_img)


In [None]:
# display the bar chart of byte counts for LastAccessDate

# Rebuild the name of the image file that was saved earlier:
#   <source file name without .csv> + chart type + FileCount|ByteCount + date field + .png
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename = graph_filename + "_BarChart" + "_ByteCount" + "_LastAccessDate" + ".png"

# Show the saved chart at one third of its original size, or report that the file is missing
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)
    # Shrink both dimensions to 33% (integer division) before displaying
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))
    display(resized_img)


In [None]:
# display the pie chart of byte counts for LastAccessDate

# Rebuild the name of the image file that was saved earlier:
#   <source file name without .csv> + chart type + FileCount|ByteCount + date field + .png
graph_filename = CSV_source_file[:-4] if CSV_source_file.lower().endswith(".csv") else CSV_source_file
graph_filename = graph_filename + "_PieChart" + "_ByteCount" + "_LastAccessDate" + ".png"

# Show the saved chart at one third of its original size, or report that the file is missing
if not os.path.exists(graph_filename):
    print(f"ERROR: cannot find image file {graph_filename}")
else:
    print(f"Confirmed {graph_filename} exists.")
    img = Image.open(graph_filename)
    # Shrink both dimensions to 33% (integer division) before displaying
    original_width, original_height = img.size
    new_width, new_height = original_width // 3, original_height // 3
    resized_img = img.resize((new_width, new_height))
    display(resized_img)


In [None]:
# Print the total number of files and the total space consumed, first summed
# across the LastModified age buckets and then across the LastAccessed age buckets.
# The space figure uses the pre-computed per-bucket totals that match y_axis_units.
# An unrecognized y_axis_units value is reported as an error instead of printing an
# undefined or stale total_bytes left over from an earlier cell.

# show the total number of files and used bytes (by last modification date)
print("Total number of files by Last Modification Date:", len(df))

if y_axis_units == 'Bytes':
    total_bytes = LastModified_bytes0to90days  + LastModified_bytes90to180days  + LastModified_bytes180to365days  + LastModified_bytes1to2years  + LastModified_bytes2to3years  + LastModified_bytes3to5years  + LastModified_bytes5to99years
elif y_axis_units == 'KiloBytes':
    total_bytes = LastModified_Kbytes0to90days + LastModified_Kbytes90to180days + LastModified_Kbytes180to365days + LastModified_Kbytes1to2years + LastModified_Kbytes2to3years + LastModified_Kbytes3to5years + LastModified_Kbytes5to99years
elif y_axis_units == 'MegaBytes':
    total_bytes = LastModified_Mbytes0to90days + LastModified_Mbytes90to180days + LastModified_Mbytes180to365days + LastModified_Mbytes1to2years + LastModified_Mbytes2to3years + LastModified_Mbytes3to5years + LastModified_Mbytes5to99years
elif y_axis_units == 'GigaBytes':
    total_bytes = LastModified_Gbytes0to90days + LastModified_Gbytes90to180days + LastModified_Gbytes180to365days + LastModified_Gbytes1to2years + LastModified_Gbytes2to3years + LastModified_Gbytes3to5years + LastModified_Gbytes5to99years
elif y_axis_units == 'TeraBytes':
    total_bytes = LastModified_Tbytes0to90days + LastModified_Tbytes90to180days + LastModified_Tbytes180to365days + LastModified_Tbytes1to2years + LastModified_Tbytes2to3years + LastModified_Tbytes3to5years + LastModified_Tbytes5to99years
elif y_axis_units == 'PetaBytes':
    total_bytes = LastModified_Pbytes0to90days + LastModified_Pbytes90to180days + LastModified_Pbytes180to365days + LastModified_Pbytes1to2years + LastModified_Pbytes2to3years + LastModified_Pbytes3to5years + LastModified_Pbytes5to99years
else:
    total_bytes = None   #no per-bucket totals exist for this unit

if total_bytes is None:
    print(f"ERROR: unrecognized y_axis_units value {y_axis_units!r}, cannot calculate total space consumed")
else:
    print("Total space consumed:", total_bytes, y_axis_units)
print(" ")



# show the total number of files and used bytes (by last access date)
print("Total number of files by Last Access Date:", len(df))

if y_axis_units == 'Bytes':
    total_bytes = LastAccessed_bytes0to90days  + LastAccessed_bytes90to180days  + LastAccessed_bytes180to365days  + LastAccessed_bytes1to2years  + LastAccessed_bytes2to3years  + LastAccessed_bytes3to5years  + LastAccessed_bytes5to99years
elif y_axis_units == 'KiloBytes':
    total_bytes = LastAccessed_Kbytes0to90days + LastAccessed_Kbytes90to180days + LastAccessed_Kbytes180to365days + LastAccessed_Kbytes1to2years + LastAccessed_Kbytes2to3years + LastAccessed_Kbytes3to5years + LastAccessed_Kbytes5to99years
elif y_axis_units == 'MegaBytes':
    total_bytes = LastAccessed_Mbytes0to90days + LastAccessed_Mbytes90to180days + LastAccessed_Mbytes180to365days + LastAccessed_Mbytes1to2years + LastAccessed_Mbytes2to3years + LastAccessed_Mbytes3to5years + LastAccessed_Mbytes5to99years
elif y_axis_units == 'GigaBytes':
    total_bytes = LastAccessed_Gbytes0to90days + LastAccessed_Gbytes90to180days + LastAccessed_Gbytes180to365days + LastAccessed_Gbytes1to2years + LastAccessed_Gbytes2to3years + LastAccessed_Gbytes3to5years + LastAccessed_Gbytes5to99years
elif y_axis_units == 'TeraBytes':
    total_bytes = LastAccessed_Tbytes0to90days + LastAccessed_Tbytes90to180days + LastAccessed_Tbytes180to365days + LastAccessed_Tbytes1to2years + LastAccessed_Tbytes2to3years + LastAccessed_Tbytes3to5years + LastAccessed_Tbytes5to99years
elif y_axis_units == 'PetaBytes':
    total_bytes = LastAccessed_Pbytes0to90days + LastAccessed_Pbytes90to180days + LastAccessed_Pbytes180to365days + LastAccessed_Pbytes1to2years + LastAccessed_Pbytes2to3years + LastAccessed_Pbytes3to5years + LastAccessed_Pbytes5to99years
else:
    total_bytes = None   #no per-bucket totals exist for this unit

if total_bytes is None:
    print(f"ERROR: unrecognized y_axis_units value {y_axis_units!r}, cannot calculate total space consumed")
else:
    print("Total space consumed:", total_bytes, y_axis_units)
