# Create directory tree and determine shareable and non-shareable files

# Import packages

In [1]:
import os
import pandas as pd
import zipfile

# Functions

In [2]:
def list_files_in_directory(root_dir):
    """Collect every file found beneath ``root_dir`` into a DataFrame.

    Returns a DataFrame with two columns: ``file_path`` (the walked
    directory joined with the file name) and ``include``, which is
    initialised to True for every row.
    """
    # Flatten the (directory, subdirectories, files) triples from os.walk
    collected = [
        os.path.join(current_dir, fname)
        for current_dir, _subdirs, fnames in os.walk(root_dir)
        for fname in fnames
    ]

    listing = pd.DataFrame(collected, columns=['file_path'])
    # Every file starts out marked for inclusion
    listing['include'] = True
    return listing

def zip_selected_files(df, root_dir, zip_filename):
    """Write the files flagged in ``df['include']`` to a deflate-compressed zip.

    Each selected ``file_path`` is stored in the archive under its path
    relative to ``root_dir``. The archive at ``zip_filename`` is opened in
    write mode.
    """
    with zipfile.ZipFile(zip_filename, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        # Only rows whose include flag is set are archived
        chosen_paths = df.loc[df['include'], 'file_path']
        for source_path in chosen_paths:
            archive.write(source_path, os.path.relpath(source_path, root_dir))
            
def list_zip_contents(zip_filename):
    """Print the name of every member of the zip archive, one per line."""
    with zipfile.ZipFile(zip_filename, mode='r') as archive:
        for member_name in archive.namelist():
            print(member_name)

# Step 1: List all files and create DataFrame

In [3]:
# Walk the replication package; every file found starts with include=True
root_directory = './replication-package/'
df_files = list_files_in_directory(root_directory)

In [None]:
# Display the file listing for inspection
df_files

In [5]:
# Keep only paths that contain neither '.DS' nor '.ipynb_checkpoints'
# (presumably macOS .DS_Store files and Jupyter checkpoint folders)
df_files = df_files.loc[
    (df_files['file_path'].str.find('.DS') == -1)
    & (df_files['file_path'].str.find('.ipynb_checkpoints') == -1)
].reset_index(drop=True)

In [None]:
# Display (without removing) the remaining rows whose path contains '/.',
# i.e. a path component that starts with a dot
df_files.loc[df_files['file_path'].str.find('/.')!=-1]

# Step 2: Directories and files not to be included

## DATASET 1 not to be included (Do as many as needed)

In [None]:
# Set include=False for every file whose path contains 'raw-data/DATASET1'
df_files.loc[df_files['file_path'].str.find('raw-data/DATASET1')!=-1, 'include'] = False

# Step 3: Create folder and file tree

In [20]:
# Function to generate a nested dictionary structure from the DataFrame
def build_tree_from_df(df):
    """Turn the ``file_path`` / ``include`` rows of ``df`` into a nested dict.

    Each path is split on ``os.sep``. Every component but the last becomes
    a nested dict (a directory); the last component maps to a leaf dict
    with the row's ``include`` value and its original ``path``.
    """
    root = {}

    for _, record in df.iterrows():
        *dir_names, leaf_name = record['file_path'].split(os.sep)

        # Walk down the tree, creating directory nodes on demand
        node = root
        for dir_name in dir_names:
            node = node.setdefault(dir_name, {})

        node[leaf_name] = {
            'include': record['include'],
            'path': record['file_path'],
        }

    return root


# Function to generate collapsible HTML tree from the dictionary
def generate_html_tree(structure):
    """Render the nested file tree as a collapsible HTML page.

    ``structure`` maps names to either a nested dict (a directory) or a
    leaf dict holding ``include`` and ``path`` (a file). Directories become
    click-to-toggle list items; files become links coloured blue when
    ``include`` is truthy and red otherwise. Entries are emitted in the
    dict's own iteration order.

    Names and paths are HTML-escaped so that characters such as ``<``,
    ``&`` or ``'`` in a file name cannot break the generated markup.
    Returns the complete HTML document as a string.
    """
    from html import escape

    def traverse(directory):
        markup = ""
        for name, value in directory.items():
            label = escape(str(name))
            if isinstance(value, dict) and 'include' not in value:
                # Directory
                markup += f"""
                <li>
                    <span style="color: black; cursor: pointer;" onclick="this.parentElement.querySelector('ul').classList.toggle('active')">
                        [+] {label}
                    </span>
                    <ul class="nested">
                    {traverse(value)}
                    </ul>
                </li>
                """
            else:
                # File with clickable link and color coding
                color = "blue" if value['include'] else "red"
                # The href is single-quoted, so quotes must be escaped as well
                href = escape(str(value['path']), quote=True)
                markup += f"<li><a href='{href}' style='color:{color};'>{label}</a></li>"
        return markup

    return f"""
    <html>
    <head>
        <style>
            ul, #myUL {{
              list-style-type: none;
            }}
            .nested {{
              display: none;
            }}
            .active {{
              display: block;
            }}
        </style>
    </head>
    <body>

    <ul id="myUL">
    {traverse(structure)}
    </ul>

    </body>
    </html>
    """

# Function to generate a sorted collapsible HTML tree from the dictionary with root directory
def generate_html_tree_with_root(structure, root_dir_name):
    """Render the file tree as collapsible HTML under a single root node.

    ``structure`` maps names to either a nested dict (a directory) or a
    leaf dict holding ``include`` and ``path`` (a file). Entries at every
    level are emitted in sorted name order. Files become links coloured
    blue when ``include`` is truthy and red otherwise. ``root_dir_name``
    is the label of the outermost, initially collapsed node.

    Names, paths and the root label are HTML-escaped so that characters
    such as ``<``, ``&`` or ``'`` cannot break the generated markup.
    Returns the complete HTML document as a string.
    """
    from html import escape

    def traverse(directory):
        markup = ""
        # Sort the directory keys (directories and files together)
        for name, value in sorted(directory.items()):
            label = escape(str(name))
            if isinstance(value, dict) and 'include' not in value:
                # Directory
                markup += f"""
                <li>
                    <span style="color: black; cursor: pointer;" onclick="this.parentElement.querySelector('ul').classList.toggle('active')">
                        [+] {label}
                    </span>
                    <ul class="nested">
                    {traverse(value)}
                    </ul>
                </li>
                """
            else:
                # File with clickable link and color coding
                color = "blue" if value['include'] else "red"
                # The href is single-quoted, so quotes must be escaped as well
                href = escape(str(value['path']), quote=True)
                markup += f"<li><a href='{href}' style='color:{color};'>{label}</a></li>"
        return markup

    root_label = escape(str(root_dir_name))

    return f"""
    <html>
    <head>
        <style>
            ul, #myUL {{
              list-style-type: none;
            }}
            .nested {{
              display: none;
            }}
            .active {{
              display: block;
            }}
        </style>
    </head>
    <body>
    <ul id="myUL">
        <li>
            <span style="color: black; cursor: pointer;" onclick="this.parentElement.querySelector('ul').classList.toggle('active')">
                [+] {root_label}
            </span>
            <ul class="nested">
                {traverse(structure)}
            </ul>
        </li>
    </ul>
    </body>
    </html>
    """

# Function to generate a color-coded collapsible HTML tree from the dictionary with root directory
def generate_color_coded_html_tree_with_root(structure, root_dir_name):
    """Render the file tree as color-coded, collapsible HTML under a root node.

    NOTE(review): this file defines another function with the same name
    further down (the variant that expands the first node by default). In
    Python the later definition replaces this one once both have been
    executed, so this version is effectively unused.

    ``structure`` maps names to nested dicts (directories) or to leaf dicts
    carrying ``include`` and ``path`` (files). ``root_dir_name`` labels the
    outermost node, which is always rendered blue and tagged ``[shared]``.
    Returns the complete HTML document as a string.
    """
    def traverse(directory):
        html = ""
        # Sort the directory keys (directories and files together)
        for name, value in sorted(directory.items()):
            if isinstance(value, dict) and 'include' not in value:
                # Directory: Color based on shared files inside (at least one shared = blue, all not shared = red)
                # This expression is the same check traverse_is_shared performs on `value`
                is_shared = any(isinstance(v, dict) and 'include' in v and v['include'] for v in value.values()) or \
                            any(isinstance(v, dict) and traverse_is_shared(v) for v in value.values())
                color = "blue" if is_shared else "red"
                status = "shared" if is_shared else "not shared"
                html += f"""
                <li>
                    <span style="color:{color}; cursor: pointer;" onclick="toggleVisibility(this)">
                        [+] {name}
                    </span> [{status}]
                    <ul class="nested">
                    {traverse(value)}
                    </ul>
                </li>
                """
            else:
                # File with clickable link and color coding
                color = "blue" if value['include'] else "red"
                status = "shared" if value['include'] else "not shared"
                html += f"<li><a href='{value['path']}' style='color:{color};'>{name}</a> [{status}]</li>"
        return html

    def traverse_is_shared(directory):
        # Recursive function to check if any file in the directory or its subdirectories is shared
        return any(isinstance(v, dict) and 'include' in v and v['include'] for v in directory.values()) or \
               any(isinstance(v, dict) and traverse_is_shared(v) for v in directory.values())

    # Doubled braces below are literal braces inside the f-string (CSS/JS blocks)
    return f"""
    <html>
    <head>
        <style>
            ul, #myUL {{
              list-style-type: none;
            }}
            .nested {{
              display: none;
            }}
            .active {{
              display: block;
            }}
        </style>
        <script>
            function toggleVisibility(element) {{
                var nestedList = element.parentElement.querySelector('ul');
                nestedList.classList.toggle('active');
                // Toggle between [+] and [-]
                if (element.innerHTML.includes('[+]')) {{
                    element.innerHTML = element.innerHTML.replace('[+]', '[-]');
                }} else {{
                    element.innerHTML = element.innerHTML.replace('[-]', '[+]');
                }}
            }}
        </script>
    </head>
    <body>
    <ul id="myUL">
        <li>
            <span style="color:blue; cursor: pointer;" onclick="toggleVisibility(this)">
                [+] {root_dir_name}
            </span> [shared]
            <ul class="nested">
                {traverse(structure)}
            </ul>
        </li>
    </ul>
    </body>
    </html>
    """

# Function to generate a color-coded collapsible HTML tree with the first node expanded by default
def generate_color_coded_html_tree_with_root(structure, root_dir_name):
    """Render the file tree as color-coded, collapsible HTML with the root open.

    ``structure`` maps names to nested dicts (directories) or to leaf dicts
    carrying ``include`` and ``path`` (files). Entries at every level are
    emitted in sorted name order.

    * Files are blue / ``[shared]`` when ``include`` is truthy, otherwise
      red / ``[not shared]``.
    * Directories are blue / ``[shared]`` when at least one file anywhere
      below them is shared, otherwise red / ``[not shared]``.
    * The root node (labelled ``root_dir_name``) is always blue, tagged
      ``[shared]`` and expanded. A top-level directory is additionally
      expanded only when it is the very first entry in sorted order.

    Names, paths and the root label are HTML-escaped so that characters
    such as ``<``, ``&`` or ``'`` cannot break the generated markup.
    Returns the complete HTML document as a string.
    """
    from html import escape

    def traverse_is_shared(directory):
        # Recursive function to check if any file in the directory or its subdirectories is shared
        return any(isinstance(v, dict) and 'include' in v and v['include'] for v in directory.values()) or \
               any(isinstance(v, dict) and traverse_is_shared(v) for v in directory.values())

    def traverse(directory, is_first_node=False):
        markup = ""
        # Sort the directory keys (directories and files together)
        for idx, (name, value) in enumerate(sorted(directory.items())):
            label = escape(str(name))
            if isinstance(value, dict) and 'include' not in value:
                # Directory: at least one shared file below = blue, none = red
                is_shared = traverse_is_shared(value)
                color = "blue" if is_shared else "red"
                status = "shared" if is_shared else "not shared"

                # If it's the first entry of the top level, make it open by default (class="active")
                active_class = "active" if is_first_node and idx == 0 else ""
                symbol = "[-]" if active_class else "[+]"

                markup += f"""
                <li>
                    <span style="color:{color}; cursor: pointer;" onclick="toggleVisibility(this)">
                        {symbol} {label}
                    </span> [{status}]
                    <ul class="nested {active_class}">
                    {traverse(value)}
                    </ul>
                </li>
                """
            else:
                # File with clickable link and color coding
                color = "blue" if value['include'] else "red"
                status = "shared" if value['include'] else "not shared"
                # The href is single-quoted, so quotes must be escaped as well
                href = escape(str(value['path']), quote=True)
                markup += f"<li><a href='{href}' style='color:{color};'>{label}</a> [{status}]</li>"
        return markup

    root_label = escape(str(root_dir_name))

    # Doubled braces below are literal braces inside the f-string (CSS/JS blocks)
    return f"""
    <html>
    <head>
        <style>
            ul, #myUL {{
              list-style-type: none;
            }}
            .nested {{
              display: none;
            }}
            .active {{
              display: block;
            }}
        </style>
        <script>
            function toggleVisibility(element) {{
                var nestedList = element.parentElement.querySelector('ul');
                nestedList.classList.toggle('active');
                // Toggle between [+] and [-]
                if (element.innerHTML.includes('[+]')) {{
                    element.innerHTML = element.innerHTML.replace('[+]', '[-]');
                }} else {{
                    element.innerHTML = element.innerHTML.replace('[-]', '[+]');
                }}
            }}
        </script>
    </head>
    <body>
    <ul id="myUL">
        <li>
            <span style="color:blue; cursor: pointer;" onclick="toggleVisibility(this)">
                [-] {root_label}
            </span> [shared]
            <ul class="nested active">
                {traverse(structure, is_first_node=True)}
            </ul>
        </li>
    </ul>
    </body>
    </html>
    """

# Function to generate an ASCII tree from the dictionary
def generate_ascii_tree(structure, prefix=''):
    """Return the lines of an ASCII tree drawing of ``structure``.

    Entries are rendered in the dict's own iteration order. Three kinds of
    value are recognised:

    * a dict containing an ``include`` key is a file leaf in the format
      used by the other tree renderers in this file; it is tagged
      ``[shared]`` when ``include`` is truthy, else ``[not shared]``;
    * any other dict is a directory and is rendered recursively;
    * any non-dict value is a file whose status is ``[shared]`` only when
      the value equals the string ``'shared'``.

    ``prefix`` is prepended to every line and grows with the nesting depth.
    Returns a list of strings, one per line, not a joined string.
    """
    lines = []
    last_idx = len(structure) - 1
    for idx, (name, value) in enumerate(structure.items()):
        is_last = idx == last_idx
        connector = "└── " if is_last else "├── "
        if isinstance(value, dict) and 'include' in value:
            # File leaf carrying an include flag: must not be walked as a directory
            status = "[shared]" if value['include'] else "[not shared]"
            lines.append(f"{prefix}{connector}{name} {status}")
        elif isinstance(value, dict):
            # Directory
            lines.append(f"{prefix}{connector}{name}/")
            # Recurse; siblings below keep a vertical bar unless this is the last entry
            lines.extend(generate_ascii_tree(value, prefix + ("    " if is_last else "│   ")))
        else:
            # File described by a plain status string
            status = "[shared]" if value == 'shared' else "[not shared]"
            lines.append(f"{prefix}{connector}{name} {status}")

    return lines

# Function to generate a sorted ASCII tree from the dictionary with root directory
def generate_ascii_tree_with_root(structure, root_dir_name, prefix=''):
    """Return a sorted ASCII tree of ``structure`` beneath a root line.

    The first line is ``{prefix}{root_dir_name}/``. Every entry of
    ``structure`` follows, indented by four spaces per level and ordered by
    name. Dicts without an ``include`` key are directories; dicts with one
    are files tagged ``[shared]`` or ``[not shared]``. Values that are not
    dicts are skipped. ``prefix`` applies to the root line only.

    Returns the whole drawing as one newline-joined string.
    """
    lines = [f"{prefix}{root_dir_name}/"]

    def traverse(directory, indent='    '):
        # Sort once per directory instead of re-sorting the keys on every iteration
        entries = sorted(directory.items())
        last_idx = len(entries) - 1
        for idx, (name, value) in enumerate(entries):
            is_last = idx == last_idx
            connector = "└── " if is_last else "├── "
            if isinstance(value, dict) and 'include' not in value:
                # Directory
                lines.append(f"{indent}{connector}{name}/")
                # Later siblings need a vertical bar unless this is the last entry
                traverse(value, indent + ("    " if is_last else "│   "))
            elif isinstance(value, dict) and 'include' in value:
                # File with include status
                status = "[shared]" if value['include'] else "[not shared]"
                lines.append(f"{indent}{connector}{name} {status}")

    traverse(structure)
    return "\n".join(lines)

# Function to adjust file paths to start from 'replication'
def adjust_file_paths(df, root_dir):
    """Return a copy of ``df`` whose ``file_path`` values are relative to ``root_dir``.

    The input DataFrame is left untouched.
    """
    def _relative_to_root(path):
        return os.path.relpath(path, root_dir)

    adjusted = df.copy()
    adjusted['file_path'] = adjusted['file_path'].apply(_relative_to_root)
    return adjusted

# Function to generate a color-coded Markdown tree (using HTML span for color)
def generate_color_coded_markdown_tree_with_root(structure, root_dir_name, prefix=''):
    """Return a color-coded Markdown bullet tree of ``structure``.

    The first line is the root (``root_dir_name``), always rendered blue
    and tagged ``[shared]``; ``prefix`` applies to that line only. Entries
    below it are ordered by name and indented four spaces per level.

    * Files (dicts with an ``include`` key) are blue / ``[shared]`` when
      ``include`` is truthy, otherwise red / ``[not shared]``.
    * Directories (dicts without ``include``) are blue / ``[shared]`` when
      at least one file anywhere below them is shared, otherwise red /
      ``[not shared]``. This is the same rule the HTML renderer in this
      file applies.
    * Values that are not dicts are skipped.

    Colors are expressed with inline HTML ``<span>`` elements. Returns the
    whole tree as one newline-joined string.
    """
    # Root directory is considered shared
    lines = [f"{prefix}- <span style='color:blue;'>`{root_dir_name}/`</span> [shared]"]

    def contains_shared(directory):
        # True when any file in this directory or its subdirectories is shared
        for child in directory.values():
            if not isinstance(child, dict):
                continue
            if 'include' in child:
                if child['include']:
                    return True
            elif contains_shared(child):
                return True
        return False

    def traverse(directory, indent='    '):
        # Sort the directory keys (directories and files together)
        for name, value in sorted(directory.items()):
            if not isinstance(value, dict):
                continue
            connector = "- "  # Bullet point for Markdown
            if 'include' in value:
                # File with color coding
                color = "blue" if value['include'] else "red"
                status = "shared" if value['include'] else "not shared"
                lines.append(f"{indent}{connector}<span style='color:{color};'>`{name}`</span> [{status}]")
            else:
                # Directory: red if nothing below it is shared, blue otherwise
                color = "blue" if contains_shared(value) else "red"
                status = "shared" if color == "blue" else "not shared"
                lines.append(f"{indent}{connector}<span style='color:{color};'>`{name}/`</span> [{status}]")
                traverse(value, indent + "    ")

    traverse(structure)
    return "\n".join(lines)

# Build the tree structure from the DataFrame

In [None]:
# Define the root directory
# Root of the replication package and the label used for the root node of the trees
root_directory = './replication-package'
root_dir_name = 'replication-package'  # Label shown as the root of the generated trees

# Work on a copy whose paths are relative to root_directory; df_files is left unchanged
df = adjust_file_paths(df_files, root_directory)
df

In [22]:
# Nested dict: directories map to dicts, files map to {'include', 'path'} leaves
tree_structure = build_tree_from_df(df)

In [None]:
# Generate the ASCII tree from the DataFrame's directory structure
ascii_tree = generate_ascii_tree_with_root(tree_structure, root_dir_name)

# Output the ASCII tree to the console
print(ascii_tree)

# Save the ASCII tree to a text file. The tree contains box-drawing characters,
# so the encoding is fixed to UTF-8 instead of relying on the platform default.
with open("./replication-package/directory_structure.txt", "w", encoding="utf-8") as file:
    file.write(ascii_tree)

# Generate the HTML content

In [24]:
# Build the color-coded, collapsible HTML page for the tree
html_content = generate_color_coded_html_tree_with_root(tree_structure, root_dir_name)

In [25]:
# Save the HTML tree inside the replication package. UTF-8 is set explicitly
# so non-ASCII file names do not depend on the platform's default encoding.
with open("./replication-package/directory_structure.html", "w", encoding="utf-8") as file:
    file.write(html_content)

# Create tree in markdown

In [26]:
markdown_tree = generate_color_coded_markdown_tree_with_root(tree_structure, root_dir_name)

# Save the Markdown tree inside the replication package. UTF-8 is set explicitly
# so non-ASCII file names do not depend on the platform's default encoding.
with open("./replication-package/directory_structure.md", "w", encoding="utf-8") as file:
    file.write(markdown_tree)


In [None]:
# Display the rows whose relative path starts with 'data'
df.loc[df['file_path'].str.startswith('data')]

# Step 4: Zip only the selected files (those marked as True in the "include" column)

In [28]:
# Archive the files still flagged include=True; member names are stored
# relative to the current directory ('./')
zip_output = '3-replication-package.zip'
root_directory = './'
zip_selected_files(df_files, root_directory, zip_output)

In [None]:
# Print the member names of the archive written above as a check
zip_output = '3-replication-package.zip'
list_zip_contents(zip_output)

# Step 5: Zip only the excluded files (those marked as False in the "include" column)

In [None]:
# Take the rows flagged include=False and flip the flag to True, because
# zip_selected_files only archives rows whose include value is True
df_excluded = (
    df_files.loc[df_files['include'] == False]
    .reset_index(drop=True)
    .assign(include=True)
)
df_excluded

In [31]:
# Archive the excluded files separately; member names are stored relative
# to the current directory ('./')
zip_output = '4-confidential-data-not-for-publication.zip'
root_directory = './'
zip_selected_files(df_excluded, root_directory, zip_output)

In [None]:
# Print the member names of the archive written above as a check
zip_output = '4-confidential-data-not-for-publication.zip'
list_zip_contents(zip_output)

# Step 6: Final Zip

In [33]:
# List the files under the paper and appendix folders (all start with include=True)
paper = list_files_in_directory('./1-paper')
appendix = list_files_in_directory('./2-appendix/')

In [None]:
# Stack both listings into one frame with a fresh 0..n-1 index
dfout = pd.concat([paper, appendix], ignore_index=True)
dfout

In [None]:
# Drop rows whose path contains '/.', i.e. a path component starting with a dot
dfout = dfout.loc[dfout['file_path'].str.find('/.')==-1].reset_index(drop=True)
dfout

In [None]:
# Append the two archives created earlier and the checklist, all flagged include=True
dfout = pd.concat([dfout, pd.DataFrame({'file_path':['./3-replication-package.zip', 
                                                     './4-confidential-data-not-for-publication.zip',
                                                     './checklist.pdf'],
              'include':[True, True, True]})], ignore_index=True)
dfout

In [None]:
# Write the final archive from every row of dfout flagged include=True;
# member names are stored relative to the current directory ('./')
zip_output = 'full-replication.zip'
root_directory = './'
zip_selected_files(dfout, root_directory, zip_output)

In [None]:
# Display the rows whose path contains 'checklist'
dfout.loc[dfout['file_path'].str.find('checklist')!=-1]