In [None]:
import labrotation.file_handling as fh
import os
import pandas as pd
import uuid

In [None]:
# Root folder of the data documentation; the collection cell below walks it recursively.
# NOTE(review): fh.open_dir presumably shows a folder-picker dialog titled with the given
# string and returns the chosen path -- confirm in labrotation.file_handling.
datadoc_dir = fh.open_dir("Open Data documentation folder")

# Run this to create a list of all files that will be checked. A UUID column will be added to each file where it does not exist, or filled in where it is incomplete.

In [None]:
# Gather the full path of every file below datadoc_dir whose name contains "grouping".
files_lis = []
for folder, _subfolders, filenames in os.walk(datadoc_dir):
    for filename in filenames:
        if "grouping" not in filename:
            continue
        candidate = os.path.join(folder, filename)
        if "~" in filename:
            # On Windows, "~" marks the temporary file of a workbook that is open in Excel.
            # Empty the list before aborting so later cells cannot run on a partial result.
            files_lis = []
            raise Exception(f"Please close all excel files and try again. Found temporary file in:\n{candidate}")
        print(candidate)
        files_lis.append(candidate)

# Alternatively, add only individual files (not yet implemented)

In [None]:
# TODO: use fh.open_file iteratively (until user hits cancel, resulting in empty string or ".") to get a list of files

In [None]:
# Show the collected paths so they can be checked before the next code cell overwrites the files.
files_lis

# For each file in the list, open it and check for a UUID column. If the column exists, check it for completeness and fill in any missing entries. If it does not exist, generate it.

# WARNING! This will overwrite files in the folder!!!

In [None]:
# Make sure every row of every collected file carries a UUID, then write the file back in place.
# NOTE: pd.read_excel is called without sheet_name, so only the first sheet is loaded, and
# df.to_excel(file_path) then replaces the whole workbook with that single sheet.
files_changed = []  # files that received at least one new UUID
files_unchanged = []  # files will be changed (pd.to_excel called each time), but their contents not.
for file_path in files_lis:
    print(file_path + ":")
    df = pd.read_excel(file_path)
    if "uuid" in df.columns:
        print("\tUUID column found")
        # rows that still need a UUID
        is_missing = df["uuid"].isna()
        n_na = int(is_missing.sum())
        if n_na > 0:
            print("\tUUID column contains empty entries! Attempting to fill them...")
            uuids_old = df["uuid"].copy()  # need values and not just reference, to check no uuids will be overwritten
            # A column without any value is read as float64; cast to object first so that the
            # hex strings can be stored without an incompatible-dtype warning/error.
            df["uuid"] = df["uuid"].astype(object)
            # fill only the empty rows with freshly generated uuids
            df.loc[is_missing, "uuid"] = [uuid.uuid4().hex for _ in range(n_na)]
            # assert already existing uuids were not changed and no gap is left
            assert (uuids_old[~is_missing] == df.loc[~is_missing, "uuid"]).all() and not df["uuid"].isna().any()
            files_changed.append(file_path)
            print("\tCompleted.")
        else:  # no NaN entries in uuid column.
            print("\tNo empty rows found. Skipping...")
            files_unchanged.append(file_path)
    else:  # uuid column does not exist
        print("\tUUID column not found. Creating one...")
        df["uuid"] = [uuid.uuid4().hex for _ in range(len(df))]
        files_changed.append(file_path)
        print("\tCompleted.")
    # Written back unconditionally; empty cells are stored as the text "NaN".
    df.to_excel(file_path, index=False, na_rep="NaN")

# Test: check that the above method does not change the Excel sheet contents (only the formatting)

In [None]:
# Folder that the test cells below walk for "grouping" files; the leading "./" makes it
# relative to the current working directory.
test_data_folder = "./tests/files/Data Documentation/"

In [None]:
# Collect the test workbooks: every file below test_data_folder whose name contains
# "grouping", except the "*_modified" copies that the test cell below writes.
test_files_lis = []
for root, dirs, files in os.walk(test_data_folder):
    for name in files:
        if "grouping" in name and "modified" not in name:
            if "~" in name: # "~" on windows is used for temporary files that are opened in excel
                # Reset the TEST list (not the production files_lis) so that no partial
                # list is left behind when the walk is aborted.
                test_files_lis = []
                raise Exception(f"Please close all excel files and try again. Found temporary file in:\n{os.path.join(root, name)}")
            fpath = os.path.join(root, name)
            print(fpath)
            test_files_lis.append(fpath)

In [None]:
# Dry run of the UUID filling on the test workbooks. The originals are never overwritten:
# the result is written to a "<name>_modified<ext>" copy, which is read back and compared
# cell by cell with the original.
test_files_changed = []
test_files_unchanged = []
for file_path in test_files_lis:
    print(file_path + ":")
    df = pd.read_excel(file_path)
    if "uuid" in df.columns:
        print("\tUUID column found")
        # rows that still need a UUID
        is_missing = df["uuid"].isna()
        n_na = int(is_missing.sum())
        if n_na > 0:
            print("\tUUID column contains empty entries! Attempting to fill them...")
            uuids_old = df["uuid"].copy()  # need values and not just reference, to check no uuids will be overwritten
            # A column without any value is read as float64; cast to object first so that the
            # hex strings can be stored without an incompatible-dtype warning/error.
            df["uuid"] = df["uuid"].astype(object)
            # fill only the empty rows with freshly generated uuids
            df.loc[is_missing, "uuid"] = [uuid.uuid4().hex for _ in range(n_na)]
            # assert already existing uuids were not changed and no gap is left
            assert (uuids_old[~is_missing] == df.loc[~is_missing, "uuid"]).all() and not df["uuid"].isna().any()
            test_files_changed.append(file_path)
            print("\tCompleted.")
        else:  # no NaN entries in uuid column.
            print("\tNo empty rows found. Skipping...")
            test_files_unchanged.append(file_path)
    else:  # uuid column does not exist
        print("\tUUID column not found. Creating one...")
        df["uuid"] = [uuid.uuid4().hex for _ in range(len(df))]
        test_files_changed.append(file_path)
        print("\tCompleted.")
    # Reference: the untouched original, re-read from disk.
    df2 = pd.read_excel(file_path)
    # Every column except "uuid" must be identical, cell by cell (NaN counts as equal to NaN).
    data_cols = [col for col in df2.columns if col != "uuid"]
    pd.testing.assert_frame_equal(df[data_cols], df2[data_cols])
    # UUIDs that were already present in the original must be preserved.
    if "uuid" in df2.columns:
        kept = df2["uuid"].notna()
        assert (df.loc[kept, "uuid"] == df2.loc[kept, "uuid"]).all()
    # write to new file and compare with original file
    fname, ext = os.path.splitext(file_path)
    fname = fname + "_modified" + ext
    df.to_excel(fname, index=False, na_rep="NaN")
    df3 = pd.read_excel(fname)
    # The Excel round trip may alter dtypes, so only the values are compared here.
    pd.testing.assert_frame_equal(df3[data_cols], df2[data_cols], check_dtype=False)
    # The written uuid column must be complete and match what was generated in memory.
    assert not df3["uuid"].isna().any() and (df3["uuid"] == df["uuid"]).all()

In [None]:
# Scratch: independent copy of df, i.e. of whatever frame the most recently run loop left behind.
df2 = df.copy()

In [None]:
# Scratch: show the column labels of df.
df.columns

In [None]:
# Scratch: selecting by the full list of column labels displays the whole frame.
df[df.columns]

In [None]:
# Scratch: DataFrame.all() collapses each column to a single boolean (are all values truthy?),
# so this compares those per-column flags of df and df2 -- not the individual cell values.
(df.all() == df2.all()).all()

In [None]:
# Scratch: split the first test file path into (path without extension, extension).
os.path.splitext(test_files_lis[0])