In [None]:
import os
import re
import pandas as pd
import numpy as np
from collections import Counter

DATA_DIR = "datasets-p2"
OUT_DIR = "dataset-diff-analysis"
pattern = re.compile(r"Financial_Records.*\.csv")


def flag_cell_differences(datasets, precision=4):
    """Flag every cell whose value is not identical across all datasets.

    The first dataset is the reference: its row count and columns define the
    grid that is compared. Every dataset is read with ``.at[row, col]`` for
    ``row`` in ``range(len(reference))``, so a dataset lacking one of those
    row labels or columns raises ``KeyError``.

    For each cell, if every value converts to ``float`` the values are
    compared after rounding to ``precision`` decimals; otherwise the raw
    values are compared. NaN is treated as a single "missing" value, so a
    cell that is missing in every dataset counts as unchanged.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
        Frames to compare; must contain at least one frame.
    precision : int, default 4
        Number of decimals kept when comparing numeric cells.

    Returns
    -------
    (pandas.DataFrame, list of int)
        A 0/1 frame shaped like the reference (1 = the cell differs), and for
        each dataset the number of differing cells where its value is shared
        with no other dataset.

    Raises
    ------
    ValueError
        If ``datasets`` is empty.
    """
    if not datasets:
        raise ValueError("at least one dataset is required")

    reference = datasets[0]
    # NaN != NaN, so each NaN would otherwise become its own Counter key.
    missing = object()

    def comparison_keys(cell_values):
        # Numeric path only when *every* value converts to float.
        try:
            numbers = [float(v) for v in cell_values]
        except (TypeError, ValueError, OverflowError):
            return [
                missing if isinstance(v, float) and v != v else v
                for v in cell_values
            ]
        return [missing if x != x else round(x, precision) for x in numbers]

    row_labels = range(len(reference))
    flags = pd.DataFrame(0, index=row_labels, columns=reference.columns)
    unique_counts = [0] * len(datasets)

    for row in row_labels:
        for col in reference.columns:
            keys = comparison_keys([df.at[row, col] for df in datasets])
            tally = Counter(keys)
            if len(tally) > 1:
                flags.at[row, col] = 1
                # Look up the same (rounded) key that was counted.
                for i, key in enumerate(keys):
                    if tally[key] == 1:
                        unique_counts[i] += 1

    return flags, unique_counts


# In a Jupyter/IPython kernel __name__ is "__main__", so this cell still runs
# in full and defines the same module-level names as before. The guard only
# lets the helper above be imported without touching the filesystem.
if __name__ == "__main__":
    filenames = [os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if pattern.match(f)]
    filenames.sort()
    print(f'All file names: {[filename for filename in filenames]}')
    if not filenames:
        raise FileNotFoundError(
            f"No files matching {pattern.pattern!r} found in {DATA_DIR!r}"
        )

    raw_datasets = [pd.read_csv(fname) for fname in filenames]
    dataset_names = [os.path.basename(fname) for fname in filenames]

    # Keep only the first occurrence of each distinct dataset.
    unique_datasets = []
    unique_names = []
    for name, ds in zip(dataset_names, raw_datasets):
        if not any(ds.equals(u) for u in unique_datasets):
            unique_datasets.append(ds)
            unique_names.append(name)

    num_unique = len(unique_datasets)
    ref_df = unique_datasets[0]
    rows, cols = ref_df.shape
    columns = ref_df.columns

    diff_map, uniqueness_counts = flag_cell_differences(unique_datasets)

    total_cells = rows * len(columns)
    # Count the flagged cells (the previous mask selected the unflagged ones).
    modified_cells = int((diff_map == 1).to_numpy().sum())
    percent_modified = (
        round((modified_cells / total_cells) * 100, 4) if total_cells else 0.0
    )

    summary_df = pd.DataFrame({
        "Metric": ["Total Cells", "Modified Cells", "Percent Modified"],
        "Value": [total_cells, modified_cells, percent_modified]
    })

    unique_df = pd.DataFrame({
        "Dataset": unique_names,
        "Unique Cell Contributions": uniqueness_counts
    }).sort_values(by="Unique Cell Contributions", ascending=False)

    # The output root may not exist on the first run.
    os.makedirs(OUT_DIR, exist_ok=True)

    # Only "<num_unique>_v<integer>" directories take part in versioning;
    # anything else (e.g. "5_vdraft") is ignored instead of crashing int().
    version_dir = re.compile(rf"{num_unique}_v(\d+)")
    existing_versions = []
    for d in os.listdir(OUT_DIR):
        version_match = version_dir.fullmatch(d)
        if version_match and os.path.isdir(os.path.join(OUT_DIR, d)):
            existing_versions.append(int(version_match.group(1)))

    next_version = max(existing_versions) + 1 if existing_versions else 1
    save_dir = os.path.join(OUT_DIR, f"{num_unique}_v{next_version}")
    os.makedirs(save_dir, exist_ok=True)

    summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
    diff_map.to_csv(os.path.join(save_dir, "diff_map.csv"), index=False)
    unique_df.to_csv(os.path.join(save_dir, "uniqueness.csv"), index=False)

All file names: ['datasets-p2\\Financial_Records.csv', 'datasets-p2\\Financial_Records_Bob.csv', 'datasets-p2\\Financial_Records_Bob_Nemanja_Saveski.csv', 'datasets-p2\\Financial_Records_Bob_Sabina_Khazari.csv', 'datasets-p2\\Financial_Records_Bob_Thomas_Senstyler.csv']


In [3]:
import os
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

DATA_DIR = "datasets-p2"
OUT_DIR = "dataset-diff-analysis"
pattern = re.compile(r"Financial_Records.*\.csv")


def count_cell_variants(datasets, precision=4):
    """Count, per cell, how many distinct values appear across the datasets.

    The first dataset is the reference: its row count and columns define the
    grid that is compared. Every dataset is read with ``.at[row, col]`` for
    ``row`` in ``range(len(reference))``, so a dataset lacking one of those
    row labels or columns raises ``KeyError``.

    For each cell, if every value converts to ``float`` the values are
    compared after rounding to ``precision`` decimals; otherwise the raw
    values are compared. NaN is treated as a single "missing" value, so a
    cell that is missing in every dataset has exactly one variant.

    Parameters
    ----------
    datasets : list of pandas.DataFrame
        Frames to compare; must contain at least one frame.
    precision : int, default 4
        Number of decimals kept when comparing numeric cells.

    Returns
    -------
    (pandas.DataFrame, list of int)
        A frame shaped like the reference holding the number of distinct
        values per cell (1 = all datasets agree), and for each dataset the
        number of disagreeing cells where its value is shared with no other
        dataset.

    Raises
    ------
    ValueError
        If ``datasets`` is empty.
    """
    if not datasets:
        raise ValueError("at least one dataset is required")

    reference = datasets[0]
    # NaN != NaN, so each NaN would otherwise be counted as its own variant.
    missing = object()

    def variant_keys(cell_values):
        # Numeric path only when *every* value converts to float.
        try:
            numbers = [float(v) for v in cell_values]
        except (TypeError, ValueError, OverflowError):
            return [
                missing if isinstance(v, float) and v != v else v
                for v in cell_values
            ]
        return [missing if x != x else round(x, precision) for x in numbers]

    row_labels = range(len(reference))
    variant_map = pd.DataFrame(0, index=row_labels, columns=reference.columns)
    unique_counts = [0] * len(datasets)

    for row in row_labels:
        for col in reference.columns:
            keys = variant_keys([df.at[row, col] for df in datasets])
            tally = Counter(keys)
            variant_map.at[row, col] = len(tally)
            if len(tally) > 1:
                # Look up the same (rounded) key that was counted.
                for i, key in enumerate(keys):
                    if tally[key] == 1:
                        unique_counts[i] += 1

    return variant_map, unique_counts


# In a Jupyter/IPython kernel __name__ is "__main__", so this cell still runs
# in full and defines the same module-level names as before. The guard only
# lets the helper above be imported without touching the filesystem.
if __name__ == "__main__":
    filenames = [os.path.join(DATA_DIR, f) for f in os.listdir(DATA_DIR) if pattern.match(f)]
    filenames.sort()
    print(f'All file names: {[filename for filename in filenames]}')
    if not filenames:
        raise FileNotFoundError(
            f"No files matching {pattern.pattern!r} found in {DATA_DIR!r}"
        )

    raw_datasets = [pd.read_csv(fname) for fname in filenames]
    dataset_names = [os.path.basename(fname) for fname in filenames]

    # Keep only the first occurrence of each distinct dataset.
    unique_datasets = []
    unique_names = []
    for name, ds in zip(dataset_names, raw_datasets):
        if not any(ds.equals(u) for u in unique_datasets):
            unique_datasets.append(ds)
            unique_names.append(name)

    num_unique = len(unique_datasets)
    ref_df = unique_datasets[0]
    rows, cols = ref_df.shape
    columns = ref_df.columns

    # diff_map holds the number of distinct values per cell (1 = unchanged).
    diff_map, uniqueness_counts = count_cell_variants(unique_datasets)

    total_cells = rows * len(columns)
    modified_cells = int((diff_map > 1).to_numpy().sum())
    percent_modified = (
        round((modified_cells / total_cells) * 100, 4) if total_cells else 0.0
    )

    summary_df = pd.DataFrame({
        "Metric": ["Total Cells", "Modified Cells", "Percent Modified"],
        "Value": [total_cells, modified_cells, percent_modified]
    })

    unique_df = pd.DataFrame({
        "Dataset": unique_names,
        "Unique Cell Contributions": uniqueness_counts
    }).sort_values(by="Unique Cell Contributions", ascending=False)

    # The output root may not exist on the first run.
    os.makedirs(OUT_DIR, exist_ok=True)

    # Only "<num_unique>_v<integer>" directories take part in versioning;
    # anything else (e.g. "5_vdraft") is ignored instead of crashing int().
    version_dir = re.compile(rf"{num_unique}_v(\d+)")
    existing_versions = []
    for d in os.listdir(OUT_DIR):
        version_match = version_dir.fullmatch(d)
        if version_match and os.path.isdir(os.path.join(OUT_DIR, d)):
            existing_versions.append(int(version_match.group(1)))

    next_version = max(existing_versions) + 1 if existing_versions else 1
    save_dir = os.path.join(OUT_DIR, f"{num_unique}_v{next_version}")
    os.makedirs(save_dir, exist_ok=True)

    summary_df.to_csv(os.path.join(save_dir, "summary.csv"), index=False)
    diff_map.to_csv(os.path.join(save_dir, "diff_map.csv"), index=False)
    unique_df.to_csv(os.path.join(save_dir, "uniqueness.csv"), index=False)

All file names: ['datasets-p2\\Financial_Records.csv', 'datasets-p2\\Financial_Records_Bob.csv', 'datasets-p2\\Financial_Records_Bob_Nemanja_Saveski.csv', 'datasets-p2\\Financial_Records_Bob_Sabina_Khazari.csv', 'datasets-p2\\Financial_Records_Bob_Thomas_Senstyler.csv']
