In [None]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys

import pickle

import numpy as np
import pandas as pd

pd.options.display.float_format = "{:.5f}".format

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import PIL

from pillow_heif import register_heif_opener

register_heif_opener()

from IPython.display import Video

In [None]:
# Directory the kernel was started in; the other paths are derived from it.
CWD = os.getcwd()
# Data lives in "<parent of CWD>/data", i.e. next to the working directory.
SAVE_PATH = os.path.join(os.path.split(CWD)[0], "data")

In [None]:
# Newest "df_*.csv" export in SAVE_PATH, chosen by reverse-sorting the names.
# A plain `".csv" in name` test is too loose: it also matches names such as the
# "results_df*.csv" file referenced by RESULTS_PATH below, and "results" sorts
# after "df_" in a reverse sort, so that file would be picked instead.
# NOTE(review): assumes the timestamp embedded in the file name sorts
# chronologically (as in "df_2025_09_25_19_33_18.csv") - confirm.
DF_FILE_NAME = sorted(
    [f for f in os.listdir(SAVE_PATH) if f.startswith("df_") and f.endswith(".csv")],
    reverse=True,
)[0]

# The notebook is pinned to one specific snapshot, so DF_PATH is deliberately
# NOT derived from DF_FILE_NAME. To follow the newest export instead, use
# os.path.join(SAVE_PATH, DF_FILE_NAME).
DF_PATH = os.path.join(SAVE_PATH, "df_2025_09_25_19_33_18.csv")
RESULTS_PATH = os.path.join(SAVE_PATH, "results_df2025_09_25_19_33_18.csv")

# Análisis y preprocesamiento de df

In [None]:
# Read the export at DF_PATH and keep only the rows whose FileType is a video
# or a photo.
df = pd.read_csv(DF_PATH).loc[
    lambda frame: frame["FileType"].isin(["video", "photo"])
]

In [None]:
# Display the filtered DataFrame as this cell's output.
df

In [None]:
# Summary per (FileExtension, FileType): number of files and total size, plus
# each extension's share of the files / size within its FileType.
#
# Reductions are named with pandas string aliases ("size", "sum") rather than
# `len` / `np.sum`: passing NumPy callables to agg/transform is deprecated in
# pandas >= 2.1 (FutureWarning), while the aliases select the built-in group
# reductions directly.
file_size_and_counter_by_type = (
    df.groupby(["FileExtension", "FileType"])
    .agg(
        NrFiles=("FilePath", "size"),
        TotalSizeMB=("SizeMB", "sum"),
        TotalSizeGB=("SizeMB", lambda sizes_mb: sizes_mb.sum() / 1024),  # MB -> GB
    )
    .reset_index()
    .assign(
        # Totals per FileType, repeated on every extension row of that type.
        NrFilesByType=lambda summary: summary.groupby("FileType")["NrFiles"].transform(
            "sum"
        ),
        TotalSizeMBByType=lambda summary: summary.groupby("FileType")[
            "TotalSizeMB"
        ].transform("sum"),
        # Plain ratios (not multiplied by 100), despite the "Pct" prefix.
        PctNrFilesByType=lambda summary: summary["NrFiles"] / summary["NrFilesByType"],
        PctSizeMBByType=lambda summary: summary["TotalSizeMB"]
        / summary["TotalSizeMBByType"],
    )
    .query("FileType == 'video' or FileType == 'photo'")
    .sort_values("FileType", ascending=False)
)

file_size_and_counter_by_type

In [None]:
# Pick, at random, one hash that occurs at least twice among the video/photo
# rows; it is used as the example duplicate in the cells below.
#
# The counts are kept as a Series (hash -> occurrences) instead of going through
# value_counts().reset_index(): the name of the resulting counts column differs
# between pandas versions ("count" only exists from pandas 2.0 on), so relying
# on it raises a KeyError on older installations.
sha_counts = df.loc[df["FileType"].isin(["video", "photo"]), "Sha256"].value_counts()
duplicated_shas = sha_counts[sha_counts >= 2]

if duplicated_shas.empty:
    # Fail with an explicit message rather than the generic error sample() raises
    # on an empty input.
    raise ValueError("No Sha256 value occurs more than once: nothing to inspect.")

# The draw is intentionally unseeded (a different example on every run); pass
# random_state=<int> to sample() to always inspect the same duplicate.
SHA = duplicated_shas.sample(1).index[0]

In [None]:
def get_matplotlib_figure_params(paths: list[str]) -> tuple[int, int]:
    """Choose a ``(rows, columns)`` subplot grid for ``len(paths)`` images.

    Three columns are used when the number of paths is a multiple of three
    (which includes zero); otherwise two columns are used. The number of rows
    is the smallest one that gives every path a slot.

    Args:
        paths: Paths of the images to lay out; only the length is used.

    Returns:
        A ``(number_of_rows, number_of_columns)`` tuple.
    """
    count = len(paths)
    columns = 3 if count % 3 == 0 else 2
    # Ceiling division using integers only: an odd count in a two-column grid
    # needs one extra, half-filled row.
    rows = -(-count // columns)
    return (rows, columns)

In [None]:
def plot_duplicated_photos(sha: str, photos: pd.DataFrame) -> None:
    """Show, in one figure, every image whose hash matches ``sha``.

    Args:
        sha: Hash to look for. It is matched as a literal substring of the
            ``Sha256`` column.
        photos: Frame providing at least the ``Sha256`` and ``FilePath`` columns.
            Every matching ``FilePath`` is opened as an image.

    Raises:
        ValueError: If no row of ``photos`` matches ``sha``.
    """
    # regex=False: the hash is literal text, not a pattern.
    # na=False: rows without a hash count as "no match"; otherwise the mask can
    # contain NaN, which boolean indexing rejects.
    matches = photos["Sha256"].str.contains(sha, regex=False, na=False)
    paths = photos.loc[matches, "FilePath"].tolist()
    if not paths:
        raise ValueError(f"No file found for sha {sha!r}")

    nr_rows, nr_columns = get_matplotlib_figure_params(paths=paths)

    # 5 inches of height per row, so images do not shrink as the grid grows
    # (a single-row grid keeps the 15x5 size).
    fig = plt.figure(figsize=(15, 5 * nr_rows))
    # squeeze=False always returns a 2-D array of Axes, so flatten() is valid
    # for any grid shape.
    axes = fig.subplots(nr_rows, nr_columns, squeeze=False).flatten()

    for path, ax in zip(paths, axes):
        # Close the file handle as soon as the image has been handed to imshow.
        with PIL.Image.open(path) as image:
            ax.imshow(image)

    # Hide the slots left over when the grid has more cells than images.
    for unused_ax in axes[len(paths):]:
        unused_ax.set_axis_off()

In [None]:
# All rows carrying the sampled hash. regex=False matches SHA as literal text;
# na=False treats rows without a hash as "no match" so the mask never holds NaN.
df[df["Sha256"].str.contains(SHA, regex=False, na=False)]

In [None]:
# First row carrying the sampled hash; its path and type are used by the
# preview cells below. The filter is evaluated once and both values are read
# from the same row. regex=False / na=False: literal match, and rows without a
# hash count as "no match" so the mask never holds NaN.
first_match_ = df[df["Sha256"].str.contains(SHA, regex=False, na=False)].iloc[0]
path_ = first_match_["FilePath"]
file_type_ = first_match_["FileType"]

In [None]:
# Show the path of the file selected as the example duplicate.
path_

In [None]:
# Photos are previewed as a grid holding every copy that shares the sampled hash.
if "photo" == file_type_:
    plot_duplicated_photos(photos=df, sha=SHA)

In [None]:
# Embed a player only when the sampled duplicate is a video; for a photo the
# expression evaluates to None, so the cell shows nothing instead of wrapping an
# image file in a video player.
Video(path_, embed=True, width=320, height=320) if file_type_ == "video" else None

In [None]:
# Split the files into "keep" and "delete" sets by hash, in DataFrame order:
# the first path seen for a hash is kept, every later path with the same hash
# is marked for deletion.
sha256_list = df["Sha256"].tolist()
path_list = df["FilePath"].tolist()

dict_sha256_to_save = {}  # sha256 -> path of the copy that is kept
dict_sha256_to_delete = {}  # path of a redundant copy -> its sha256

# The loop variables are deliberately not called `path_`: that name holds the
# example file chosen earlier in the notebook, and rebinding it here would make
# the preview cells show a different file when they are re-run after this cell.
for file_sha256, file_path in zip(sha256_list, path_list):
    if file_sha256 not in dict_sha256_to_save:
        dict_sha256_to_save[file_sha256] = file_path
    else:
        dict_sha256_to_delete[file_path] = file_sha256

In [None]:
# Number of distinct hashes, i.e. one kept file per hash.
len(dict_sha256_to_save)

In [None]:
# Number of distinct paths marked for deletion (redundant copies).
len(dict_sha256_to_delete)