In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys

import pickle

import numpy as np
import pandas as pd

pd.options.display.float_format = "{:.5f}".format

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import PIL
from pillow_heif import register_heif_opener

register_heif_opener()

In [2]:
# Directory the notebook kernel was started in.
CWD = os.getcwd()
# Parent of the working directory; it is put on sys.path so `py8tb` can be imported.
PY8TB_PATH = os.path.dirname(CWD)
# Folder holding the data files this notebook reads.
SAVE_PATH = os.path.join(PY8TB_PATH, "data")
# Parquet file that is handed to preprocessing_pipeline.
DF_PATH = os.path.join(SAVE_PATH, "df_2024_06_30_18_08_37.parquet.gzip")

In [3]:
sys.path.insert(0, PY8TB_PATH)

In [4]:
from py8tb import preprocessing_pipeline

# Análisis y preprocesamiento de df

In [5]:
# Build the file table from the parquet file via the project's pipeline
# (implemented in py8tb; its exact steps are not visible in this notebook).
df = preprocessing_pipeline(path=DF_PATH)

In [6]:
# Keep only the rows whose FileType is "photo".
photos = df[df["FileType"] == "photo"]

In [7]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising.
# Only load this file if it was produced by a trusted run of this project.
with open(os.path.join(SAVE_PATH, "photos_sha256.pkl"), "rb") as pkl_file:
    photos_sha256 = pickle.load(pkl_file)

# Build the (FilePath, Sha256) lookup table once the file handle is released.
photos_sha256 = pd.DataFrame(data=photos_sha256, columns=["FilePath", "Sha256"])

In [8]:
# Sanity check: count hashes by string length (a SHA-256 hex digest is 64 characters).
photos_sha256["Sha256"].apply(len).value_counts()

Sha256
64    114951
Name: count, dtype: int64

In [9]:
photos.head()

Unnamed: 0,FilePath,CreationDate,LastModificationDate,SizeMB,FileName,FileExtension,FileType
54,/Volumes/MUPU_4TB_1/code/kaggle_datasets/bobby...,2023-01-07 16:59:12,2020-07-20 11:58:42,0.29537,bobby.png,.png,photo
61,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Denal...,2023-01-07 16:59:12,2020-01-25 15:33:18,0.0298,Denali Mt McKinley.jpg,.jpg,photo
67,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Figur...,2023-01-07 16:59:11,2020-04-18 20:21:12,0.02575,FigureAxesMPL.png,.png,photo
81,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Logis...,2023-01-07 16:59:09,2020-04-10 08:10:50,0.10171,LogisticRegression.jpg,.jpg,photo
86,/Volumes/MUPU_4TB_1/code/kaggle_datasets/MPLAr...,2023-01-07 17:00:12,2020-04-18 21:22:12,0.0258,MPLArchitecture.png,.png,photo


In [10]:
photos_sha256.head()

Unnamed: 0,FilePath,Sha256
0,/Volumes/MUPU_4TB_1/code/kaggle_datasets/bobby...,c299e1e38511f982498f8e181ca300fe3eddad8b241159...
1,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Denal...,d9cd8cba8cc65cd24e7eb9b6537462488a47e286d7d719...
2,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Figur...,7c3eff6b4237af20a75631867b93e91b13958e717637a6...
3,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Logis...,26048ccd2858c3e090fec613141fbf9cbcad7b5a87923f...
4,/Volumes/MUPU_4TB_1/code/kaggle_datasets/MPLAr...,290ebbce76730269474a23a2f588860862868c2d939534...


In [11]:
# Attach each photo's hash by file path; photos without a stored hash are kept
# (left join) and get a missing Sha256.
photos = photos.merge(photos_sha256, how="left", on="FilePath")

In [12]:
photos.head()

Unnamed: 0,FilePath,CreationDate,LastModificationDate,SizeMB,FileName,FileExtension,FileType,Sha256
0,/Volumes/MUPU_4TB_1/code/kaggle_datasets/bobby...,2023-01-07 16:59:12,2020-07-20 11:58:42,0.29537,bobby.png,.png,photo,c299e1e38511f982498f8e181ca300fe3eddad8b241159...
1,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Denal...,2023-01-07 16:59:12,2020-01-25 15:33:18,0.0298,Denali Mt McKinley.jpg,.jpg,photo,d9cd8cba8cc65cd24e7eb9b6537462488a47e286d7d719...
2,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Figur...,2023-01-07 16:59:11,2020-04-18 20:21:12,0.02575,FigureAxesMPL.png,.png,photo,7c3eff6b4237af20a75631867b93e91b13958e717637a6...
3,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Logis...,2023-01-07 16:59:09,2020-04-10 08:10:50,0.10171,LogisticRegression.jpg,.jpg,photo,26048ccd2858c3e090fec613141fbf9cbcad7b5a87923f...
4,/Volumes/MUPU_4TB_1/code/kaggle_datasets/MPLAr...,2023-01-07 17:00:12,2020-04-18 21:22:12,0.0258,MPLArchitecture.png,.png,photo,290ebbce76730269474a23a2f588860862868c2d939534...


In [13]:
# Pick one hash that occurs more than four times to serve as a worked example
# of a duplicated photo. random_state pins the pick so that Restart & Run All
# always inspects the same group.
SHA = (
    photos["Sha256"]
    .value_counts()
    .reset_index()
    .rename(columns={"count": "Count"})
    .pipe(lambda counts: counts[counts["Count"] > 4])
    .sample(1, random_state=42)["Sha256"]
    .iloc[0]
)

In [14]:
def get_matplotlib_figure_params(paths: list[str]) -> tuple[int, int]:
    """Choose a subplot grid large enough to hold one axes per path.

    Three columns are used when the number of paths is a multiple of three,
    otherwise two columns (with the row count rounded up for odd counts).

    Parameters
    ----------
    paths : list[str]
        The items to lay out; only the length is used.

    Returns
    -------
    tuple[int, int]
        ``(nr_rows, nr_columns)``. ``nr_rows`` is never smaller than 1, so the
        result is always a valid grid even for an empty list.
    """
    nr_paths = len(paths)
    nr_columns = 3 if nr_paths % 3 == 0 else 2

    # Integer ceiling division: enough rows to fit every path.
    nr_rows = -(-nr_paths // nr_columns)

    # An empty input would otherwise give 0 rows, which matplotlib rejects.
    return (max(nr_rows, 1), nr_columns)

In [15]:
def plot_duplicated_photos(sha: str, photos: pd.DataFrame) -> None:
    """Draw every photo whose ``Sha256`` value contains ``sha`` in one figure.

    Parameters
    ----------
    sha : str
        Hash text to look for; it is matched literally (not as a regex).
    photos : pd.DataFrame
        Frame with at least the ``Sha256`` and ``FilePath`` columns.

    Returns
    -------
    None
        The figure is created as a side effect. Nothing is drawn when no row
        matches.
    """
    # regex=False: the hash is plain text, not a pattern.
    # na=False: rows without a hash never match (a NaN mask cannot index a frame).
    matches = photos["Sha256"].str.contains(sha, regex=False, na=False)
    paths = photos.loc[matches, "FilePath"].tolist()

    if not paths:
        # Nothing to show; avoid asking matplotlib for an empty grid.
        return

    nr_rows, nr_columns = get_matplotlib_figure_params(paths=paths)

    fig = plt.figure(figsize=(15, 5))
    axes = fig.subplots(nr_rows, nr_columns).flatten()

    for path, ax in zip(paths, axes):
        # Close each image file as soon as its pixels have been handed to matplotlib.
        with PIL.Image.open(path) as image:
            ax.imshow(image)

    # The grid may have more cells than photos; hide the empty ones.
    for unused_ax in axes[len(paths):]:
        unused_ax.set_axis_off()

In [16]:
# Exact match on the full hash (a regex substring search is neither needed nor NaN-safe).
photos[photos["Sha256"] == SHA]

Unnamed: 0,FilePath,CreationDate,LastModificationDate,SizeMB,FileName,FileExtension,FileType,Sha256
78099,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:44,2019-08-28 04:10:45,0.01202,T1-XIAOMI -2340x4160-116600016.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
78284,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:47,2019-08-28 03:56:01,0.01202,T1-XIAOMI -2340x4160-231538608.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
78540,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:50,2019-08-28 04:23:22,0.01202,T1-XIAOMI -2340x4160-31304512.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
78759,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:53,2019-08-28 03:40:50,0.01202,T1-XIAOMI -2340x4160-363181432.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
78864,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:57,2019-08-28 03:30:57,0.01202,T1-XIAOMI -2340x4160-490373448.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
79045,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:57,2019-08-28 04:18:41,0.01202,T1-XIAOMI -2340x4160-63155992.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...
79196,/Volumes/MUPU_4TB_1/MUPU 500GB/Fotos/T1-XIAOMI...,2021-08-07 21:55:59,2019-08-28 04:17:31,0.01202,T1-XIAOMI -2340x4160-68381408.jpg,.jpg,photo,67d68fcb23b8559f36f6b9febdd0d25834803ef304b275...


In [17]:
SHA

'67d68fcb23b8559f36f6b9febdd0d25834803ef304b275ccfd1745cf32fe6ab2'

In [18]:
# Every row of `photos` is one photo file, so the row count is the total.
total_photos = len(photos)

In [19]:
# Number of distinct hashes (missing hashes are not counted).
unique_photos = photos["Sha256"].nunique()

In [20]:
duplicates_photos = total_photos - unique_photos

In [21]:
# Constant column of ones; its cumulative sum within a hash group numbers the rows of that group.
photos["DummyValue"] = 1

In [22]:
# Group by the column name itself (not a one-element list) so a plain string
# is an unambiguous group key for get_group.
photos.groupby("Sha256").get_group(SHA)["DummyValue"].cumsum()

78099    1
78284    2
78540    3
78759    4
78864    5
79045    6
79196    7
Name: DummyValue, dtype: int64

In [23]:
%%timeit
photos["CumSum"] = photos.groupby(["Sha256"])["DummyValue"].transform(
    lambda series: series.cumsum()
)

2.74 s ± 36.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
# Plain Python lists of hashes and paths, in the same row order.
sha256_list = photos["Sha256"].tolist()
path_list = photos["FilePath"].tolist()

In [25]:
assert len(sha256_list) == len(path_list)

In [26]:
%%timeit

dict_sha256_to_save = {}
dict_sha256_to_delete = {}

for sha256_, path_ in zip(sha256_list, path_list):
    if sha256_ not in dict_sha256_to_save.keys():
        dict_sha256_to_save[sha256_] = path_
    else:
        dict_sha256_to_delete[path_] = sha256_

13.7 ms ± 152 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
len(dict_sha256_to_save)

NameError: name 'dict_sha256_to_save' is not defined

In [None]:
len(dict_sha256_to_delete)

In [None]:
unique_photos_df = photos[photos["CumSum"] == 1]

In [None]:
unique_photos_df.shape

In [None]:
unique_photos_df

In [None]:
unique_photos

In [None]:
# Later copies only: CumSum > 1. Rows with a missing CumSum (no hash available)
# are deliberately NOT treated as duplicates, which `!= 1` would have done.
duplicates_photos_df = photos[photos["CumSum"] > 1]

In [None]:
duplicates_photos_df.shape

In [None]:
duplicates_photos

In [None]:
duplicates_photos

In [None]:
duplicates_photos_df["FilePath"].iloc[0]

In [None]:
# os.remove(duplicates_photos_df["FilePath"].iloc[0])

In [None]:
# The function only draws and returns None, so there is nothing to bind.
plot_duplicated_photos(sha=SHA, photos=photos)

In [None]:
def parse_date(filename, split_by_text):
    """Extract a ``YYYYMMDD`` date that follows a marker in a file name.

    Parameters
    ----------
    filename : str
        File name to inspect, e.g. ``"IMG_20200101_123456.jpg"``.
    split_by_text : str
        Marker that precedes the date, e.g. ``"IMG_"``, ``"IMG-"`` or
        ``"Screenshot_"``.

    Returns
    -------
    str
        The eight-digit date when one is found, otherwise ``filename``
        unchanged (also when the marker does not occur in the name).
    """
    pieces = filename.split(split_by_text)
    if len(pieces) < 2:
        # Marker not present: nothing to parse.
        return filename

    # Text between the marker and the first dot after it.
    candidate = pieces[1].split(".")[0]

    if split_by_text in ("IMG_", "IMG-"):
        prefix = candidate[:8]
        # Only accept eight digits; other text is not a date.
        if len(prefix) == 8 and prefix.isdigit():
            return prefix

    if "Screenshot_" in filename:
        # Handles both "2020-01-01-..." and "20200101-..." layouts: drop the
        # dashes from the leading characters, then keep exactly eight digits.
        digits = candidate[:10].replace("-", "")[:8]
        if len(digits) == 8 and digits.isdigit():
            return digits

    return filename

In [None]:
def extract_creation_date_from_photo_name(row):
    """Derive a date string from the ``FileName`` of one table row.

    The first marker among ``"IMG_"``, ``"IMG-"`` and ``"Screenshot_"`` that
    occurs in the file name decides how the name is handed to ``parse_date``.

    Parameters
    ----------
    row : mapping
        Row providing a ``"FileName"`` entry (for example a row passed by
        ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        Whatever ``parse_date`` returns for the matching marker, or the file
        name unchanged when none of the markers is present.
    """
    filename = row["FileName"]

    # Order matters: it mirrors the precedence IMG_ > IMG- > Screenshot_.
    for marker in ("IMG_", "IMG-", "Screenshot_"):
        if marker in filename:
            return parse_date(filename=filename, split_by_text=marker)

    return filename

In [None]:
def parse_date_with_regex(filename):
    """Return the first date-like token found in a file name.

    A token is either eight consecutive digits or ``dddd-dd-dd``; dashes are
    removed from the result.

    Parameters
    ----------
    filename : str
        File name to search.

    Returns
    -------
    str
        The matched token without dashes, or ``filename`` unchanged when
        nothing matches.
    """
    import re

    # Raw string: "\d" inside a normal string literal is an invalid escape sequence.
    match = re.search(r"\d{8}|\d{4}-\d{2}-\d{2}", filename)

    if match is None:
        return filename

    # Removing dashes is a no-op for the eight-digit form.
    return match.group(0).replace("-", "")

In [None]:
# Fixed random_state so the 50 sampled photos are the same on every run.
sdf = df[df["FileType"] == "photo"].sample(50, random_state=42)

In [None]:
sdf["ParseCreationDate"] = sdf.apply(extract_creation_date_from_photo_name, axis=1)

In [None]:
sdf["ParseCreationDateWithRegex"] = sdf["FileName"].apply(parse_date_with_regex)

In [None]:
sdf

In [None]:
# Per-extension file count and share side by side, plus the running total of
# the share (extensions are ordered from most to least frequent).
top_file_extesions = pd.concat(
    [df["FileExtension"].value_counts(normalize=as_share) for as_share in (False, True)],
    axis=1,
).assign(CumSum=lambda table: table["proportion"].cumsum())

In [None]:
# The top 20 file extensions were used to build TOP_FILE_EXTENSIONS,
# which maps each extension to a file type (audio, video, photo, etc.).
top_file_extesions.head(20)

In [None]:
# File count and total size per (extension, type), with each extension's share
# of its type, restricted to videos and photos.
# Reductions are named by string ("size", "sum"): passing np.sum / len as
# callables to groupby agg/transform is deprecated in recent pandas.
file_size_and_counter_by_type = (
    df.groupby(["FileExtension", "FileType"])
    .agg(
        NrFiles=("FilePath", "size"),
        TotalSizeMB=("SizeMB", "sum"),
        TotalSizeGB=("SizeMB", lambda size_mb: size_mb.sum() / 1024),  # MB -> GB
    )
    .reset_index()
    .assign(
        # Totals per file type, broadcast back onto every extension row.
        NrFilesByType=lambda stats: stats.groupby("FileType")["NrFiles"].transform(
            "sum"
        ),
        TotalSizeMBByType=lambda stats: stats.groupby("FileType")[
            "TotalSizeMB"
        ].transform("sum"),
        # Share of each extension within its file type.
        PctNrFilesByType=lambda stats: stats["NrFiles"] / stats["NrFilesByType"],
        PctSizeMBByType=lambda stats: stats["TotalSizeMB"] / stats["TotalSizeMBByType"],
    )
    .query("FileType == 'video' or FileType == 'photo'")
    .sort_values("FileType", ascending=False)
)

In [None]:
file_size_and_counter_by_type