In [3]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys

import pickle

import numpy as np
import pandas as pd

pd.options.display.float_format = "{:.5f}".format

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import PIL
from pillow_heif import register_heif_opener
register_heif_opener()

In [2]:
# Filesystem locations, all derived from the current working directory.
CWD = os.getcwd()
PY8TB_PATH = os.path.split(CWD)[0]  # parent of the working directory
SAVE_PATH = os.path.join(PY8TB_PATH, "data")
DF_PATH = os.path.join(PY8TB_PATH, "data", "df_2024_06_30_18_08_37.parquet.gzip")

In [None]:
# Expose PY8TB_PATH to the import system (the next cell imports from `py8tb`).
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate entries on sys.path.
if PY8TB_PATH not in sys.path:
    sys.path.insert(0, PY8TB_PATH)

In [None]:
from py8tb import preprocessing_pipeline

# Análisis y preprocesamiento de df

In [None]:
df = preprocessing_pipeline(path=DF_PATH)

In [None]:
photos = df[df["FileType"] == "photo"]

In [None]:
with open(os.path.join(SAVE_PATH, "photos_sha256.pkl"), "rb") as f:
    photos_sha256 = pickle.load(f)
    photos_sha256 = pd.DataFrame(data=photos_sha256, columns=["FilePath", "Sha256"])

In [None]:
photos_sha256["Sha256"].apply(len).value_counts()

In [None]:
photos.head()

In [None]:
photos_sha256.head()

In [None]:
photos = pd.merge(
    left=photos,
    right=photos_sha256,
    how="left",
    right_on="FilePath",
    left_on="FilePath",
)

In [None]:
photos.head()

In [None]:
# Draw at random one SHA-256 digest that is shared by more than one photo.
# (To restrict the draw to HEIC files, filter `photos` on
# FileExtension == ".heic" before counting.)
SHA = (
    photos["Sha256"]
    .value_counts()
    .loc[lambda counts: counts > 1]
    .sample(1)
    .index[0]
)

In [None]:
def get_matplotlib_figure_params(paths:list[str]) -> tuple[int, int]:
    """Choose a (rows, columns) subplot grid for the given image paths.

    Three columns are used when the number of paths is a multiple of three;
    otherwise two columns are used, with one extra row when the count is odd.

    Parameters
    ----------
    paths : list[str]
        Image paths to lay out; only the length is used.

    Returns
    -------
    tuple[int, int]
        ``(number_of_rows, number_of_columns)``.
    """
    total = len(paths)

    if total % 3 == 0:
        return (total // 3, 3)

    # Two-column layout: an odd count needs one more (half-filled) row.
    full_rows, leftover = divmod(total, 2)
    return (full_rows + leftover, 2)

In [None]:
def plot_duplicated_photos(sha:str, photos:pd.DataFrame) -> None:
    """Show, side by side, every photo whose SHA-256 digest equals ``sha``.

    Parameters
    ----------
    sha : str
        Digest to look up in the ``Sha256`` column.
    photos : pd.DataFrame
        Frame providing the ``Sha256`` and ``FilePath`` columns.

    Returns
    -------
    None
        The images are drawn on a new matplotlib figure. Nothing is drawn
        when no row carries the requested digest.
    """
    # Exact comparison instead of ``str.contains``: a digest is an identifier,
    # not a regex/substring, and ``==`` yields False (never NaN) on rows that
    # have no digest, so the mask is always a valid boolean indexer.
    paths = photos.loc[photos["Sha256"] == sha, "FilePath"].tolist()
    if not paths:
        # A zero-row subplot grid is rejected by matplotlib; nothing to show.
        return

    nr_rows, nr_columns = get_matplotlib_figure_params(paths=paths)

    # plotting with matplotlib
    fig = plt.figure(figsize=(15, 5))
    # squeeze=False always returns a 2-D array of axes, so flatten() is valid
    # for every grid shape.
    axes = fig.subplots(nr_rows, nr_columns, squeeze=False).flatten()

    for path, ax in zip(paths, axes):
        # The pixels are handed to imshow inside the context manager, so the
        # underlying file handle is released as soon as the image is drawn.
        with PIL.Image.open(path) as image:
            ax.imshow(image)

    # The grid can hold more cells than there are photos; hide the spare ones.
    for unused_ax in axes[len(paths):]:
        unused_ax.axis("off")

In [None]:
# Rows sharing the selected digest. Exact equality is NaN-safe (rows without a
# digest compare False) and does not treat the digest as a regex/substring.
photos[photos["Sha256"] == SHA]

In [None]:
# File paths of the photos sharing the selected digest (exact, NaN-safe match).
photos.loc[photos["Sha256"] == SHA, "FilePath"].tolist()

In [None]:
# paths = plot_duplicated_photos(
#     sha = SHA,
#     photos = photos
# )

In [None]:
def parse_date(filename, split_by_text):
    """Extract a ``YYYYMMDD`` date that follows a known prefix in a file name.

    Parameters
    ----------
    filename : str
        File name to inspect, e.g. ``"IMG_20240630_180837.jpg"``.
    split_by_text : str
        Prefix that precedes the date (``"IMG_"``, ``"IMG-"`` or
        ``"Screenshot_"``).

    Returns
    -------
    str
        The eight-digit date when one is found right after the prefix;
        otherwise ``filename`` unchanged.
    """
    pieces = filename.split(split_by_text)
    if len(pieces) < 2:
        # Prefix not present: nothing to parse (previously an IndexError).
        return filename

    # Text between the prefix and the first dot (or the next prefix occurrence).
    stem = pieces[1].split(".")[0]

    if split_by_text in ("IMG_", "IMG-"):
        candidate = stem[:8]
    elif "Screenshot_" in filename:
        # Handles both "2024-06-30..." and "20240630..." spellings: strip the
        # dashes from the first ten characters, then keep eight characters so
        # the undashed form does not pick up a ninth one.
        candidate = stem[:10].replace("-", "")[:8]
    else:
        return filename

    # Only accept a full run of eight digits; anything else is not a date.
    if len(candidate) == 8 and candidate.isdigit():
        return candidate
    return filename

In [None]:
def extract_creation_date_from_photo_name(row):
    """Derive a creation date from the ``FileName`` field of a row.

    The file name is checked for the prefixes ``"IMG_"``, ``"IMG-"`` and
    ``"Screenshot_"`` (in that order); the first one found is passed to
    ``parse_date``.

    Parameters
    ----------
    row : mapping
        Any object supporting ``row["FileName"]`` (e.g. a DataFrame row).
        Only ``FileName`` is read.

    Returns
    -------
    str
        Whatever ``parse_date`` returns for the matching prefix, or the file
        name unchanged when no known prefix is present.
    """
    filename = row["FileName"]

    for prefix in ("IMG_", "IMG-", "Screenshot_"):
        if prefix in filename:
            return parse_date(filename=filename, split_by_text=prefix)

    return filename

In [None]:
def parse_date_with_regex(filename):
    """Return the first date-like token found in ``filename`` as ``YYYYMMDD``.

    A token is either eight consecutive digits or a ``YYYY-MM-DD`` group;
    dashes are removed from the latter.

    Parameters
    ----------
    filename : str
        File name to search.

    Returns
    -------
    str
        The eight-digit token, or ``filename`` unchanged when none is found.
    """
    import re  # function-scope import, as in the original cell

    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    match = re.search(r"\d{8}|\d{4}-\d{2}-\d{2}", filename)
    if match is None:
        return filename

    # No-op for the eight-digit form, strips the dashes of YYYY-MM-DD.
    return match.group(0).replace("-", "")

In [None]:
sdf = df[(df["FileType"] == "photo")].sample(50)

In [None]:
sdf["ParseCreationDate"] = sdf.apply(extract_creation_date_from_photo_name, axis=1)

In [None]:
sdf["ParseCreationDateWithRegex"] = sdf["FileName"].apply(parse_date_with_regex)

In [None]:
sdf

In [None]:
top_file_extesions = pd.concat(
    [
        df["FileExtension"].value_counts(),
        df["FileExtension"].value_counts(normalize=True),
    ],
    axis=1,
).assign(CumSum=lambda df: df["proportion"].cumsum())

In [None]:
# Using top 20 file extesions we created the TOP_FILE_EXTENSIONS
# and mapped it to the file type (audio, video, photo etc)
top_file_extesions.head(20)

In [None]:
file_size_and_counter_by_type = (
    df.groupby(["FileExtension", "FileType"])
    .agg(
        NrFiles=("FilePath", len),
        TotalSizeMB=("SizeMB", np.sum),
        TotalSizeGB=("SizeMB", lambda series: np.sum(series) / (1024)),  # GB
    )
    .reset_index()
    .assign(
        NrFilesByType=lambda df: df.groupby("FileType")["NrFiles"].transform(np.sum),
        TotalSizeMBByType=lambda df: df.groupby("FileType")["TotalSizeMB"].transform(
            np.sum
        ),
        PctNrFilesByType=lambda df: df["NrFiles"] / df["NrFilesByType"],
        PctSizeMBByType=lambda df: df["TotalSizeMB"] / df["TotalSizeMBByType"],
    )
    .query("FileType == 'video' or FileType == 'photo'")
    .sort_values("FileType", ascending=False)
)

In [None]:
file_size_and_counter_by_type