In [1]:
import warnings

warnings.filterwarnings("ignore")

import os
import sys

import pickle

import numpy as np
import pandas as pd

pd.options.display.float_format = "{:.5f}".format

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
# Filesystem layout. The project root is taken to be the parent of the
# directory the notebook is launched from (assumes the notebook lives in a
# sub-folder of the project — TODO confirm).
CWD = os.getcwd()
PY8TB_PATH = os.path.split(CWD)[0]

# Data artifacts are read from <project root>/data.
SAVE_PATH = os.path.join(PY8TB_PATH, "data")
DF_PATH = os.path.join(PY8TB_PATH, "data", "df_2024_06_30_18_08_37.parquet.gzip")

In [3]:
# Put the project root at the front of the import search path; the next cell
# imports `py8tb`, presumably a package located there.
sys.path.insert(0, PY8TB_PATH)

In [4]:
from py8tb import preprocessing_pipeline

# Análisis y preprocesamiento de df

In [21]:
# Build the working frame from the snapshot at DF_PATH.
# NOTE(review): preprocessing_pipeline is project code not shown here. Later
# cells index the result by FilePath, FileName, FileType, FileExtension and
# SizeMB, so it presumably returns a DataFrame with those columns — confirm.
df = preprocessing_pipeline(path=DF_PATH)

In [23]:
# Keep only the rows whose FileType is exactly "photo".
photos = df.loc[df["FileType"] == "photo"]

In [24]:
# Load the (FilePath, Sha256) records and wrap them in a DataFrame in one step,
# so `photos_sha256` is only ever bound to the DataFrame.
# NOTE(review): pickle.load can execute arbitrary code from the file — only use
# it on a trusted, locally generated artifact.
with open(os.path.join(SAVE_PATH, "photos_sha256.pkl"), "rb") as f:
    photos_sha256 = pd.DataFrame(
        data=pickle.load(f),
        columns=["FilePath", "Sha256"],
    )

In [34]:
# Sanity check: count digests by string length. A hex-encoded SHA-256 digest
# is 64 characters, so a single bucket at 64 is the expected result.
photos_sha256["Sha256"].apply(len).value_counts()

Sha256
64    114951
Name: count, dtype: int64

In [26]:
# Attach each photo's digest by path. A left join keeps every row of `photos`;
# photos with no entry in `photos_sha256` get a missing Sha256.
# NOTE: this overwrites `photos`, so re-running the cell merges a second time
# and yields suffixed Sha256_x / Sha256_y columns — re-run the cell that
# creates `photos` first.
photos = pd.merge(photos, photos_sha256, how="left", on="FilePath")

In [30]:
# Distribution of duplicate-group sizes: the index is the number of photos that
# share one digest, the value is how many digests have that many photos.
(
    photos["Sha256"]
    .value_counts()  # photos per digest
    .value_counts()  # digests per group size
    .sort_index()
)

count
1     58673
2     13866
3      2772
4      2039
5       576
6       936
7       414
8        21
9        16
10       14
11        8
12        3
20        1
21        4
Name: count, dtype: int64

In [14]:
photos

Unnamed: 0,FilePath,CreationDate,LastModificationDate,SizeMB,FileName,FileExtension,FileType
54,/Volumes/MUPU_4TB_1/code/kaggle_datasets/bobby...,2023-01-07 16:59:12,2020-07-20 11:58:42,0.29537,bobby.png,.png,photo
61,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Denal...,2023-01-07 16:59:12,2020-01-25 15:33:18,0.02980,Denali Mt McKinley.jpg,.jpg,photo
67,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Figur...,2023-01-07 16:59:11,2020-04-18 20:21:12,0.02575,FigureAxesMPL.png,.png,photo
81,/Volumes/MUPU_4TB_1/code/kaggle_datasets/Logis...,2023-01-07 16:59:09,2020-04-10 08:10:50,0.10171,LogisticRegression.jpg,.jpg,photo
86,/Volumes/MUPU_4TB_1/code/kaggle_datasets/MPLAr...,2023-01-07 17:00:12,2020-04-18 21:22:12,0.02580,MPLArchitecture.png,.png,photo
...,...,...,...,...,...,...,...
146087,/Volumes/MUPU_4TB_2/BACKUPS/OnePlus8Pro_241220...,2023-12-24 17:35:23,2023-12-15 08:58:59,0.78631,Screenshot_2023-12-15-08-58-59-57_0438eb925998...,.jpg,photo
146088,/Volumes/MUPU_4TB_2/BACKUPS/OnePlus8Pro_241220...,2023-12-24 17:35:23,2023-12-15 08:59:36,0.60291,Screenshot_2023-12-15-08-59-36-23_0438eb925998...,.jpg,photo
146089,/Volumes/MUPU_4TB_2/BACKUPS/OnePlus8Pro_241220...,2023-12-24 17:35:23,2023-12-15 08:59:43,0.74182,Screenshot_2023-12-15-08-59-43-02_0438eb925998...,.jpg,photo
146090,/Volumes/MUPU_4TB_2/BACKUPS/OnePlus8Pro_241220...,2023-12-24 17:35:23,2023-12-15 09:00:28,0.56253,Screenshot_2023-12-15-09-00-28-02_0438eb925998...,.jpg,photo


In [8]:
def parse_date(filename, split_by_text):
    """Extract the ``YYYYMMDD`` date that follows a known prefix in a file name.

    Parameters
    ----------
    filename : str
        Bare file name, e.g. ``"IMG_20200102_101010.jpg"``.
    split_by_text : str
        Marker that precedes the date: ``"IMG_"``, ``"IMG-"`` or
        ``"Screenshot_"``.

    Returns
    -------
    str
        Eight digits (``YYYYMMDD``) when they can be read right after the
        marker; otherwise ``filename`` unchanged, which is the
        "could not parse" fallback used throughout this notebook.
    """
    # partition() tolerates a missing marker; split(...)[1] raised IndexError.
    _, marker, remainder = filename.partition(split_by_text)
    if not marker:
        return filename

    # Text between the marker and the first dot (normally the extension dot).
    stem = remainder.split(".")[0]

    if split_by_text in ("IMG_", "IMG-") and len(stem) >= 8:
        candidate = stem[:8]
    elif "Screenshot_" in filename:
        # Handles both "2023-12-15-..." and the compact "20231215-085943":
        # strip dashes from the first 10 characters, then keep 8 so the compact
        # form does not leak a ninth character from the time part.
        candidate = stem[:10].replace("-", "")[:8]
    else:
        return filename

    # Only report something that actually looks like a date.
    if len(candidate) == 8 and candidate.isdigit():
        return candidate
    return filename

In [9]:
def extract_creation_date_from_photo_name(row):
    """Derive a date string from a photo's file name using known prefixes.

    Parameters
    ----------
    row : Mapping or pandas.Series
        Must provide ``"FileName"``; no other field is read.

    Returns
    -------
    str
        Whatever ``parse_date`` returns for the first matching prefix, or the
        file name unchanged when it contains none of the known prefixes.
    """
    filename = row["FileName"]

    # Order matters: the first prefix found in the name wins
    # ("IMG_" before "IMG-" before "Screenshot_").
    for prefix in ("IMG_", "IMG-", "Screenshot_"):
        if prefix in filename:
            return parse_date(filename=filename, split_by_text=prefix)

    return filename

In [10]:
def parse_date_with_regex(filename):
    """Find the first date-like token in a file name.

    Recognises eight consecutive digits (``YYYYMMDD``) or a dashed
    ``YYYY-MM-DD`` token anywhere in the name; the leftmost match wins.

    Parameters
    ----------
    filename : str
        File name to scan.

    Returns
    -------
    str
        The match with dashes removed, or ``filename`` unchanged when nothing
        matches. The digits are not checked for being a valid calendar date.
    """
    import re  # local import, as in the original cell

    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    match = re.search(r"\d{8}|\d{4}-\d{2}-\d{2}", filename)
    if match is None:
        return filename

    # The 8-digit form contains no dashes, so replace() is a no-op for it.
    return match.group(0).replace("-", "")

In [11]:
# Spot-check sample of 50 photo rows for comparing the two date parsers.
# The seed is fixed so the sample, and every table derived from it, is the same
# on each Restart & Run All.
sdf = df[df["FileType"] == "photo"].sample(n=50, random_state=42)

In [None]:
# Prefix-based parser applied row by row (axis=1); adds a column to the sample.
sdf["ParseCreationDate"] = sdf.apply(extract_creation_date_from_photo_name, axis=1)

In [None]:
# Regex-based parser applied to the file names of the same sample, stored in a
# second column so both results can be compared side by side.
sdf["ParseCreationDateWithRegex"] = sdf["FileName"].apply(parse_date_with_regex)

In [None]:
sdf

In [None]:
# Per-extension file counts next to their share of all files, plus the running
# share. value_counts() sorts descending, so CumSum reads as the coverage of the
# top-k extensions. The "count"/"proportion" column names come from pandas.
top_file_extesions = pd.concat(
    [
        df["FileExtension"].value_counts(normalize=as_share)
        for as_share in (False, True)
    ],
    axis=1,
).assign(CumSum=lambda counts: counts["proportion"].cumsum())

In [None]:
# The top 20 file extensions shown here were used to build TOP_FILE_EXTENSIONS
# (defined outside this section), which maps each extension to a file type
# (audio, video, photo, etc.).
top_file_extesions.head(20)

In [None]:
# For every (extension, file type) pair: number of files and total size, then
# each extension's share of the count and size within its file type. Only photo
# and video rows are kept in the final table.
#
# String aliases ("size", "sum") are used instead of `len` / `np.sum`: passing
# NumPy callables to groupby agg/transform is deprecated in recent pandas, and
# the aliases keep the current results.
file_size_and_counter_by_type = (
    df.groupby(["FileExtension", "FileType"])
    .agg(
        NrFiles=("FilePath", "size"),
        TotalSizeMB=("SizeMB", "sum"),
        TotalSizeGB=("SizeMB", lambda size_mb: size_mb.sum() / 1024),  # MB -> GB
    )
    .reset_index()
    .assign(
        # Totals per file type, broadcast back onto every extension row.
        NrFilesByType=lambda agg: agg.groupby("FileType")["NrFiles"].transform("sum"),
        TotalSizeMBByType=lambda agg: agg.groupby("FileType")["TotalSizeMB"].transform(
            "sum"
        ),
        PctNrFilesByType=lambda agg: agg["NrFiles"] / agg["NrFilesByType"],
        PctSizeMBByType=lambda agg: agg["TotalSizeMB"] / agg["TotalSizeMBByType"],
    )
    .query("FileType == 'video' or FileType == 'photo'")
    .sort_values("FileType", ascending=False)
)

In [None]:
file_size_and_counter_by_type