In [1]:
import warnings
warnings.filterwarnings("ignore")

import os

import numpy as np
import pandas as pd

pd.options.display.float_format = '{:.5f}'.format

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
from PIL import Image

In [2]:
from config import TOP_FILE_EXTENSIONS

In [3]:
! ls

[34m__pycache__[m[m                         df_analysis.ipynb
config.py                           indexator.py
df_2024_06_22_19_24_53.parquet.gzip photo.py
df_2024_06_24_18_03_18.parquet.gzip [34mvideos[m[m
df_2024_06_24_18_03_49.parquet.gzip


In [4]:
# Build the absolute path of the parquet snapshot to analyse; the file is
# expected to live in the notebook's current working directory.
DF_FILE_NAME = "df_2024_06_22_19_24_53.parquet.gzip"
CWD = os.getcwd()
DF_PATH = os.path.join(CWD, DF_FILE_NAME)

In [5]:
# Load the file index stored in the parquet snapshot at DF_PATH.
df = pd.read_parquet(DF_PATH)

In [6]:
# Summarise column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146072 entries, 0 to 146071
Data columns (total 4 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   FilePath              146072 non-null  object 
 1   CreationDate          146072 non-null  object 
 2   LastModificationDate  146072 non-null  object 
 3   SizeMB                146072 non-null  float64
dtypes: float64(1), object(3)
memory usage: 4.5+ MB


In [7]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(146072, 4)

In [8]:
# Peek at 10 random rows. The seed is fixed so that a fresh
# "Restart & Run All" shows the same rows every time.
df.sample(10, random_state = 42)

Unnamed: 0,FilePath,CreationDate,LastModificationDate,SizeMB
81932,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/T1-(2017-...,2021-08-07 21:55:06,2019-08-28 03:18:26,0.01817
73605,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/T1-(2012-...,2021-08-07 21:53:42,2019-08-28 04:05:36,0.00611
26680,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/(2017-10-...,2021-08-07 20:31:41,2019-08-28 03:17:59,3.54267
12271,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/(2011-08-...,2021-08-07 19:50:51,2019-08-28 04:04:45,2.33549
120525,/Volumes/MUPU 4TB 2/202209/20220902_Bebe_Lloro...,2022-09-03 18:16:39,2022-09-03 12:30:20,458.93114
13589,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/(2012-01-...,2021-08-07 19:52:56,2019-08-28 03:58:38,2.54442
77316,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/T1-(2016-...,2021-08-07 21:54:19,2019-08-28 03:27:52,0.01217
75306,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/T1-(2013-...,2021-08-07 21:53:58,2019-08-28 04:19:38,0.01872
13544,/Volumes/MUPU 4TB 1/MUPU 500GB/Fotos/(2012-01-...,2021-08-07 19:52:51,2019-08-28 03:04:20,3.66641
782,/Volumes/MUPU 4TB 1/code/nuclio_0922_wip/7_COS...,2023-01-07 17:00:12,2023-01-07 16:47:24,0.07035


# Duplicado: todo fichero que comparte fecha de creación, fecha de última modificación y tamaño con otro fichero.

In [9]:
def custom_mapping(file_extension):
    """Translate a file extension into its file type.

    The extension is lower-cased and looked up in ``TOP_FILE_EXTENSIONS``
    (imported from ``config``).

    Parameters
    ----------
    file_extension : str
        Extension to classify; matching is case-insensitive.

    Returns
    -------
    The value stored in ``TOP_FILE_EXTENSIONS`` for the lower-cased
    extension, or the string ``"na"`` when the extension is not a key.
    """
    normalized_extension = str.lower(file_extension)
    return TOP_FILE_EXTENSIONS.get(normalized_extension, "na")

In [10]:
def _lowercase_extension(file_path):
    """Return the lower-cased extension of ``file_path`` as given by os.path.splitext."""
    _, extension = os.path.splitext(file_path)
    return str.lower(extension)


# Enrich the index with the base name, the lower-cased extension and the
# file type that custom_mapping assigns to that extension.
file_paths = df["FilePath"]
df = df.assign(
    FileName = file_paths.apply(os.path.basename),
    FileExtension = file_paths.apply(_lowercase_extension),
    FileType = lambda frame: frame["FileExtension"].apply(custom_mapping),
)

In [12]:
# Keep only the rows whose FileType is "photo".
is_photo = df["FileType"].eq("photo")
photos = df.loc[is_photo]

In [14]:
# Persist the photo subset. The compression codec is passed explicitly:
# to_parquet defaults to snappy and does not infer the codec from the file
# name, so without it the ".gzip" suffix would be misleading.
photos.to_parquet("photos.parquet.gzip", compression = "gzip")

In [None]:
# Frequency table of file extensions: absolute count, share of all files and
# cumulative share (value_counts sorts from most to least frequent).
# Both Series are renamed explicitly so the "count" / "proportion" column
# names do not depend on how the installed pandas version names the result
# of value_counts.
extension_counts = df["FileExtension"].value_counts().rename("count")
extension_shares = df["FileExtension"].value_counts(normalize = True).rename("proportion")

top_file_extesions = (
    pd.concat([extension_counts, extension_shares], axis = 1)
    .assign(
        CumSum = lambda frame: frame["proportion"].cumsum()
    )
)

In [None]:
# The 20 most frequent file extensions were used to build TOP_FILE_EXTENSIONS,
# which maps each extension to a file type (audio, video, photo, etc.).
top_file_extesions.head(20)

In [None]:
# Per (extension, type) file count and total size, plus each extension's share
# within its file type, restricted to photo and video files.
#
# String aliases ("size", "sum") replace the len / np.sum callables: they use
# pandas' built-in group reductions and avoid the deprecation warning that
# recent pandas versions emit for NumPy callables. TotalSizeGB is derived from
# TotalSizeMB so both columns come from the same sum.
file_size_and_counter_by_type = (
    df
    .groupby(["FileExtension", "FileType"])
    .agg(
        NrFiles = ("FilePath", "size"),
        TotalSizeMB = ("SizeMB", "sum"),
    )
    .reset_index()
    .assign(
        TotalSizeGB = lambda frame: frame["TotalSizeMB"] / 1024,  # 1024 MB per GB
        NrFilesByType = lambda frame: frame.groupby("FileType")["NrFiles"].transform("sum"),
        TotalSizeMBByType = lambda frame: frame.groupby("FileType")["TotalSizeMB"].transform("sum"),
        PctNrFilesByType = lambda frame: frame["NrFiles"] / frame["NrFilesByType"],
        PctSizeMBByType = lambda frame: frame["TotalSizeMB"] / frame["TotalSizeMBByType"],
    )
    .query("FileType == 'video' or FileType == 'photo'")
    .sort_values("FileType", ascending = False)
)

In [None]:
# Display the per-extension file counts and sizes for photo and video files.
file_size_and_counter_by_type

In [None]:
# Keep only photo and video files. The explicit copy makes the result an
# independent frame, so the later in-place sort and the new "Counter" column
# do not operate on a slice of the original (SettingWithCopyWarning).
df = df[df["FileType"].isin(["photo", "video"])].copy()

In [None]:
# Order rows so files sharing creation date, modification date and size end
# up next to each other. The sorted frame is rebound rather than sorted in
# place, which avoids mutating a filtered slice.
df = df.sort_values(["CreationDate", "LastModificationDate", "SizeMB"])

In [None]:
# Number of rows sharing the same creation date, modification date and size;
# a value above 1 marks a group of candidate duplicates.
# "size" is the vectorised equivalent of transform(len), which would call
# Python once per group.
# NOTE: "FileName" was commented out of the key in the original cell and is
# still not part of it.
duplicate_key = ["CreationDate", "LastModificationDate", "SizeMB"]
df["Counter"] = df.groupby(duplicate_key)["FilePath"].transform("size")

In [None]:
# Distribution of group sizes: how many rows fall in groups of 1, 2, 3, ...
# rows sharing CreationDate, LastModificationDate and SizeMB.
df["Counter"].value_counts()

In [None]:
# Inspect the row with index label 998.
df.loc[998]

In [None]:
# Rows created at 2023-01-07 16:58:38, last modified at 2020-11-14 17:26:00
# and larger than 3 MB.
created_then = df["CreationDate"].eq("2023-01-07 16:58:38")
modified_then = df["LastModificationDate"].eq("2020-11-14 17:26:00")
over_3_mb = df["SizeMB"].gt(3)

df[created_then & modified_then & over_3_mb]

In [None]:
# Row label of the file to preview. The original cell assigned path_ several
# times in a row and only the last assignment (label 1000) took effect; the
# overwritten candidates were 35009, 35010, 35014, 69525, 998 and 1004
# (93741 and 997 were commented out).
preview_label = 1000
path_ = df.loc[preview_label, "FilePath"]

In [None]:
# Show the path selected for preview.
path_

In [None]:
# Read the image file at path_ and render it with matplotlib.
img = mpimg.imread(path_)
imgplot = plt.imshow(img)
plt.show()

# SHA-256 in Python

In [19]:
def calculate_sha256_of_image(file_path, chunk_size = 1024 * 1024):
    """Return the SHA-256 hex digest of the file stored at ``file_path``.

    The file is opened in binary mode and fed to the hash in blocks of
    ``chunk_size`` bytes, so large files are hashed without loading them
    (and a second ``bytearray`` copy of them) into memory at once.

    Parameters
    ----------
    file_path : str or path-like
        Location of the file to hash. It is printed before hashing.
    chunk_size : int, optional
        Number of bytes read per iteration; must be positive.
        Defaults to 1 MiB.

    Returns
    -------
    str
        Hexadecimal SHA-256 digest of the file contents.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not a positive number.
    OSError
        If the file cannot be opened or read.
    """
    from hashlib import sha256

    if chunk_size < 1:
        # read(0) returns b"" immediately, which would silently yield the
        # digest of an empty file.
        raise ValueError("chunk_size must be a positive number of bytes")

    print(file_path)

    digest = sha256()
    with open(file_path, "rb") as f:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b""):
            digest.update(chunk)

    return digest.hexdigest()

In [20]:
# Hash the file referenced by the row with index label 998.
calculate_sha256_of_image(df.loc[998, "FilePath"])

/Volumes/MUPU 4TB 1/code/python10pm_repo/pythoneando-main/Video_3_principios_básicos_de_la_programación/Archivos/CosmoCaixa/IMG_20200903_122929.jpg


'60b17f6fe383e430532091b1c3a2ea06371cef19777d7090dc19c8d0e96c4e78'

In [21]:
# Hash the file referenced by the row with index label 1004.
calculate_sha256_of_image(df.loc[1004, "FilePath"])

/Volumes/MUPU 4TB 1/code/python10pm_repo/pythoneando-main/Video_3_principios_básicos_de_la_programación/Ficheros_conjuntos/IMG_20200903_122929.jpg


'60b17f6fe383e430532091b1c3a2ea06371cef19777d7090dc19c8d0e96c4e78'

In [22]:
# Hash the file referenced by the row with index label 1023.
calculate_sha256_of_image(df.loc[1023, "FilePath"])

/Volumes/MUPU 4TB 1/code/python10pm_repo/pythoneando-main/Video_6_if_name_main/input/IMG_20200903_122929.jpg


'60b17f6fe383e430532091b1c3a2ea06371cef19777d7090dc19c8d0e96c4e78'

In [None]:
# Hash the file referenced by the row with index label 1000.
calculate_sha256_of_image(df.loc[1000, "FilePath"])

In [None]:
# Hash the file referenced by the row with index label 1006.
calculate_sha256_of_image(df.loc[1006, "FilePath"])

In [None]:
# Hash the file referenced by the row with index label 1025.
calculate_sha256_of_image(df.loc[1025, "FilePath"])