In [None]:
import os
import sys
sys.path.append('..')
sys.path.append('../utils/')
import pandas as pd
import numpy as np
from tqdm import tqdm 
from utils import data_loader_utils

In [None]:
# Dataset layout consumed by the loading cell:
#   <path_to_dataset>/<machine>/<process_name>/<label>/
path_to_dataset = "./Datasets/Raw/"

# Machine IDs M01..M03 and process IDs OP00..OP14, generated rather than typed out.
machines = [f"M{i:02d}" for i in range(1, 4)]
process_names = [f"OP{i:02d}" for i in range(15)]

# Class labels; each one is also a sub-directory name.
labels = ["good", "bad"]

In [None]:
# Sampling frequency is 2 kHz; used to build the per-file time axis.
freq = 2000

dfs = []        # one DataFrame per file, concatenated after the loops
file_count = 0  # running count of files read; also the suffix of Unique_Code

# tqdm wraps the outermost loop, so the bar advances once per process.
for process_name in tqdm(process_names, desc="Loading files"):
    for machine in machines:
        for label in labels:
            data_path = os.path.join(path_to_dataset, machine, process_name, label)
            data_list, label_list = data_loader_utils.load_tool_research_data(
                data_path,
                label=label,
                add_additional_label=True,
                verbose=False,
            )

            for data, full_label in zip(data_list, label_list):
                file_count += 1

                # presumably full_label is underscore-separated with the month
                # (e.g. 'Aug') at index 1 and the year (e.g. '2019') at index 2
                # -- TODO confirm against data_loader_utils
                parts = full_label.split('_')
                month, year = parts[1], parts[2]

                # The global file counter (not a sample id) makes the code unique.
                unique_code = f"{machine}_{process_name}_{month}_{year}_{file_count}"

                # Evenly spaced timestamps starting at 0 with step 1/freq.
                time = np.linspace(0, len(data) / freq, len(data), endpoint=False)

                # Three vibration axes plus the time axis and per-file metadata,
                # the latter broadcast to every row.
                df = pd.DataFrame(data, columns=['X_axis', 'Y_axis', 'Z_axis']).assign(
                    Time=time,
                    Machine=machine,
                    Process=process_name,
                    Label=label,
                    Month=month,
                    Year=year,
                    Unique_Code=unique_code,
                )

                dfs.append(df)

final_df = pd.concat(dfs, ignore_index=True)

# Report how many files were read and how many distinct codes were produced.
print(f"Number of files read: {file_count}")
print(f"Number of unique codes: {final_df['Unique_Code'].nunique()}")

In [None]:
# Release the large intermediates left behind by the loading cell; from here on
# only `final_df` is needed.
#
# The names are listed explicitly rather than deleting everything returned by
# dir(): a blanket delete also removes the imported modules (np, os, sys), the
# configuration constants and IPython's own `In`/`Out`, so any later cell that
# needs one of them fails until the notebook is re-run from the top.
# `pop(..., None)` keeps the cell safe to run more than once.
for _tmp_name in ('dfs', 'df', 'data', 'data_list', 'label_list', 'time'):
    globals().pop(_tmp_name, None)
del _tmp_name

In [None]:
# Show final_df as the cell output to inspect the concatenated data.
final_df

In [None]:
# 'Period' is the "<Mon>-<YYYY>" text label. Year is cast to str only for the
# concatenation, so the cell works whether Year is still text or already int
# (i.e. it can be re-run).
final_df['Period'] = final_df['Month'] + '-' + final_df['Year'].astype(str)

# Keep Year itself as an integer column.
final_df['Year'] = final_df['Year'].astype(int)

# Parse the label into a datetime so periods can be ordered chronologically.
final_df['Period_Num'] = pd.to_datetime(final_df['Period'], format='%b-%Y')

In [None]:
# Order rows by period, then by file code, then by time within each file.
final_df = final_df.sort_values(
    by=['Period_Num', 'Unique_Code', 'Time'],
    ascending=True,
)
final_df

In [None]:
#final_df['Year'] = final_df['Year'].astype(str)

In [None]:
#final_df['Code'] = final_df['Machine'] + '_' + final_df['Process'] + '_' + final_df['Month'] + '_' + final_df['Year']

In [None]:
#final_df['Code'] = final_df['Machine'] + '_' + final_df['Process'] + '_' + final_df['Month'] + '_' + final_df['Year']

# Create a counter within each group
#final_df['count'] = final_df.groupby('Code').cumcount() + 1

# Update 'Unique_Code' to include this counter
#final_df['Unique_Code'] = final_df['Code'] + '_' + final_df['count'].astype(str)

#final_df['Unique_Code'].nunique()

In [None]:
# Re-display final_df; the cells just above are fully commented out, so it is unchanged since the sort.
final_df

In [None]:
# After sorting, the old row labels are out of order; replace them with a fresh
# 0..n-1 index in place, discarding the previous labels.
final_df.index = pd.RangeIndex(len(final_df))
final_df

In [None]:
# Encode the class label as an integer: good -> 0, bad -> 1.
#
# The result is assigned back to the column instead of calling
# `.replace(..., inplace=True)` on `final_df['Label']`: that form mutates an
# intermediate Series, which pandas reports as chained assignment and which no
# longer updates `final_df` once Copy-on-Write is active.
#
# The identity entries for 0 and 1 keep the cell safe to re-run. Any other
# value maps to NaN, which makes the int64 cast fail loudly instead of silently
# leaving an unexpected label in the data.
final_df['Label'] = (
    final_df['Label']
    .map({'good': 0, 'bad': 1, 0: 0, 1: 1})
    .astype('int64')
)

In [None]:
# Reorder the columns for the exported dataset. Columns not listed here
# (e.g. 'Period_Num') are dropped.
final_df = final_df.loc[
    :,
    [
        'Time', 'Month', 'Year', 'Machine', 'Process',
        'X_axis', 'Y_axis', 'Z_axis',
        'Label', 'Unique_Code', 'Period',
    ],
]

In [None]:
# Inspect final_df after the column reorder.
final_df

In [None]:
# Imported locally so this cell does not depend on `os` still being bound in
# the notebook namespace at this point.
import os

# Persist the processed dataset as a single parquet file.
path_to_save_parquet = 'Datasets/Processed/ETL1.parquet'

# Create the output directory first so the write does not fail when
# 'Datasets/Processed' does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(path_to_save_parquet), exist_ok=True)

final_df.to_parquet(path_to_save_parquet)