# Proyecto Open Data I
## Radares, y su eficiencia en la CAM
### Recopilación, limpieza y tratamiento de los datos
Este cuaderno pretende enseñar el proceso de limpieza de los datos relativos a los radares en la CAM

In [4]:
# Importar librerías
import os

import pandas
import pandas as pd

A continuación, se muestra la clase que está compuesta de todos los métodos que se encargan de la limpieza y transformación de los datos

In [17]:
class CSVDataLoader:
    """
    A class for loading and cleaning CSV data from a specified folder path.

    Attributes:
    -----------
    folder_path : str
        The path to the folder containing the CSV files to be loaded.

    data : dict
        A dictionary containing the loaded CSV data, where the keys are the file names and the values are the corresponding dataframes.
    """

    def __init__(self, folder_path):
        """
        Initializes a CSVDataLoader object with the specified folder path.

        Parameters:
        -----------
        folder_path : str
            The path to the folder containing the CSV files to be loaded.
        """
        self.folder_path = folder_path
        self.data = {}
        self.filename = []

    def load_data(self):
        """
        Loads CSV data from the specified folder path into a dictionary.

        Returns:
        --------
        None
        """
        csv_files = [f for f in os.listdir(self.folder_path) if f.endswith('.csv')]
        folders = ("datasets/actuacionesBomberos", "datasets/estaciones", "datasets/accidentalidad")
        for folder in folders:
            df = None
            for file in os.listdir(folder):
                filepath = folder + "/" + file
                df1 = pd.read_csv(filepath, sep=';', encoding='utf-8', low_memory=False)
                df = pd.concat([df, df1])
            self.data[str(folder)] = df

        for file_name in csv_files:
            file_path = os.path.join(self.folder_path, file_name)
            try:
                df = pd.read_csv(file_path, sep=';', encoding='latin-1', low_memory=False)
                self.data[str(file_name)] = df
                self.filename.append(file_name)
            except Exception as e:
                print(f"Error al leer {file_name}: {str(e)}")

    def clean_data(self):
        """
        Cleans the loaded CSV data by renaming columns, removing whitespace, dropping null values and duplicates, and converting date columns to datetime format.

        Returns:
        --------
        None
        """
        for i in self.data:
            num_columnas = self.data[i].shape[1] -1
            columna_borrar = "Unnamed: " + str(num_columnas)

            if columna_borrar in self.data[i].columns:
                self.data[i] = self.data[i].drop(columna_borrar, axis=1)
                self.data[i] = self.data[i].dropna()

            self.data[i] = self.data[i].rename(columns = lambda x: x.strip().lower().replace(' ', '_'))
            self.data[i] = self.data[i].applymap(lambda x: x.strip() if isinstance(x, str) else x)
            self.data[i] = self.data[i].dropna()
            self.data[i] = self.data[i].drop_duplicates()
            self.data[i] = self.data[i].loc[:, ~self.data[i].columns.duplicated()]
            self.data[i].columns = pd.DataFrame.applymap(str.upper, self.data[i].columns)

            if 'FECHA' in self.data[i].columns:
                self.data[i]['FECHA'] = pd.to_datetime(self.data[i]['FECHA'], format='%d/%m/%Y')


    def get_nan_columns(self):
        for i in self.data:
            print(self.data[i].isnull().sum())
            print(self.data[i].info())
    def get_cleaned_data(self):
        """
        Returns the cleaned CSV data as a dictionary.

        Returns:
        --------
        dict
            A dictionary containing the cleaned CSV data, where the keys are the file names and the values are the corresponding dataframes.
        """
        return self.data

Una vez está definida la clase con sus métodos, procedemos a declarar las variables que nos permiten trabajar con ello

In [18]:
folder_path = "datasets"
data_loader = CSVDataLoader(folder_path)

In [19]:
data_loader.load_data()

In [20]:
data_loader.clean_data()

TypeError: the first argument must be callable