<a href="https://colab.research.google.com/github/luasampaio/data-engineering/blob/main/1_AutomatizeFunctions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Removendo dados duplicados

In [2]:
import pandas as pd

In [3]:
def remove_duplicates(df, subset=None):
    return df.drop_duplicates(subset=subset)

In [5]:
# Example dataset with duplicates, ensuring consistent lengths
data = {'Name': ['Alice', 'Bob', 'Alice', 'David','Luciana','Angelica'],
        'Age': [25, 30, 25, 22,23,32],
        'Salary': [50000, 60000, 50000, 45000, 45000, 45000]}
df = pd.DataFrame(data)

In [6]:
data

{'Name': ['Alice', 'Bob', 'Alice', 'David', 'Luciana', 'Angelica'],
 'Age': [25, 30, 25, 22, 23, 32],
 'Salary': [50000, 60000, 50000, 45000, 45000, 45000]}

In [7]:
cleaned_df = remove_duplicates(df, subset=['Name'])
print(cleaned_df)

       Name  Age  Salary
0     Alice   25   50000
1       Bob   30   60000
3     David   22   45000
4   Luciana   23   45000
5  Angelica   32   45000


Automating Data Type Transformation

In [8]:
def transform_data_types(df, col_types):
    for col, dtype in col_types.items():
        df[col] = df[col].astype(dtype)
    return df

In [10]:
print(df.dtypes)

Name      object
Age        int64
Salary     int64
dtype: object


### Definição de tipo de dados


In [11]:
col_types = {'Age': 'int', 'Salary': 'float'}

In [13]:
cleaned_df = transform_data_types(df, col_types)


In [15]:
print(cleaned_df.dtypes)

Name       object
Age         int64
Salary    float64
dtype: object


Função

In [22]:
import pandas as pd
import numpy as np

# Função para lidar com valores ausentes
def handle_missing_values(df, method='mean', fill_value=None):
    if method == 'mean':
        for col in df.select_dtypes(include=[np.number]).columns:
            df[col] = df[col].fillna(df[col].mean())
    elif method == 'median':
        for col in df.select_dtypes(include=[np.number]).columns:
            df[col] = df[col].fillna(df[col].median())
    elif method == 'mode':
        for col in df.columns:
            df[col] = df[col].fillna(df[col].mode()[0])
    elif method == 'constant':
        df = df.fillna(fill_value)
    elif method == 'drop':
        df = df.dropna()
    else:
        raise ValueError("Método inválido para lidar com valores ausentes.")
    return df

# Função para remover duplicatas
def remove_duplicates(df, subset=None):
    return df.drop_duplicates(subset=subset)

# Função para transformar tipos de dados
def transform_data_types(df, col_types):
    for col, dtype in col_types.items():
        # Preencher valores ausentes com um valor padrão antes da conversão
        if dtype == 'int':
            df[col] = df[col].fillna(0).astype(dtype)
        elif dtype == 'float':
            df[col] = df[col].fillna(0.0).astype(dtype)
        else:
            df[col] = df[col].astype(dtype)
    return df

# Pipeline de limpeza de dados
def data_cleaning_pipeline(df, missing_values_method='mean', fill_value=None, subset=None, col_types=None):
    # Lidar com valores ausentes
    df = handle_missing_values(df, method=missing_values_method, fill_value=fill_value)

    # Remover duplicatas
    df = remove_duplicates(df, subset=subset)

    # Transformar tipos de dados
    if col_types:
        df = transform_data_types(df, col_types)

    return df

# Exemplo de conjunto de dados com vários problemas
data = {
    'Nome': ['Alice', 'Bob', None, 'Alice'],
    'Idade': ['25', None, '30', '22'],
    'Salário': [50000, 60000, None, 50000]
}

df = pd.DataFrame(data)

# Defina os tipos de dados e execute o pipeline
col_types = {'Idade': 'int', 'Salário': 'float'}
cleaned_df = data_cleaning_pipeline(df, missing_values_method='mean', subset=['Nome'], col_types=col_types)

print(cleaned_df)


    Nome  Idade       Salário
0  Alice     25  50000.000000
1    Bob      0  60000.000000
2   None     30  53333.333333


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0).astype(dtype)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(0.0).astype(dtype)


In [23]:
import pandas as pd
import numpy as np

# Função para lidar com valores ausentes
def handle_missing_values(df, method='mean', fill_value=None):
    if method == 'mean':
        for col in df.select_dtypes(include=[np.number]).columns:
            df.loc[:, col] = df[col].fillna(df[col].mean())
    elif method == 'median':
        for col in df.select_dtypes(include=[np.number]).columns:
            df.loc[:, col] = df[col].fillna(df[col].median())
    elif method == 'mode':
        for col in df.columns:
            df.loc[:, col] = df[col].fillna(df[col].mode()[0])
    elif method == 'constant':
        df = df.fillna(fill_value)
    elif method == 'drop':
        df = df.dropna()
    else:
        raise ValueError("Método inválido para lidar com valores ausentes.")
    return df

# Função para remover duplicatas
def remove_duplicates(df, subset=None):
    return df.drop_duplicates(subset=subset)

# Função para transformar tipos de dados
def transform_data_types(df, col_types):
    for col, dtype in col_types.items():
        # Preencher valores ausentes com um valor padrão antes da conversão
        if dtype == 'int':
            df.loc[:, col] = df[col].fillna(0).astype(dtype)
        elif dtype == 'float':
            df.loc[:, col] = df[col].fillna(0.0).astype(dtype)
        else:
            df.loc[:, col] = df[col].astype(dtype)
    return df

# Pipeline de limpeza de dados
def data_cleaning_pipeline(df, missing_values_method='mean', fill_value=None, subset=None, col_types=None):
    # Lidar com valores ausentes
    df = handle_missing_values(df, method=missing_values_method, fill_value=fill_value)

    # Remover duplicatas
    df = remove_duplicates(df, subset=subset)

    # Transformar tipos de dados
    if col_types:
        df = transform_data_types(df, col_types)

    return df

# Exemplo de conjunto de dados com vários problemas
data = {
    'Nome': ['Alice', 'Bob', None, 'Alice'],
    'Idade': ['25', None, '30', '22'],
    'Salário': [50000, 60000, None, 50000]
}

df = pd.DataFrame(data)

# Defina os tipos de dados e execute o pipeline
col_types = {'Idade': 'int', 'Salário': 'float'}
cleaned_df = data_cleaning_pipeline(df, missing_values_method='mean', subset=['Nome'], col_types=col_types)

print(cleaned_df)


    Nome Idade       Salário
0  Alice    25  50000.000000
1    Bob     0  60000.000000
2   None    30  53333.333333
