In [None]:
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor


# Path of the file to read
iowa_file_path = '../input/home-data-for-ml-course/train.csv'

home_data = pd.read_csv(iowa_file_path)
# Create target object and call it y
y = home_data.SalePrice
# Create X
features = ['LotArea', 'YearBuilt', '1stFlrSF', '2ndFlrSF', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd']
X = home_data[features]

# Split into validation and training data
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Specify Model
iowa_model = DecisionTreeRegressor(random_state=1)
# Fit Model
iowa_model.fit(train_X, train_y)

# Make validation predictions and calculate mean absolute error
val_predictions = iowa_model.predict(val_X)
val_mae = mean_absolute_error(val_predictions, val_y)
print("Validation MAE: {:,.0f}".format(val_mae))

# Set up code checking
from learntools.core import binder
binder.bind(globals())
from learntools.machine_learning.ex5 import *
print("\nSetup complete")

In [None]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0)
    model.fit(train_X, train_y)
    preds_val = model.predict(val_X)
    mae = mean_absolute_error(val_y, preds_val)
    return(mae)

In [None]:
candidate_max_leaf_nodes = [5, 25, 50, 100, 250, 500]
# Write loop to find the ideal tree size from candidate_max_leaf_nodes
scores = {leaf_size: get_mae(leaf_size, train_X, val_X, train_y, val_y) for leaf_size in candidate_max_leaf_nodes}
# Store the best value of max_leaf_nodes (it will be either 5, 25, 50, 100, 250 or 500)
best_tree_size = min(scores, key=scores.get)


In [None]:
# Fill in argument to make optimal size and uncomment
final_model = DecisionTreeRegressor(max_leaf_nodes=best_tree_size, random_state=1)

# fit the final model and uncomment the next two lines
final_model.fit(X, y)

## **Random Forest Model**

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Define the model. Set random_state to 1
rf_model = RandomForestRegressor(random_state=1)

# fit your model
rf_model.fit(train_X, train_y)

# Calculate the mean absolute error of your Random Forest model on the validation data
rf_val_predictions = rf_model.predict(val_X)
rf_val_mae = mean_absolute_error(rf_val_predictions, val_y)

print("Validation MAE for Random Forest Model: {}".format(rf_val_mae))


In [None]:
# Function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=10, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

## **Imputation**

In [None]:
# Make copy to avoid changing original data (when imputing)
X_train_plus = X_train.copy()
X_valid_plus = X_valid.copy()

# Make new columns indicating what will be imputed
for col in cols_with_missing:
    X_train_plus[col + '_was_missing'] = X_train_plus[col].isnull()
    X_valid_plus[col + '_was_missing'] = X_valid_plus[col].isnull()

# Imputation
my_imputer = SimpleImputer()
imputed_X_train_plus = pd.DataFrame(my_imputer.fit_transform(X_train_plus))
imputed_X_valid_plus = pd.DataFrame(my_imputer.transform(X_valid_plus))

# Imputation removed column names; put them back
imputed_X_train_plus.columns = X_train_plus.columns
imputed_X_valid_plus.columns = X_valid_plus.columns

# Imputation median strategy
final_imputer = SimpleImputer(strategy='median')
final_X_train = pd.DataFrame(final_imputer.fit_transform(X_train))
final_X_valid = pd.DataFrame(final_imputer.transform(X_valid))


## **Categorical Variables**

Ordinal Encoding e One-Hot Encoding

In [None]:
# Get list of categorical variables
s = (X_train.dtypes == 'object')
object_cols = list(s[s].index)

print("Categorical variables:")
print(object_cols)


# Ordinal Encoding
#   Never" (0) < "Rarely" (1) < "Most days" (2) < "Every day" (3).

from sklearn.preprocessing import OrdinalEncoder
# Make copy to avoid changing original data 
label_X_train = X_train.copy()
label_X_valid = X_valid.copy()

# Apply ordinal encoder to each column with categorical data
ordinal_encoder = OrdinalEncoder()
label_X_train[object_cols] = ordinal_encoder.fit_transform(X_train[object_cols])
label_X_valid[object_cols] = ordinal_encoder.transform(X_valid[object_cols])



# One-Hot Encoding
from sklearn.preprocessing import OneHotEncoder

# Apply one-hot encoder to each column with categorical data
OH_encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
OH_cols_train = pd.DataFrame(OH_encoder.fit_transform(X_train[object_cols]))
OH_cols_valid = pd.DataFrame(OH_encoder.transform(X_valid[object_cols]))

# One-hot encoding removed index; put it back
OH_cols_train.index = X_train.index
OH_cols_valid.index = X_valid.index

# Remove categorical columns (will replace with one-hot encoding)
num_X_train = X_train.drop(object_cols, axis=1)
num_X_valid = X_valid.drop(object_cols, axis=1)

# Add one-hot encoded columns to numerical features
OH_X_train = pd.concat([num_X_train, OH_cols_train], axis=1)
OH_X_valid = pd.concat([num_X_valid, OH_cols_valid], axis=1)

# Ensure all columns have string type
OH_X_train.columns = OH_X_train.columns.astype(str)
OH_X_valid.columns = OH_X_valid.columns.astype(str)


In [None]:
def one_hot_encode_dataframe(df, categorical_columns):
    """
    Aplica One-Hot Encoding às colunas categóricas de um DataFrame.
    
    :param df: DataFrame original.
    :param categorical_columns: Lista de colunas categóricas a serem codificadas.
    :return: Novo DataFrame com as colunas codificadas e devidamente nomeadas.
    """
    df_copy = df.copy()
    encoder = OneHotEncoder(sparse=False, drop=None)
    
    for column in categorical_columns:
        encoded_array = encoder.fit_transform(df_copy[[column]])
        encoded_df = pd.DataFrame(encoded_array, columns=encoder.get_feature_names_out([column]))
        
        df_copy = df_copy.drop(columns=[column])
        df_copy = pd.concat([df_copy, encoded_df], axis=1)
    
    return df_copy

# Exemplo de uso
data = {'Categoria': ['A', 'B', 'A', 'C'], 'Valor': [10, 20, 30, 40]}
df = pd.DataFrame(data)
categorical_columns = ['Categoria']

encoded_df = one_hot_encode_dataframe(df, categorical_columns)
print(encoded_df)

## **Pipelines**


1º Define Preprocessing Steps

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant') # Strategy can be: mean, median, most_frequent, constant

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')), # Strategy can be: most_frequent, constant
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(  
    transformers=[
        ('num', numerical_transformer, numerical_cols), 
        ('cat', categorical_transformer, categorical_cols) 
    ])

2º Define the model

In [None]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=0)

3º Create and Evaluate the Pipeline

In [None]:
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])

# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

In [None]:
def avaliate_pipes (data, encoder, model, numerical_imputer, categorical_imputer):
    # Definindo Feature e Target
    X = data.drop('target', axis=1) # Features
    Y = data['target'] # Target

    x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size = 0.20, random_state = 0)

    # Classificando as features em num e cat
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Tratando dados NaN com imputers
    numerical_transformer = SimpleImputer(strategy=numerical_imputer)
    categorical_transformer = SimpleImputer(strategy=categorical_imputer)

    # Aplicando Encoder nas features categóricas
    if encoder == 'one-hot':
        cat_encoder = OneHotEncoder(handle_unknown='ignore')
    elif encoder == 'ordinal':
        cat_encoder = OrdinalEncoder()
    else:
        raise ValueError('Encoder não reconhecido')
    
    # Criando um preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', cat_encoder, categorical_features)
        ])
    
    # Criando um pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Treinando o modelo
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    score = accuracy_score(y_test, y_pred)
        
    return score


# Criar um for que varie os parâmetros do modelo e do encoder
# e chame a função avaliate_pipes
# e guarde o resultado em um dicionário
# e imprima o resultado


for encoder in ['one-hot', 'ordinal']:
    for numerical_imputer in ['mean', 'median', 'constant']:
        for categorical_imputer in ['most_frequent', 'constant']:
            score = avaliate_pipes(data, encoder, model, numerical_imputer, categorical_imputer)
            print(f'Score: {score}, Model: {model}, Encoder: {encoder}, Numerical Imputer: {numerical_imputer}, Categorical Imputer: {categorical_imputer}')


In [None]:
def avaliate_pipes(data, encoder, model, numerical_imputer, categorical_imputer):
    # Definindo Feature e Target
    X = data.drop('target', axis=1) # Features
    Y = data['target'] # Target

    x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.20, random_state=0)

    # Classificando as features em num e cat
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Tratando dados NaN com imputers
    numerical_transformer = SimpleImputer(strategy=numerical_imputer)
    categorical_transformer = SimpleImputer(strategy=categorical_imputer)

    # Aplicando Encoder nas features categóricas
    if encoder == 'one-hot':
        cat_encoder = OneHotEncoder(handle_unknown='ignore')
    elif encoder == 'ordinal':
        cat_encoder = OrdinalEncoder()
    else:
        raise ValueError('Encoder não reconhecido')
    
    # Criando um preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_features),
            ('cat', cat_encoder, categorical_features)
        ])
    
    # Criando um pipeline
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])

    # Treinando o modelo
    pipeline.fit(x_train, y_train)
    y_pred = pipeline.predict(x_val)
    score = accuracy_score(y_val, y_pred)
    
    # Transformando os datasets
    x_train_transformed = pipeline.named_steps['preprocessor'].transform(x_train)
    x_val_transformed = pipeline.named_steps['preprocessor'].transform(x_val)
        
    return score, x_train_transformed, x_val_transformed, y_train, y_val

# Exemplo de uso
score, x_train_transformed, x_val_transformed, y_train, y_val = avaliate_pipes(data, 'one-hot', model, 'mean', 'most_frequent')
print(f'Score: {score}')
print(f'Training data shape: {x_train_transformed.shape}')
print(f'Validation data shape: {x_val_transformed.shape}')

 **Selecting Cols**

In [None]:
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]

# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

## **Cross-Validation**

In [None]:
from sklearn.model_selection import cross_val_score

# Multiply by -1 since sklearn calculates *negative* MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                              cv=5, # Number of folds
                              scoring='neg_mean_absolute_error')

print("MAE scores:\n", scores)


# Function  that reports the average (over three cross-validation folds) MAE of a machine learning pipeline that uses:



def get_score(n_estimators):
    my_pipeline = Pipeline(steps=[
        ('preprocessor', SimpleImputer()),
        ('model', RandomForestRegressor(n_estimators, random_state=0))
    ])
    scores = -1 * cross_val_score(my_pipeline, X, y,
                                  cv=3,# Number of folds
                                  scoring='neg_mean_absolute_error')
    # Replace this body with your own code
    return scores.mean()

results = {}
for i in range(1,9):
    results[50*i] = get_score(50*i)


## **XGBoost**

In [None]:
# Define the model
my_model_2 = XGBRegressor(n_estimators = 1000, learning_rate = 0.05) # Your code here

# Fit the model
my_model_2.fit(X_train, y_train) # Your code here

# Get predictions
predictions_2 = my_model_2.predict(X_valid) # Your code here

# Calculate MAE
mae_2 = mean_absolute_error(predictions_2, y_valid) # Your code here

# Uncomment to print MAE
print("Mean Absolute Error:" , mae_2)

In [None]:
def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    score = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    score = -1 * score.mean()
    score = np.sqrt(score)
    return score

## **Avaliar Modelos**

In [None]:
model = RandomForestClassifier()
for encoder in ['ordinal', 'one-hot']:
    print()
    for numerical_imputer in ['mean', 'median', 'constant']:
        for categorical_imputer in ['most_frequent', 'constant']:
            pipeline, score = avaliate_pipes(train, encoder, model, numerical_imputer, categorical_imputer)
            print(f'Score: {score:.4}%, Model: {model}, Encoder: {encoder}, Numerical Imputer: {numerical_imputer}, Categorical Imputer: {categorical_imputer}')
            max_score = score if score > max_score else max_score
            best_pipeline = pipeline if score == max_score else best_pipeline
print()
print(f'Melhor pipeline: {best_pipeline}')
print(f'Melhor score: {max_score:.4}%')


In [2]:
from sklearn.impute import SimpleImputer

def Data_Cleaning(data):
    # Definindo Feature e Target
    X = data.copy()  # Features

    # Classificando as features em num e cat
    numerical_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object']).columns

    # Tratando dados NaN com imputers
    num_imputer = SimpleImputer(strategy='median')
    cat_imputer = SimpleImputer(strategy='most_frequent')

    # Imputando valores nas colunas numéricas
    X[numerical_features] = num_imputer.fit_transform(X[numerical_features])

    # Imputando valores nas colunas categóricas
    X[categorical_features] = cat_imputer.fit_transform(X[categorical_features])

    return X, numerical_features, categorical_features

# Exemplo de uso
data = pd.DataFrame({
    'num_col1': [1, 2, None, 4],
    'num_col2': [None, 2, 3, 4],
    'cat_col1': ['A', None, 'B', 'C'],
    'cat_col2': [None, 'X', 'Y', 'Z']
})

cleaned_data, num_features, cat_features = Data_Cleaning(data)
print(cleaned_data)

TypeError: '<' not supported between instances of 'NoneType' and 'str'