In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

SEED = 42

In [48]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

X_train = train_data.drop(columns=['Id', 'SalePrice'], axis=1)
y_train = train_data['SalePrice']

In [49]:
numerical_cols = X_train.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

In [50]:
from sklearn.base import BaseEstimator, TransformerMixin

class DropColumnsWithTooManyNans(BaseEstimator, TransformerMixin):
 def __init__(self, threshold=0.85):
     self.threshold = threshold
     self.columns_to_drop = None
 
 def fit(self, X, y=None):
     # Calculate the percentage of NaNs in each column
     nan_percentage = X.isna().mean()
     # Identify columns to drop
     self.columns_to_drop = nan_percentage[nan_percentage > self.threshold].index
     return self
 
 def transform(self, X):
     # Drop the identified columns
     return X.drop(columns=self.columns_to_drop)

In [51]:
# Numérico
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categórico
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle dos dois
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [52]:
model = RandomForestRegressor(n_estimators=300, max_depth=20, min_samples_split=2, min_samples_leaf=1, max_features=0.3, random_state=SEED)

# Bundle dropping NaN columns, preprocessing, and modeling in a pipeline
pipeline = Pipeline(steps=[
 ('preprocessor', preprocessor),
 ('model', model)
])

# Split data into training and validation sets
X_train_full, X_valid, y_train_full, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=SEED)

Está falhando na hora do fit do pipeline...

In [55]:
pipeline.fit(X_train_full, y_train_full)

# Transform the training and test data
X_train_full_transformed = pipeline.named_steps['preprocessor'].transform(X_train_full)
X_valid_transformed = pipeline.named_steps['preprocessor'].transform(X_valid)
X_test_transformed = pipeline.named_steps['preprocessor'].transform(test_data)

In [60]:
len(pipeline.named_steps['preprocessor'].get_feature_names_out())

285

Está dando erro não sei pq :(

In [61]:
# Convert transformed data back to DataFrames
X_train_full_df = pd.DataFrame([X_train_full_transformed], columns=pipeline.named_steps['preprocessor'].get_feature_names_out())
X_valid_df = pd.DataFrame([X_valid_transformed], columns=pipeline.named_steps['preprocessor'].get_feature_names_out())
X_test_df = pd.DataFrame([X_test_transformed], columns=pipeline.named_steps['preprocessor'].get_feature_names_out())

# Save the transformed DataFrames to CSV files
X_train_full_df.to_csv('train_transformed.csv', index=False)
X_valid_df.to_csv('valid_transformed.csv', index=False)
X_test_df.to_csv('test_transformed.csv', index=False)

# Predict and evaluate on the validation set
y_valid_pred = pipeline.predict(X_valid)
mse_valid = mean_squared_error(y_valid, y_valid_pred)
print(f"Mean Squared Error on Validation Set: {mse_valid}")

# Make predictions on the test data
y_test_pred = pipeline.predict(X_test_df)

# Save predictions
predictions = pd.DataFrame({'Id': test_data['Id'], 'SalePrice': y_test_pred})
predictions.to_csv('submission_rfr_pipeline.csv', index=False)

Mean Squared Error on Validation Set: 811576761.1629729


TypeError: 'NoneType' object is not iterable