In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

# Read the sampled training set into memory (point this path at your own copy of the file).
train_data = pd.read_csv(filepath_or_buffer="/content/train_sampled.csv")

In [None]:
# Target: the column we want to predict.
y = train_data["price_doc"]

# Features: everything except the target and the identifier/timestamp columns.
# errors="ignore" lets the drop succeed even if one of those columns is absent.
X = train_data.drop(columns=["price_doc", "id", "timestamp"], errors="ignore")

# Impute missing values in numeric columns with that column's median.
# Non-numeric columns are left untouched here; an alternative would be to fill
# them with the per-column mode, e.g. X[cols].fillna(X[cols].mode().iloc[0]).
numeric_columns = X.select_dtypes(include=["number"]).columns
numeric_medians = X[numeric_columns].median()
X[numeric_columns] = X[numeric_columns].fillna(numeric_medians)

In [None]:

from sklearn.preprocessing import OneHotEncoder  # Import OneHotEncoder

# One-hot encode the categorical (object-dtype) columns of X so that every
# feature handed to the model below is numeric.
categorical_features = X.select_dtypes(include=['object']).columns

# sparse_output=False returns a dense array that can be wrapped in a DataFrame;
# handle_unknown='ignore' keeps transform() from raising on categories that
# were not seen during fit.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Guard against an empty categorical set: fitting the encoder on zero columns
# raises, which would otherwise happen whenever this cell is re-run after X
# has already been encoded (or when the data has no object columns at all).
if len(categorical_features) > 0:
    encoded_features = encoder.fit_transform(X[categorical_features])

    # Reuse X's index so the column-wise concat below lines rows up one-to-one.
    # Without it the encoded frame gets a fresh 0..n-1 index and any
    # non-default index on X would produce misaligned rows filled with NaN.
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=encoder.get_feature_names_out(categorical_features),
        index=X.index,
    )

    # Swap the original categorical columns for their encoded counterparts.
    X = pd.concat([X.drop(columns=categorical_features), encoded_df], axis=1)

# Feature selection: fit a random forest on the encoded features and keep the
# features whose importance is at least the median importance
# (threshold="median"). random_state is fixed so the selection is reproducible.
#
# The forest is fitted exactly once; fitting it a second time with the same
# seed and data yields the same selector and only doubles the runtime.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)
select_from_model = SelectFromModel(estimator=random_forest, threshold="median")
select_from_model.fit(X, y)

# Column labels that the fitted selector kept (boolean support mask over X's columns).
support_mask = select_from_model.get_support()
selected_features = X.columns[support_mask]

# Report the retained feature names.
print("Selected features by SelectFromModel:", selected_features, sep="\n")

# Reduced feature matrix containing only the retained columns, written to disk
# without the index column.
X_selected = X[selected_features]
X_selected.to_csv(path_or_buf="selected_features.csv", index=False)


Selected features by SelectFromModel:
Index(['full_sq', 'life_sq', 'floor', 'build_year', 'num_room', 'kitch_sq',
       'area_m', 'indust_part', 'preschool_education_centers_raion',
       'children_school',
       ...
       'cafe_count_5000_na_price', 'cafe_count_5000_price_500',
       'cafe_count_5000_price_2500', 'church_count_5000', 'leisure_count_5000',
       'sport_count_5000', 'market_count_5000', 'radiation_raion_yes',
       'big_market_raion_no', 'ecology_good'],
      dtype='object', length=165)
