In [4]:
from sklearn.calibration import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.compose import make_column_transformer
from sklearn.kernel_approximation import Nystroem
from sklearn.preprocessing import OneHotEncoder, SplineTransformer, StandardScaler
from imblearn.over_sampling import SVMSMOTE
# from utils import clean_predictor
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.mixture import GaussianMixture
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
import xgboost as xgb 
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier

def get_data_train(DATA_PATH=''):
    return pd.read_csv(DATA_PATH+"train.csv")


def get_data_test(DATA_PATH=''):
    return pd.read_csv(DATA_PATH+"test-full.csv")

def clean_predictor(y_pred, Id=None):
    df_test = get_data_test()
    df_train = get_data_train()

    if Id is not None:
        predictions_df = pd.DataFrame(
            {'Id': Id, 'Cover_Type': y_pred})
    else:   # We assume the prediction are sorted
        predictions_df = pd.DataFrame({'Cover_Type': y_pred})
        predictions_df['Id'] = range(1, len(df_test) + 1)

    # Removing those in df_train
    predictions_df.drop(predictions_df[predictions_df["Id"].isin(
        df_train["Id"])].index, inplace=True)

    # Adding df_train instead
    predictions_df = pd.concat(
        [df_train[['Id', 'Cover_Type']], predictions_df], axis=0, ignore_index=True)

    # Sorting by Id (just in case)
    predictions_df.sort_values("Id", inplace=True)

    return predictions_df


df_test = pd.read_csv("test-full.csv")
df_train = pd.read_csv("train.csv")
predict_true = pd.read_parquet("ground_truth.parquet")["Cover_Type"]
predict_best = pd.read_csv("test_predictions_best.csv")["Cover_Type"]

# Un-one-hot-encoding the categorical variables
soil_types = [f"Soil_Type{i}" for i in range(1, 41)]
wilderness_areas = [f"Wilderness_Area{i}" for i in range(1,5)]
df_test["Wilderness_Area_Synth"] = df_test[wilderness_areas] @ range(1,5)
df_train["Wilderness_Area_Synth"] = df_train[wilderness_areas] @ range(1,5)
df_test["Soil_Type_Synth"] = df_test[soil_types] @ range(1,41)
df_train["Soil_Type_Synth"] = df_train[soil_types] @ range(1,41)
df_train = df_train.drop(columns=wilderness_areas + soil_types)
df_test = df_test.drop(columns=wilderness_areas + soil_types)

### 1. OVERSAMPLING CLASS 2 AND 1
ovs_strat = {1: 30_000, 2: 35_000}

# Separating train 
X_train = df_train.drop(columns=['Cover_Type'], axis=1)
y_train = df_train['Cover_Type']
base_cols = list(X_train.columns)
# X_train.drop(columns=['Slope', 'Hillshade_3pm'], axis=1, inplace=True)
# df_test.drop(columns=['Slope', 'Hillshade_3pm'], axis=1, inplace=True)


# Oversampling
svmsmote = SVMSMOTE(sampling_strategy=ovs_strat, random_state=1)
X_train_synth, y_train_synth = svmsmote.fit_resample(X_train, y_train)
X_train_synth = pd.DataFrame(X_train_synth, columns=X_train.columns)

# Baseline to evaluate
# clf = RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=403)#np.random.randint(500))
# clf = ExtraTreesClassifier(n_estimators=150, max_features=None, min_samples_leaf=1, min_samples_split=2, n_jobs=-1)
clf = xgb.XGBRFClassifier(objective='multi:softmax', eval_metric='mlogloss', num_class=7, max_depth=8, eta=0.1)
# clf = GradientBoostingClassifier(n_estimators=150, max_depth=8)
# cat_col=X_train_synth.select_dtypes(include=object).columns
# clf = HistGradientBoostingClassifier(categorical_features=cat_col)


# clf.fit(X_train_synth, y_train_synth)
# y_pred = clf.predict(df_test)
# predictions_df = clean_predictor(y_pred)
print(f"Current best: {accuracy_score(predict_best, predict_true)}")
# print(f"Base score: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

##### COMBINING
mult_combinator = {"Horizontal_Distance_To_Fire_Points": ["Id", "Elevation", "Horizontal_Distance_To_Roadways", 
                                            "Hillshade_9am", "Hillshade_Noon"],
              "Horizontal_Distance_To_Roadways": ["Id", "Elevation", "Hillshade_Noon"],
              "Id": ["Elevation"]}

new_cols = []
for key in mult_combinator:
    for value in mult_combinator[key]:
        new_cols.append(f"{key} * {value}")
        df_test[f"{key} * {value}"] = df_test[key] * df_test[value]    
        X_train_synth[f"{key} * {value}"] = X_train_synth[key] * X_train_synth[value]   

# # Evaluating        
# clf.fit(X_train_synth, y_train_synth)
# y_pred = clf.predict(df_test)
# predictions_df = clean_predictor(y_pred)
# print(f"With mult features: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# Log and square features
log_combinator = ['Id', 'Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points']
square_combinator = ['Horizontal_Distance_To_Roadways', 'Horizontal_Distance_To_Fire_Points']

for label in log_combinator:
    temp = np.where(df_test[label] > 0, df_test[label], -10)
    df_test[f"log({label})"] = np.log(temp, where=temp > 0)
    temp = np.where(X_train_synth[label] > 0, X_train_synth[label], -10)
    X_train_synth[f"log({label})"] = np.log(temp, where=temp > 0)
    
for label in square_combinator:
    df_test[f"{label}^2"] = df_test[label]**2
    X_train_synth[f"{label}^2"] = X_train_synth[label]**2
    
# clf.fit(X_train_synth, y_train_synth)
# y_pred = clf.predict(df_test)
# predictions_df = clean_predictor(y_pred)
# print(f"With all new features: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")
        
### 2. KMEANS 

# With ID
km_test = KMeans(n_clusters=25, n_init=5, init="k-means++")
km_test.fit_predict(df_test.loc[:, "Id": "Horizontal_Distance_To_Fire_Points"])
df_test["kmean_cluster"] = km_test.labels_
X_train_synth["kmean_cluster"] = km_test.predict(X_train_synth.loc[:, "Id": "Horizontal_Distance_To_Fire_Points"])

# # Evaluating
# clf.fit(X_train_synth[base_cols + new_cols + ["kmean_cluster"]], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + ["kmean_cluster"]])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + kmeansID: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# Without ID
km = KMeans(n_clusters=12, n_init=5, init="k-means++")
df_test["kmean_2"] = km.fit_predict(df_test.loc[:, "Elevation": "Horizontal_Distance_To_Fire_Points"])
X_train_synth["kmean_2"] = km.predict(X_train_synth.loc[:, "Elevation": "Horizontal_Distance_To_Fire_Points"])

# # Evaluating
# clf.fit(X_train_synth[base_cols + new_cols + ["kmean_2"]], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + ["kmean_2"]])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + kmeansNoID: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# # Evaluating both KM
# clf.fit(X_train_synth[base_cols + new_cols + ["kmean_cluster", "kmean_2"]], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + ["kmean_cluster", "kmean_2"]])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + 2 KM: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

### 3. PCA

# With ID
pca_1 = PCA(n_components=4)
pca_cols_1 = ["PCA_1", "PCA_2", "PCA_3", "PCA_4"]
df_test.loc[:, pca_cols_1] = pca_1.fit_transform(df_test.loc[:, "Id": "Horizontal_Distance_To_Fire_Points"])
X_train_synth.loc[:, pca_cols_1] = pca_1.transform(X_train_synth.loc[:, "Id": "Horizontal_Distance_To_Fire_Points"])

# # Evaluating
# clf.fit(X_train_synth[base_cols + new_cols + pca_cols_1], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + pca_cols_1])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + pcaID: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# Without ID
pca_2 = PCA(n_components=2)
pca_cols_2 = ["PCA_5", "PCA_6"]
df_test.loc[:, pca_cols_2] = pca_2.fit_transform(df_test.loc[:, "Elevation": "Horizontal_Distance_To_Fire_Points"])
X_train_synth.loc[:, pca_cols_2] = pca_2.transform(X_train_synth.loc[:, "Elevation": "Horizontal_Distance_To_Fire_Points"]) 

# # Evaluating
# clf.fit(X_train_synth[base_cols + new_cols + pca_cols_2], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + pca_cols_2])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + pcaNoID: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# # Evaluating both PCAs
# clf.fit(X_train_synth[base_cols + new_cols + pca_cols_1 + pca_cols_2], y_train_synth)
# y_pred = clf.predict(df_test[base_cols + new_cols + pca_cols_1 + pca_cols_2])
# predictions_df = clean_predictor(y_pred)
# print(f"New features + 2 PCA: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")

# to_delete = ['kmean_2', ]

# X_train_synth.drop(columns=to_del[:6], inplace=True)
# df_test.drop(columns=to_del[:6], inplace=True)

le = LabelEncoder()
### 4. EVALUATING OVERALL
clf.fit(X_train_synth, le.fit_transform(y_train_synth))
y_pred = clf.predict(df_test)
predictions_df = clean_predictor(y_pred)

# Having it fit the desired format
print(f"New features + All unsupervised: {accuracy_score(predictions_df['Cover_Type']+1, predict_true)}")
predictions_df.to_csv('test_predictions.csv', index=False) 

Current best: 0.8656912421774421
New features + All unsupervised: 0.7833332185910101


In [None]:
from sklearn import metrics
import matplotlib.pyplot as plt

target = 'CoverType'
def modelfit(alg, dtrain, predictors,useTrainCV=True, cv_folds=5, early_stopping_rounds=50):
    
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
            metrics='auc', early_stopping_rounds=early_stopping_rounds, show_progress=False)
        alg.set_params(n_estimators=cvresult.shape[0])
    
    #Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain['Disbursed'],eval_metric='auc')
        
    #Predict training set:
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:,1]
        
    #Print model report:
    print("\nModel Report")
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain['Disbursed'].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain['Disbursed'], dtrain_predprob))
                    
    feat_imp = pd.Series(alg.booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')

predictors = [x for x in train.columns if x not in [target, IDcol]]
xgb1 = xgb.XGBClassifier(
 learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
modelfit(xgb1, train, predictors)

In [11]:
np.arange(100, 301, 20)

array([100, 120, 140, 160, 180, 200, 220, 240, 260, 280, 300])

In [None]:
# clf = RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=403)#np.random.randint(500))
# clf = ExtraTreesClassifier(n_estimators=150, max_features=None, min_samples_leaf=1, min_samples_split=2, n_jobs=-1)

for mss in [2, 10, 30, 50]:
    for msl in [0.01, 0.05, 0.1, 1]:
        clf = xgb.XGBRFClassifier(objective='multi:softmax', eval_metric='mlogloss', num_class=7, max_depth=20, eta=0.1, n_estimators=100,min_samples_split=mss, min_samples_leaf=msl)
        # clf = GradientBoostingClassifier(n_estimators=150, max_depth=8)
        # cat_col=X_train_synth.select_dtypes(include=object).columns
        # clf = HistGradientBoostingClassifier(categorical_features=cat_col)

        le = LabelEncoder()
        ### 4. EVALUATING OVERALL
        clf.fit(X_train_synth, le.fit_transform(y_train_synth))
        y_pred = clf.predict(df_test)
        predictions_df = clean_predictor(y_pred)

        # Having it fit the desired format
        print(f"msl={msl}, mss={mss}: {accuracy_score(predictions_df['Cover_Type']+1, predict_true)}")
        # predictions_df.to_csv('test_predictions.csv', index=False) 

16: 0.8286128341583306
18: 0.8289828781505373
20: 0.8294906129305419


In [45]:
from sklearn.ensemble import BaggingClassifier
clf = xgb.XGBRFClassifier(objective='multi:softmax', eval_metric='mlogloss', num_class=7, max_depth=15, eta=0.1, n_estimators=100)
# clf = ExtraTreesClassifier(n_estimators=150, max_features=None, min_samples_leaf=1, min_samples_split=2, n_jobs=-1)

bagged_trees = BaggingClassifier(
    estimator=clf,
    n_estimators=10,
)
bagged_trees.fit(X_train_synth, le.fit_transform(y_train_synth))
y_pred = bagged_trees.predict(df_test)
predictions_df = clean_predictor(y_pred)

# Having it fit the desired format
print(f": {accuracy_score(predictions_df['Cover_Type']+1, predict_true)}")

: 0.8283856443584642


In [43]:
pred2

Unnamed: 0,Id,Cover_Type
15120,1,4.00
15121,2,4.00
15122,3,1.18
15123,4,2.50
10190,5,5.00
...,...,...
581007,581008,2.00
581008,581009,2.00
581009,581010,2.00
581010,581011,2.00


In [24]:
le.fit_transform(y_train_synth)

array([0, 0, 0, ..., 1, 1, 1])

In [9]:
cat_col = X_train_synth.select_dtypes(include=object).columns
num_col = X_train_synth.select_dtypes(exclude=object).columns
preprocessor = make_column_transformer(
    # (
    #     OneHotEncoder(handle_unknown="ignore", min_frequency=0.01),
    #     cat_col,
    # ),
    (SplineTransformer(), num_col),
)
clf = make_pipeline(
    preprocessor,
    Nystroem(kernel='poly', degree=2, n_components=300),
    # RandomForestClassifier(n_estimators=150, n_jobs=-1, random_state=np.random.randint(500))
    ExtraTreesClassifier(n_estimators=150, random_state=np.random.randint(500),max_features=None, min_samples_leaf=1, min_samples_split=2, n_jobs=-1)
)
### 4. EVALUATING OVERALL
clf.fit(X_train_synth, y_train_synth)
y_pred = clf.predict(df_test)
predictions_df = clean_predictor(y_pred)

# Having it fit the desired format
print(f"New features + All unsupervised: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")
predictions_df.to_csv('test_predictions.csv', index=False) 

New features + All unsupervised: 0.8413698856478007


In [77]:
from sklearn.feature_selection import SelectKBest, chi2, SequentialFeatureSelector

# # assume X is your dataset and y is your target variable
# selector = SelectKBest(chi2, k=20)
# selected_features = selector.fit_transform(X_train_synth, y_train_synth)

sfs = SequentialFeatureSelector(clf, n_features_to_select='auto', tol=0.0001, scoring='accuracy', cv=5)
sfs.fit(X_train_synth, y_train_synth)

In [78]:
sfs.get_support()

array([ True,  True,  True,  True,  True,  True, False, False, False,
       False,  True,  True,  True, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
        True, False,  True,  True,  True, False, False, False])

In [81]:
test = sfs.get_feature_names_out()
test

array(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_Synth',
       'Soil_Type_Synth', 'log(Id)', 'kmean_cluster', 'PCA_1', 'PCA_2',
       'PCA_3'], dtype=object)

In [82]:
clf.random_state

403

In [70]:
to_del[:7]

['kmean_2',
 'Slope',
 'Hillshade_3pm',
 'Horizontal_Distance_To_Fire_Points^2',
 'log(Horizontal_Distance_To_Roadways)',
 'log(Horizontal_Distance_To_Fire_Points)',
 'Horizontal_Distance_To_Fire_Points * Hillshade_Noon']

In [61]:
data = pd.concat([X_train_synth,y_train_synth], axis=1)

In [63]:
tmp = {}
to_del = []
for i in range(15):
    tmp[i] = [clf.feature_names_in_, clf.feature_importances_] 
    
    least_important = list(clf.feature_names_in_).pop(np.argsort(clf.feature_importances_)[0])
    print('Delete: ',least_important)
    X_train_synth.drop(columns=[least_important], inplace=True)
    df_test.drop(columns=[least_important], inplace=True)
    to_del.append(least_important)

    clf.fit(X_train_synth, y_train_synth)
    y_pred = clf.predict(df_test)
    predictions_df = clean_predictor(y_pred)
    # Having it fit the desired format
    print(f"New features + All unsupervised: {accuracy_score(predictions_df['Cover_Type'], predict_true)}")
    # predictions_df.to_csv(f'test_predictions.csv', index=False) 

Delete:  kmean_2
New features + All unsupervised: 0.8624400184505656
Delete:  Slope
New features + All unsupervised: 0.863045857916876
Delete:  Hillshade_3pm
New features + All unsupervised: 0.8625845937777533
Delete:  Horizontal_Distance_To_Fire_Points^2
New features + All unsupervised: 0.8631732218955891
Delete:  log(Horizontal_Distance_To_Roadways)
New features + All unsupervised: 0.862187011627987
Delete:  log(Horizontal_Distance_To_Fire_Points)
New features + All unsupervised: 0.8629047248593833
Delete:  Horizontal_Distance_To_Fire_Points * Hillshade_Noon
New features + All unsupervised: 0.863180106434979
Delete:  Horizontal_Distance_To_Roadways
New features + All unsupervised: 0.8626086896656179
Delete:  Hillshade_Noon
New features + All unsupervised: 0.8628410428700268
Delete:  Horizontal_Distance_To_Fire_Points * Hillshade_9am
New features + All unsupervised: 0.8620166192780871
Delete:  kmean_cluster
New features + All unsupervised: 0.8626121319353128
Delete:  Vertical_Distance

In [66]:
clf.random_state

403

In [65]:
for v in tmp.values():
    print(v[0][np.argsort(v[1])][:5])

['kmean_2' 'Slope' 'Hillshade_3pm'
 'log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points^2']
['Slope' 'Hillshade_3pm' 'log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points^2'
 'Horizontal_Distance_To_Fire_Points']
['Hillshade_3pm' 'Horizontal_Distance_To_Fire_Points'
 'log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points^2'
 'Horizontal_Distance_To_Roadways^2']
['Horizontal_Distance_To_Fire_Points^2'
 'Horizontal_Distance_To_Fire_Points'
 'log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points * Hillshade_Noon'
 'Horizontal_Distance_To_Fire_Points * Hillshade_9am']
['log(Horizontal_Distance_To_Roadways)'
 'log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points' 'Horizontal_Distance_To_Roadways^2'
 'Horizontal_Distance_To_Roadways']
['log(Horizontal_Distance_To_Fire_Points)'
 'Horizontal_Distance_To_Fire_Points'
 'Horizontal_Distance_To_Fire_Points * Hillshade_Noon'


In [42]:
print(clf.random_state)

282


In [5]:
# Delete:  kmean_2
# New features + All unsupervised: 0.8629890604669095
# Delete:  Slope
# New features + All unsupervised: 0.862474441147515
# Delete:  Hillshade_3pm
# New features + All unsupervised: 0.8620871858068336
# Delete:  Horizontal_Distance_To_Fire_Points^2
# New features + All unsupervised: 0.8626379489580249
# Delete:  Horizontal_Distance_To_Roadways
# New features + All unsupervised: 0.8625157483838544
# Delete:  log(Horizontal_Distance_To_Fire_Points)
# New features + All unsupervised: 0.8621457043916477
# Delete:  Horizontal_Distance_To_Fire_Points * Hillshade_Noon
# New features + All unsupervised: 0.8621422621219528
# Delete:  log(Horizontal_Distance_To_Roadways)
# New features + All unsupervised: 0.8617412377024916
# Delete:  kmean_cluster
# New features + All unsupervised: 0.8636413705741017
# Delete:  Horizontal_Distance_To_Fire_Points * Hillshade_9am
# New features + All unsupervised: 0.8632696054470476

# Current best: 0.8656912421774421
# Base score: 0.8482750786558625
# With mult features: 0.8628031779033823
# With all new features: 0.8627274479700936
# New features + kmeansID: 0.8626396700928725
# New features + kmeansNoID: 0.8617963140176107
# New features + 2 KM: 0.8632489518288778
# New features + pcaID: 0.8633195183576243
# New features + pcaNoID: 0.8623247024157848
# New features + 2 PCA: 0.8641921337252931
# New features + All unsupervised: 0.8626276221489402

print(clf.feature_importances_)
print(clf.feature_names_in_)
print(clf.feature_names_in_[:20])

[0.0649774  0.12178027 0.0181831  0.01204682 0.02255579 0.01900052
 0.01640125 0.01746187 0.01657929 0.0141617  0.01447538 0.04568524
 0.05768906 0.02615949 0.01795495 0.01983697 0.01620867 0.01531837
 0.02886415 0.02063302 0.01696466 0.05529533 0.06621734 0.01565964
 0.01464653 0.01611607 0.01499848 0.02053506 0.00405167 0.0655281
 0.03205443 0.01856816 0.03040658 0.02431846 0.01866617]
['Id' 'Elevation' 'Aspect' 'Slope' 'Horizontal_Distance_To_Hydrology'
 'Vertical_Distance_To_Hydrology' 'Horizontal_Distance_To_Roadways'
 'Hillshade_9am' 'Hillshade_Noon' 'Hillshade_3pm'
 'Horizontal_Distance_To_Fire_Points' 'Wilderness_Area_Synth'
 'Soil_Type_Synth' 'Horizontal_Distance_To_Fire_Points * Id'
 'Horizontal_Distance_To_Fire_Points * Elevation'
 'Horizontal_Distance_To_Fire_Points * Horizontal_Distance_To_Roadways'
 'Horizontal_Distance_To_Fire_Points * Hillshade_9am'
 'Horizontal_Distance_To_Fire_Points * Hillshade_Noon'
 'Horizontal_Distance_To_Roadways * Id'
 'Horizontal_Distance_To_Ro

In [7]:
importance_scores = clf.feature_importances_
top_feature_indices = np.argsort(importance_scores)[-15:]
top_feature_names = [X_train_synth.columns.tolist()[i] for i in top_feature_indices]

In [13]:
clf.feature_importances_

array([0.0649774 , 0.12178027, 0.0181831 , 0.01204682, 0.02255579,
       0.01900052, 0.01640125, 0.01746187, 0.01657929, 0.0141617 ,
       0.01447538, 0.04568524, 0.05768906, 0.02615949, 0.01795495,
       0.01983697, 0.01620867, 0.01531837, 0.02886415, 0.02063302,
       0.01696466, 0.05529533, 0.06621734, 0.01565964, 0.01464653,
       0.01611607, 0.01499848, 0.02053506, 0.00405167, 0.0655281 ,
       0.03205443, 0.01856816, 0.03040658, 0.02431846, 0.01866617])

In [19]:
a = clf.feature_names_in_
a

array(['Id', 'Elevation', 'Aspect', 'Slope',
       'Horizontal_Distance_To_Hydrology',
       'Vertical_Distance_To_Hydrology',
       'Horizontal_Distance_To_Roadways', 'Hillshade_9am',
       'Hillshade_Noon', 'Hillshade_3pm',
       'Horizontal_Distance_To_Fire_Points', 'Wilderness_Area_Synth',
       'Soil_Type_Synth', 'Horizontal_Distance_To_Fire_Points * Id',
       'Horizontal_Distance_To_Fire_Points * Elevation',
       'Horizontal_Distance_To_Fire_Points * Horizontal_Distance_To_Roadways',
       'Horizontal_Distance_To_Fire_Points * Hillshade_9am',
       'Horizontal_Distance_To_Fire_Points * Hillshade_Noon',
       'Horizontal_Distance_To_Roadways * Id',
       'Horizontal_Distance_To_Roadways * Elevation',
       'Horizontal_Distance_To_Roadways * Hillshade_Noon',
       'Id * Elevation', 'log(Id)',
       'log(Horizontal_Distance_To_Roadways)',
       'log(Horizontal_Distance_To_Fire_Points)',
       'Horizontal_Distance_To_Roadways^2',
       'Horizontal_Distance_To_Fire