In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

# Cross-validation splitter used by RFECV further down: 5 stratified folds,
# shuffled with a fixed seed so the fold assignment is identical on every run.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=20)

# Load data. header=1 means the column names are taken from the second row
# of the sheet.
excel = pd.read_excel('../Dataset_Process/Dataset/original_data.xlsx', header=1)
data_orginal = pd.DataFrame(excel)

# Columns kept for the analysis, in the order they are displayed below.
selected_columns = [
    'cC', 'cO', 'cN', 'cF', 'cP', 'cS', 'OCr', 'FCr', 'FOr', 'PCr', 'NCr',
    'SCr', 'NFr', 'POr', 'NOr', 'SOr', 'PFr', 'SFr', 'SPr', 'NSr', 'FNS/Or',
    'FP/Or', 'FNS/FPr', 'dendrtic', 'sphere',
    'Li longitudinal growth size', 'Li longitudinal growth maximum size',
    'The difference between the maximum longitudinal growth size and average size',
    'General thickness of deposition', 'Maximum deposition thickness',
    'Li horizontal growth size', 'Li horizontal growth maximum size',
    'The difference between the maximum horizontal growth size and average size',
    'Average deposition density', 'λ value',
]

# Fail early with an actionable message. Indexing with a missing label raises
# a bare "[...] not in index" KeyError, which does not show what the header
# row actually contains (renamed column, stray whitespace, wrong header row).
missing_columns = [col for col in selected_columns if col not in data_orginal.columns]
if missing_columns:
    raise KeyError(
        f"{missing_columns} not found in the header row of the workbook. "
        f"Available columns: {data_orginal.columns.tolist()}"
    )

# Filter and preprocess
data = data_orginal[selected_columns]

# Keep only rows that carry a 'dendrtic' value ...
df = data[data['dendrtic'].notnull()]
# ... and that are flagged as exactly one of the two morphologies. The two
# masks make explicit the grouping the original one-liner obtained from `&`
# binding tighter than `|`.
# NOTE(review): a missing 'sphere' value compares as "!= 0", so a row with
# dendrtic == 0 and sphere NaN passes this filter and becomes sphere == 0
# after fillna(0) below — confirm that is intended.
is_dendritic_only = (df['dendrtic'] != 0) & (df['sphere'] == 0)
is_sphere_only = (df['dendrtic'] == 0) & (df['sphere'] != 0)
df = df[is_dendritic_only | is_sphere_only]
# Rows without a longitudinal growth size are dropped; every other gap is
# filled with 0.
df = df[df['Li longitudinal growth size'].notnull()]
df = df.fillna(0)
display(df)

# Features and labels. The feature order here is deliberately kept as in the
# original cell (it differs from the display order above) because the ranking
# array printed later follows this column order.
X = df[['Li longitudinal growth size', 'Li longitudinal growth maximum size',
        'The difference between the maximum longitudinal growth size and average size',
        'General thickness of deposition', 'Maximum deposition thickness',
        'The difference between the maximum horizontal growth size and average size',
        'Li horizontal growth size', 'Li horizontal growth maximum size',
        'Average deposition density', 'λ value']]
# Target: the 'sphere' column (0 for the rows kept via the dendritic mask,
# non-zero for the rows kept via the sphere mask).
y = df['sphere']

# Hold-out split (80 % train / 20 % test). It is seeded and does not depend on
# the loop variable, so it is built once here instead of being recomputed
# identically on every iteration.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=20)

# Loop for feature selection: five RFECV runs on the same split and the same
# CV folds.
# NOTE(review): RandomForestClassifier() is created without a random_state,
# so the forest's own randomness is the only thing that differs between runs
# and the printed results are not reproducible across executions.
for i in range(5):
    # Initialize Random Forest Classifier as the base model
    clf = RandomForestClassifier()

    # Recursive feature elimination with cross-validation: drop one feature
    # per step and never go below two features.
    selector = RFECV(estimator=clf, step=1, cv=cv, min_features_to_select=2)
    selector.fit(X_train, y_train)

    # Output feature selection results
    selected_features = X.columns[selector.support_]
    print(f"Run {i+1}:")
    print("Selected features:", selected_features.tolist())
    print("Feature rankings (lower is better):", selector.ranking_)
    # selector.score evaluates the final estimator on the held-out test set
    # using only the selected features.
    print("RFECV best score (test set accuracy):", selector.score(X_test, y_test))

    # Output feature importance. selector.estimator_ is the forest refitted on
    # the selected features, so its importances pair with selected_features.
    feature_importance = selector.estimator_.feature_importances_
    print("Feature importances:")
    for feature, importance in zip(selected_features, feature_importance):
        print(f"Feature '{feature}' importance score: {importance}")
    print("-" * 50)


KeyError: "['Li horizontal growth size', 'Li horizontal growth maximum size'] not in index"