In [135]:
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import LabelEncoder

In [136]:
# Load the dataset.
# The CSV location can be overridden by setting the ORDERS_CSV_PATH environment
# variable, so the notebook can run on another machine without editing code.
# When the variable is unset, the original hard-coded location is used.
ORDERS_CSV_PATH = os.environ.get(
    'ORDERS_CSV_PATH',
    '/Users/mabuhannood/Downloads/Project/Orders_Cleaned.csv',
)
df = pd.read_csv(ORDERS_CSV_PATH)

In [137]:
# Replace every object-dtype column of `df` with integer labels, keeping the
# fitted encoder for each column in `label_encoders` (keyed by column name).
#
# Values are cast to str before fitting, so each value is encoded by its
# string form (a missing value, if any, becomes its own string label).
#
# NOTE: `df` is overwritten in place. Re-running this cell after the columns
# are already encoded finds no object columns and leaves `label_encoders`
# empty, so run it once per freshly loaded `df`.
categorical_columns = df.select_dtypes(include=['object']).columns
label_encoders = {}
for column_name in categorical_columns:
    encoder = LabelEncoder()
    df[column_name] = encoder.fit_transform(df[column_name].astype(str))
    label_encoders[column_name] = encoder

In [138]:
# Split the frame into the regression target (`y`, the 'Order Amount' column)
# and the feature matrix (`X`, every remaining column).
y = df['Order Amount']
X = df.drop('Order Amount', axis='columns')

### Random Forest Regressor method

In [139]:
# Rank features with a random forest fitted on the full feature matrix.
# random_state is fixed so the ranking is reproducible between runs.
# NOTE(review): `feature_importances_` is impurity-based, which scikit-learn
# documents as potentially favouring high-cardinality features; label-encoded
# columns may be affected -- confirm before relying on the ranking.
rf_model = RandomForestRegressor(random_state=42).fit(X, y)

# Importance of each column, largest first.
feature_importance = (
    pd.Series(rf_model.feature_importances_, index=X.columns)
    .sort_values(ascending=False)
)

# Keep only the columns whose importance is strictly greater than 0.005.
is_important = feature_importance > 0.005
selected_features = feature_importance.loc[is_important].index.tolist()

# Feature matrix restricted to the retained columns (ordered by importance).
X_selected = X.loc[:, selected_features]

# Report the retained column names.
print("Selected Features:", selected_features, sep="\n")

Selected Features:
['Rolls Ordered', 'Est. Extended Cost (Line)', 'Sales Order Name', 'Date Created Day', 'Sales Region', 'Parent Record', 'Memo', 'Actual Ship Date Day']


### SelectKBest method

In [140]:
# Univariate comparison: score every feature against the target with
# f_regression. k='all' means SelectKBest itself drops nothing, so `X_new`
# holds all columns of X; the actual selection is the score threshold below.
selector = SelectKBest(f_regression, k='all')
X_new = selector.fit_transform(X, y)

# F-score of each column, largest first.
scores = pd.Series(selector.scores_, index=X.columns)
scores = scores.sort_values(ascending=False)

# Report the full score table.
print("Feature Scores:", scores, sep="\n")

# Keep the columns whose F-score is strictly greater than 5.
# This cut-off is a manual choice; adjust it as needed.
passes_threshold = scores > 5
selected_features_stat = scores.loc[passes_threshold].index.tolist()

print("\nFeatures selected by SelectKBest:", selected_features_stat, sep="\n")


Feature Scores:
Rolls Ordered                11421.399043
Est. Extended Cost (Line)    10207.433216
Parent Record                   27.302716
Actual Ship Date Year           23.674611
Sales Order Name                22.665008
Date Created Year               21.046153
Sales Region                     5.965821
Memo                             5.957411
Project Use Type                 5.040468
Date Created Day                 2.928927
Actual Ship Date Day             0.683685
Actual Ship Date Month           0.002299
Date Created Month               0.000540
dtype: float64

Features selected by SelectKBest:
['Rolls Ordered', 'Est. Extended Cost (Line)', 'Parent Record', 'Actual Ship Date Year', 'Sales Order Name', 'Date Created Year', 'Sales Region', 'Memo', 'Project Use Type']
