In [1]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.discriminant_analysis import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from xgboost import XGBRegressor

target = 'y'

In [2]:
df = pd.read_csv('./outputs/train_clean.csv')
df.shape

(4209, 552)

In [3]:
# Feature Importance
random_forest_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

X = df.copy(deep=True).drop(target, axis=1)
y = pd.Series(df[target])


clf = GridSearchCV(RandomForestRegressor(), random_forest_params, cv=5, n_jobs=-1, verbose=2)
clf.fit(X, y)
clf.best_estimator_.feature_importances_

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.3s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=100; total time=  12.6s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  25.2s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  25.4s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  25.8s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   9.8s
[CV] END m

KeyboardInterrupt: 

In [None]:
# plot 20 most important features

feature_importances = pd.Series(
    clf.best_estimator_.feature_importances_, index=X.columns
)
feature_importances.nlargest(20).sort_values().plot(kind="barh")
sns.despine()

In [None]:
# remove feature with importance < 0.01
X_filtered = X[feature_importances[feature_importances > 0.005].index]
X_filtered.head(5)

In [None]:
regressors = {
    "random_forest": {
        "model": RandomForestRegressor(),
        "params": {
            "n_estimators": [100, 200, 300],
            "max_depth": [None, 5, 10, 15, 20],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
        },
    },
    "svr": {
        "model": SVR(),
        "params": {
            "svr__kernel": ["linear", "poly", "rbf", "sigmoid"],
            "svr__degree": [2, 3, 4],
            "svr__C": [0.1, 1, 10],
        },
    },
    "xgboost": {
        "model": XGBRegressor(),
        "params": {
            "positive": [True],
            "n_estimators": [100, 200, 300],
            "max_depth": [3, 4, 5],
            "learning_rate": [0.1, 0.01, 0.001],
        },
    },
    "lasso": {"model": Lasso(), "params": {}},
    "ridge": {"model": LinearRegression(), "params": {"fit_intercept": [True, False]}},
}

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_filtered, y, test_size=0.33, random_state=42)

In [None]:
def run_regressors(
    regressors: dict[str, dict], X_train: pd.DataFrame, y_train: pd.Series
) -> Pipeline:

    for classifier_key, classifier_content in regressors.items():
        if classifier_key == "svr":
            pipeline = make_pipeline(StandardScaler(), classifier_content["model"])
            clf = GridSearchCV(
                estimator=pipeline,
                param_grid=classifier_content["params"],
                cv=5,
                verbose=2,
                scoring="neg_mean_squared_error",
            )
        else:
            clf = GridSearchCV(
                estimator=classifier_content["model"],
                param_grid=classifier_content["params"],
                cv=5,
                verbose=2,
            )
        clf.fit(X_train, y_train)

    return clf.best_estimator_

In [None]:
clf = run_regressors(regressors, X_train=X_train, y_train=y_train)

In [None]:
y_pred = clf.predict(X_test)
acc = clf.score(X_test, y_test)
acc