In [7]:
!pip install striprtf

Collecting striprtf
  Downloading striprtf-0.0.29-py3-none-any.whl.metadata (2.3 kB)
Downloading striprtf-0.0.29-py3-none-any.whl (7.9 kB)
Installing collected packages: striprtf
Successfully installed striprtf-0.0.29


In [8]:
import json
import re
import warnings

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from striprtf.striprtf import rtf_to_text
warnings.filterwarnings("ignore")

In [9]:
# ========== Step 1: Load Configuration and Data ==========
# Read the RTF-wrapped parameter file exactly as it is stored on disk.
with open("algoparams_from_ui.json.rtf", "r") as file:
    rtf_content = file.read()

# Strip the RTF markup so only the JSON text remains, then parse it.
plain_text = rtf_to_text(rtf_content)
json_data = json.loads(plain_text)

# Pull out the configuration sections that the later cells read.
design_data = json_data["design_state_data"]
target_config = design_data["target"]
prediction_type, target_column = (
    target_config["prediction_type"],
    target_config["target"],
)
feature_config, reduction_config = (
    design_data["feature_handling"],
    design_data["feature_reduction"],
)
algorithm_config, hyperparam_config = (
    design_data["algorithms"],
    design_data["hyperparameters"],
)

# Dataset the configuration is applied to.
df = pd.read_csv("iris.csv")

In [10]:
# ========== Step 2: Feature Handling (Imputation + Encoding) ==========
# Walk the feature configuration once and sort every selected feature into a
# numerical or a categorical bucket. Each numerical feature gets its own
# imputer, built from the strategy configured for that feature.
selected_features = []
numerical_features = []
categorical_features = []
imputers = {}

for fname, conf in feature_config.items():
    if not conf["is_selected"]:
        continue
    # The target can itself be listed (and selected) in the feature config.
    # Keeping it among the inputs would hand the model the answer, so it is
    # never treated as a feature.
    if fname == target_column:
        continue
    selected_features.append(fname)

    ftype = conf["feature_variable_type"]
    details = conf["feature_details"]

    if ftype == "numerical":
        numerical_features.append(fname)
        # "Average of values" maps to mean imputation; anything else falls back
        # to a constant fill. fill_value is ignored by the 'mean' strategy.
        strategy = 'mean' if details["impute_with"] == "Average of values" else 'constant'
        fill_value = details.get("impute_value", 0)
        imputers[fname] = SimpleImputer(strategy=strategy, fill_value=fill_value)
    elif ftype == "text":
        categorical_features.append(fname)

# Apply every column's own imputer. Previously the `imputers` dict was built
# but never used, so all numerical columns were mean-imputed regardless of the
# configuration. Transformer names are index-based because ColumnTransformer
# step names must not contain "__", which a feature name could.
num_transformer = ColumnTransformer(transformers=[
    (f"impute_{idx}", imputers[fname], [fname])
    for idx, fname in enumerate(numerical_features)
])
cat_transformer = Pipeline(steps=[
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Output column order: numerical features (in configuration order) followed by
# the one-hot encoded categorical features.
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transformer, numerical_features),
    ('cat', cat_transformer, categorical_features)
])

In [11]:
# ========== Step 3: Feature Reduction ==========
# Build the pipeline step that sits between preprocessing and the model.
# "passthrough" tells sklearn's Pipeline to skip the step entirely.
reduction_method = reduction_config["feature_reduction_method"]
if reduction_method == "No Reduction":
    reducer = "passthrough"
elif reduction_method == "PCA":
    reducer = PCA(n_components=int(reduction_config["num_of_features_to_keep"]))
elif reduction_method == "Tree-based":
    # Rank features by the importances of a fitted random forest and keep the
    # top k. threshold=-np.inf disables the importance cut-off so that only
    # max_features decides how many columns survive. (The previous
    # SelectKBest/f_regression step was a univariate F-test, not tree-based.)
    # NOTE(review): if the reduction config also carries tree count / depth
    # settings, pass them to the forest here - confirm against the JSON schema.
    reducer = SelectFromModel(
        RandomForestRegressor(random_state=42),
        threshold=-np.inf,
        max_features=int(reduction_config["num_of_features_to_keep"]),
    )
else:
    # Unknown method: keep running without reduction, but say so instead of
    # silently ignoring the configuration.
    print(f"⚠️ Unsupported feature reduction method: {reduction_method}; using no reduction")
    reducer = "passthrough"

In [13]:
# ========== Step 4: Model Selection ==========
# Turn every selected algorithm in the configuration into an
# (name, estimator, param_grid) triple for the grid search in the training
# cell. Grid keys carry the "model__" prefix because the estimator is the
# pipeline step named 'model'.

def _int_grid(low, high, step=1):
    """Return the integers low, low + step, ... that do not exceed high."""
    return list(range(low, high + 1, step))


def _float_grid(low, high, num):
    """Return up to `num` evenly spaced values in [low, high], rounded to 2 decimals.

    Rounding can collapse neighbouring values into one, so duplicates are
    dropped to avoid fitting the same candidate more than once.
    """
    return sorted({float(round(x, 2)) for x in np.linspace(low, high, num)})


def _subsample_grid(low, high):
    """Return subsample fractions between low and high that sklearn accepts.

    GradientBoostingRegressor requires 0 < subsample <= 1. Values outside that
    interval are discarded; if nothing valid remains, 1.0 (no subsampling) is used.
    """
    valid = [v for v in _float_grid(low, high, 2) if 0.0 < v <= 1.0]
    return valid or [1.0]


models = []
param_grids = {}

if prediction_type != "Regression":
    # Only regression estimators are wired up below; make the skip visible.
    print(f"⚠️ Unsupported prediction type: {prediction_type}; no models configured")

for model_key, model_conf in algorithm_config.items():
    if not model_conf["is_selected"] or prediction_type != "Regression":
        continue

    if model_key == "RandomForestRegressor":
        model = RandomForestRegressor(random_state=42)
        param_grid = {
            'model__n_estimators': _int_grid(model_conf["min_trees"], model_conf["max_trees"], 5),
            'model__max_depth': _int_grid(model_conf["min_depth"], model_conf["max_depth"], 5),
            'model__min_samples_leaf': _int_grid(model_conf["min_samples_per_leaf_min_value"],
                                                 model_conf["min_samples_per_leaf_max_value"], 5)
        }

    elif model_key == "LinearRegression":
        model = LinearRegression()
        param_grid = {
            'model__fit_intercept': [True, False]
        }

    elif model_key == "RidgeRegression":
        model = Ridge()
        param_grid = {
            'model__alpha': _float_grid(model_conf["min_regparam"], model_conf["max_regparam"], 3),
            'model__max_iter': _int_grid(model_conf["min_iter"], model_conf["max_iter"], 10)
        }

    elif model_key == "LassoRegression":
        model = Lasso()
        param_grid = {
            'model__alpha': _float_grid(model_conf["min_regparam"], model_conf["max_regparam"], 3),
            'model__max_iter': _int_grid(model_conf["min_iter"], model_conf["max_iter"], 10)
        }

    elif model_key == "ElasticNetRegression":
        model = ElasticNet()
        param_grid = {
            'model__alpha': _float_grid(model_conf["min_regparam"], model_conf["max_regparam"], 2),
            'model__l1_ratio': _float_grid(model_conf["min_elasticnet"], model_conf["max_elasticnet"], 2),
            'model__max_iter': _int_grid(model_conf["min_iter"], model_conf["max_iter"], 10)
        }

    elif model_key == "GBTRegressor":
        # Seeded like the random forest so repeated runs give the same scores.
        model = GradientBoostingRegressor(random_state=42)
        param_grid = {
            'model__n_estimators': _int_grid(model_conf["min_iter"], model_conf["max_iter"], 10),
            'model__max_depth': _int_grid(model_conf["min_depth"], model_conf["max_depth"]),
            'model__learning_rate': _float_grid(model_conf["min_stepsize"], model_conf["max_stepsize"], 2),
            # An integer range here produced fractions above 1 (rejected by
            # sklearn) and raised on float bounds.
            'model__subsample': _subsample_grid(model_conf["min_subsample"], model_conf["max_subsample"])
        }

    elif model_key == "DecisionTreeRegressor":
        # Seeded like the random forest so repeated runs give the same scores.
        model = DecisionTreeRegressor(random_state=42)
        param_grid = {
            'model__max_depth': _int_grid(model_conf["min_depth"], model_conf["max_depth"]),
            # GridSearchCV needs a sequence of candidates; accept both a single
            # value and a list from the configuration.
            'model__min_samples_leaf': np.atleast_1d(model_conf["min_samples_per_leaf"]).tolist()
        }

    else:
        print(f"⚠️ Unsupported or unhandled regression model: {model_key}")
        continue

    models.append((model_key, model, param_grid))


In [14]:
# ========== Step 5: Train/Test Split ==========
# Target vector and feature matrix come straight from the loaded frame;
# 20% of the rows are held out for evaluation, with a fixed seed.
y = df[target_column]
X = df[selected_features]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [15]:
# ========== Step 6: Model Training and Evaluation ==========
# For every configured model: chain preprocessing, feature reduction and the
# estimator, grid-search its parameters with 3-fold CV on the training split,
# then report the tuned model's metrics on the held-out test split.
for name, model, param_grid in models:
    stages = [
        ('preprocessor', preprocessor),
        ('reduction', reducer),
        ('model', model),
    ]
    pipe = Pipeline(steps=stages)

    search = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, verbose=1)
    search.fit(X_train, y_train)

    # predict() uses the best estimator refit by the search.
    y_pred = search.predict(X_test)
    print(f"\n===== Model: {name} =====")
    print("Best Parameters:", search.best_params_)
    for label, metric in (
        ("R2 Score:", r2_score),
        ("MSE:", mean_squared_error),
        ("MAE:", mean_absolute_error),
    ):
        print(label, metric(y_test, y_pred))

Fitting 3 folds for each of 12 candidates, totalling 36 fits

===== Model: RandomForestRegressor =====
Best Parameters: {'model__max_depth': 20, 'model__min_samples_leaf': 10, 'model__n_estimators': 20}
R2 Score: 0.9632025216126167
MSE: 0.023390521567375724
MAE: 0.1243853175576651
