## Regresja

losowy model

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error, median_absolute_error, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint, uniform
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import StandardScaler

In [2]:
df_train = pd.read_csv('../data_with_features/train_with_features.csv')
df_test = pd.read_csv('../data_with_features/test_with_features.csv')

In [3]:
X = df_train.drop(['cellid', 'phase', 'order_within_phase', 'order'], axis=1).loc[df_train['phase'] == 'G2M']
y = df_train['order_within_phase'].loc[df_train['phase'] == 'G2M']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', RandomForestRegressor(random_state=42))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 10.076098765432098
R-squared: 0.06556674393622441


### Dobór hiperparametrów

In [4]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', RandomForestRegressor(random_state=42))
])


param_dist = {
    'regressor__n_estimators': sp_randint(50, 200),
    'regressor__max_depth': [None] + list(sp_randint(3, 20).rvs(10)),
    'regressor__min_samples_split': sp_randint(2, 20),
    'regressor__min_samples_leaf': sp_randint(1, 20),
    'regressor__max_features': ['sqrt', 'log2', None],
    'regressor__bootstrap': [True, False],
}


random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)


random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
best_params = random_search.best_params_
mae = mean_absolute_error(y_val, y_pred)
medae = median_absolute_error(y_val, y_pred)

y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'regressor__bootstrap': True, 'regressor__max_depth': np.int64(14), 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 11, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 70}
Mean Squared Error: 9.688839475799309
R-squared: 0.10148024254085963
Mean Absolute Error: 2.7274074074074077
Median Absolute Error: 2.745


In [5]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/randomforestregressor.xlsx', index=False)

In [6]:
rand_for_reg = RandomForestRegressor(bootstrap = True,
                                max_depth = np.int64(7),
                                max_features = 'log2',
                                min_samples_leaf = 5,
                                min_samples_split = 10,
                                n_estimators = 148)

In [7]:
rand_for_reg.fit(X_train, y_train)

y_pred_RFR = rand_for_reg.predict(X_val)

tolerance = 0.5  # Predictions within +/- 1 of the actual value are considered "correct"

# custom accuracy
correct_predictions = np.abs(y_val - y_pred_RFR) <= tolerance
accuracy = correct_predictions.mean()
print(accuracy)

0.08024691358024691


GradientBoostingRegressor

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
from scipy.stats import randint as sp_randint, uniform


pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', GradientBoostingRegressor(random_state=42))
])

param_dist = {
    'regressor__n_estimators': sp_randint(50, 200),
    'regressor__learning_rate': uniform(0.01, 0.3),
    'regressor__max_depth': sp_randint(3, 10),
    'regressor__min_samples_split': sp_randint(2, 20),
    'regressor__min_samples_leaf': sp_randint(1, 20),
    'regressor__max_features': ['sqrt', 'log2', None],
}

random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)

random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
best_params = random_search.best_params_

y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
medae = median_absolute_error(y_val, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'regressor__learning_rate': np.float64(0.01208563915935721), 'regressor__max_depth': 5, 'regressor__max_features': 'sqrt', 'regressor__min_samples_leaf': 5, 'regressor__min_samples_split': 2, 'regressor__n_estimators': 100}
Mean Squared Error: 9.814429857903866
R-squared: 0.08983329143287067
Mean Absolute Error: 2.6993005019352068
Median Absolute Error: 2.7750427103111353


In [9]:
pd.DataFrame(random_search.cv_results_).to_excel(
    '../parameter_search_results/random_search_gradientboostingregressor.xlsx', index=False)

In [10]:
gradientboostigreg = GradientBoostingRegressor(
    learning_rate=np.float64(0.01208563915935721),
    max_depth=5,
    max_features='sqrt',
    min_samples_leaf=5,
    min_samples_split=2,
    n_estimators=100)

In [11]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', gradientboostigreg)
])

pipeline.fit(X_train, y_train)

y_pred_GBR = pipeline.predict(X_val)

tolerance = 0.5

# custom accuracy
correct_predictions = np.abs(y_val - y_pred_GBR) <= tolerance
accuracy = correct_predictions.mean()
print(accuracy)

0.09259259259259259


Regresja Liniowa

In [12]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', LinearRegression())
])

param_dist = {
    'regressor__fit_intercept': [True, False],
    'regressor__positive': [True, False],
}


random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_dist,
    n_iter=4,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error'
)


random_search.fit(X_train, y_train)

best_model = random_search.best_estimator_
best_params = random_search.best_params_

y_pred = best_model.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
medae = median_absolute_error(y_val, y_pred)

print(f"Best Hyperparameters: {best_params}")
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Hyperparameters: {'regressor__positive': True, 'regressor__fit_intercept': False}
Mean Squared Error: 10.382347385844934
R-squared: 0.037165980684561895
Mean Absolute Error: 2.7932967955269894
Median Absolute Error: 2.835099433538147


In [13]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/random_search_lineralregression.xlsx',
                                                 index=False)

In [14]:
lin_reg = LinearRegression(fit_intercept=False,
                           positive=True)

In [15]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', lin_reg)
])

pipeline.fit(X_train, y_train)

y_pred_LINREG = pipeline.predict(X_val)

tolerance = 0.5

# custom accuracy
correct_predictions = np.abs(y_val - y_pred_LINREG) <= tolerance
accuracy = correct_predictions.mean()
print(accuracy)

0.10493827160493827


Regresja logistyczna

In [16]:
y_numeric_train = pd.factorize(y_train)[0]
y_numeric_val = pd.factorize(y_val)[0]


pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values with the mean
    ('scaler', StandardScaler()),
    ('log_reg', LogisticRegression(max_iter=10000))
])

# Randomized Search
param_dist = {
    'log_reg__C': uniform(0.1, 10),
    'log_reg__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'log_reg__penalty': ['l2']  # Only use 'l2' penalty
}

random_search = RandomizedSearchCV(
    estimator=pipeline,  # Use the pipeline as the estimator
    param_distributions=param_dist,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='accuracy'
)

random_search.fit(X_train, y_numeric_train)

print("Best parameters from Randomized Search:", random_search.best_params_)
print("Best score from Randomized Search:", random_search.best_score_)

# Grid Search around best parameters from Randomized Search
best_params_random = random_search.best_params_
param_grid = {
    'log_reg__C': [best_params_random['log_reg__C'] - 0.5,
                    best_params_random['log_reg__C'],
                    best_params_random['log_reg__C'] + 0.5],
    'log_reg__solver': [best_params_random['log_reg__solver']],
    'log_reg__penalty': ['l2']
}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
    scoring='accuracy'
)

grid_search.fit(X_train, y_numeric_train)

print("Best parameters from Grid Search:", grid_search.best_params_)
print("Best score from Grid Search:", grid_search.best_score_)

best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_val)
accuracy = accuracy_score(y_numeric_val, y_pred)
print("Accuracy on validation set:", accuracy)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters from Randomized Search: {'log_reg__C': np.float64(1.0541011649041132), 'log_reg__penalty': 'l2', 'log_reg__solver': 'saga'}
Best score from Randomized Search: 0.11879546809779366
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best parameters from Grid Search: {'log_reg__C': np.float64(1.0541011649041132), 'log_reg__penalty': 'l2', 'log_reg__solver': 'saga'}
Best score from Grid Search: 0.11879546809779366
Accuracy on validation set: 0.10493827160493827


In [17]:
pd.DataFrame(random_search.cv_results_).to_excel('../parameter_search_results/random_search_logisticregression.xlsx',
                                                 index=False)

In [18]:
log_reg = LogisticRegression(max_iter=10000, C=1.0541011649041132, penalty='l2', solver='saga')

In [19]:
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('log_reg', log_reg)
])

pipeline.fit(X_train, y_train)

y_pred_LINREG = pipeline.predict(X_val)

tolerance = 0.5

# custom accuracy
correct_predictions = np.abs(y_val - y_pred_LINREG) <= tolerance
accuracy = correct_predictions.mean()
print(accuracy)

0.12962962962962962


In [20]:
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
medae = median_absolute_error(y_val, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
print(f"Mean Absolute Error: {mae}")
print(f"Median Absolute Error: {medae}")

Mean Squared Error: 23.253086419753085
R-squared: -1.156435517611805
Mean Absolute Error: 3.9074074074074074
Median Absolute Error: 3.0
