In [18]:
import pandas as pd
import numpy as np
# Location of the input CSV. It is left empty here as a placeholder: point it at
# the dataset before running, since pandas cannot open an empty path.
path = ''
# Load the dataset once; the cells below read the 'Rate', 'SURFTYPE' and 'FC'
# columns from this frame.
data = pd.read_csv(path)

> Multiple linear regression (OLS)

In [20]:
import statsmodels.api as sm

# Ordinary least squares fit of 'Rate' on every other column of `data`.
# NOTE(review): assumes all columns other than 'SURFTYPE' and 'FC' are numeric
# and free of missing values -- confirm against the dataset.

# One-hot encoding. drop_first=True removes one level per categorical column so
# the dummy columns are not collinear with the intercept added below.
# dtype=float is required: since pandas 2.0 get_dummies emits bool columns by
# default, and a frame mixing bool and numeric columns converts to an object
# array, which statsmodels rejects ("Pandas data cast to numpy dtype of object").
data_encoded = pd.get_dummies(
    data,
    columns=['SURFTYPE', 'FC'],
    drop_first=True,
    dtype=float,
)

# Separating the response variable 'Rate' and the predictors
X = data_encoded.drop('Rate', axis=1)
y = data_encoded['Rate']
# statsmodels does not include an intercept unless a constant column is added
X = sm.add_constant(X)

# Fit the multiple linear regression model
model = sm.OLS(y, X).fit()
# print(model.summary())

> Partial Least Squares (PLS) Regression

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.cross_decomposition import PLSSVD

# Split the frame into the predictors and the 'Rate' response.
y = data['Rate']
X = data.drop(columns='Rate')

# Categorical columns are one-hot encoded; every other column is passed through
# untouched. sparse_threshold=0 forces a dense matrix out of the transformer.
categorical_features = ['SURFTYPE', 'FC']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    remainder='passthrough',
    sparse_threshold=0,
)

# Single-component PLS-SVD (raise n_components to extract more directions)
pls_svd = PLSSVD(n_components=1)

# Chain the encoding and the decomposition, then fit on the full dataset
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pls', pls_svd),
])
pipeline.fit(X, y)

# The X-side weights of the fitted decomposition (called "loadings" in this notebook)
fitted_pls = pipeline.named_steps['pls']
loadings = fitted_pls.x_weights_

# Rebuild the column labels of the transformed matrix: encoded categorical
# columns come first, followed by the passthrough columns in their original order.
fitted_encoder = pipeline.named_steps['preprocessor'].named_transformers_['cat']
encoded_names = fitted_encoder.get_feature_names_out(input_features=categorical_features)
passthrough_names = [col for col in X.columns if col not in categorical_features]
feature_names = np.concatenate([encoded_names, passthrough_names])

# One row per transformed feature, one column per PLS component
loadings_df = pd.DataFrame(data=loadings, index=feature_names)

# Displaying the loadings
# print(loadings_df)

> Ridge regression with cross-validated regularization strength

In [24]:
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error

# Ridge regression of 'Rate' on the remaining columns, with the penalty strength
# chosen by RidgeCV and the error measured on a held-out test split.

# Separate the features and the target
X = data.drop('Rate', axis=1)
y = data['Rate']

# One-hot encoding for categorical variables; all other columns pass through unchanged
categorical_features = ['SURFTYPE', 'FC']
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', one_hot_encoder, categorical_features)
    ],
    remainder='passthrough'
)

# Split the dataset: 20% held out for evaluation, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Ridge regression with cross-validation over 13 log-spaced penalties (1e-6 .. 1e6).
# The former `store_cv_values=True` flag is intentionally omitted: the stored
# values were never read here, and the keyword was deprecated in scikit-learn 1.5
# (renamed `store_cv_results`) and later removed, so passing it fails on current
# releases. Pass `store_cv_results=True` if per-alpha CV results are needed.
ridge_cv = RidgeCV(alphas=np.logspace(-6, 6, 13))

# Create and fit the pipeline (the encoder is fitted on the training split only)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', ridge_cv)
])
pipeline.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)

# Extract model details, reading every fitted attribute from the pipeline step
fitted_ridge = pipeline.named_steps['regressor']
best_alpha = fitted_ridge.alpha_
coefficients = fitted_ridge.coef_
# The ColumnTransformer fits its own copy of the encoder, so the fitted instance
# has to be looked up through named_transformers_.
encoded_feature_names = pipeline.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
# Transformed column order: encoded categorical columns first, then the
# passthrough columns in their original order.
feature_names = np.concatenate([encoded_feature_names, np.array(X.drop(categorical_features, axis=1).columns)])
coefficients_with_names = dict(zip(feature_names, coefficients))

# Print results
# print(f'Best alpha: {best_alpha}')
# print(f'Mean Squared Error: {mse}')
# print('Coefficients:')
# for feature, coeff in coefficients_with_names.items():
#     print(f'{feature}: {coeff}')
