In [1]:
import polars as pl
from sklearn.linear_model import LinearRegression 
from sklearn.feature_selection import RFE
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
dir = r"A:\fyp\data\\"

train_df = pl.read_csv(dir+"training_dataset.csv")
test_df =pl.read_csv(dir+"testing_dataset.csv")

# Function score

In [3]:
def preprocess(df, name = 'func'):
    impaired_list = [item for item in train_df.columns if item.startswith('impaired')]
    feature_df = df.drop(['SEQN', 'func_score'] + impaired_list)
    if name == 'func':
        target_df = df.select(pl.col('func_score'))
    else:
        target_df = df.select(pl.col(impaired_list))
    return feature_df, target_df

In [4]:
def eval_model(true, pred):
    print(f'R^2 : {round(r2_score(true, pred), 4)}') 
    print(f'MAE : {round(mean_absolute_error(true, pred), 4)}') 
    print(f'RMSE : {round(mean_squared_error(true, pred), 4)}') 

In [5]:
x_train, y_train = preprocess(train_df)
x_test, y_test = preprocess(test_df)

model = LinearRegression()
model.fit(x_train, y_train)
predictions = model.predict(x_test) 
print("Train")
eval_model(y_train, model.predict(x_train))
print()
print("Test")
eval_model(y_test, predictions)

Train
R^2 : 0.0612
MAE : 3.7834
RMSE : 30.0296

Test
R^2 : 0.066
MAE : 3.4775
RMSE : 23.0417


In [6]:
model = LinearRegression()
rfe = RFE(estimator=model)

r_squared_scores = []
for n in range(len(x_train.columns)):
    rfe.n_features_to_select = n + 1
    rfe.fit(x_train, y_train)
    x_train_selected = rfe.transform(x_train)
    x_test_selected = rfe.transform(x_test)
    model.fit(x_train_selected, y_train) 
    y_pred = model.predict(x_test_selected)
    r_squared_scores.append(r2_score(y_test, y_pred))

for n, score in enumerate(r_squared_scores):
    print(f"Number of Features: {n+1}, R-squared: {round(score*100, 3)} %")

Number of Features: 1, R-squared: 0.212 %
Number of Features: 2, R-squared: 1.154 %
Number of Features: 3, R-squared: 1.829 %
Number of Features: 4, R-squared: 1.836 %
Number of Features: 5, R-squared: 1.263 %
Number of Features: 6, R-squared: 1.332 %
Number of Features: 7, R-squared: 1.346 %
Number of Features: 8, R-squared: 1.2 %
Number of Features: 9, R-squared: 4.807 %
Number of Features: 10, R-squared: 7.03 %
Number of Features: 11, R-squared: 6.852 %
Number of Features: 12, R-squared: 6.841 %
Number of Features: 13, R-squared: 6.776 %
Number of Features: 14, R-squared: 7.212 %
Number of Features: 15, R-squared: 7.168 %
Number of Features: 16, R-squared: 6.584 %
Number of Features: 17, R-squared: 6.507 %
Number of Features: 18, R-squared: 6.611 %
Number of Features: 19, R-squared: 6.58 %
Number of Features: 20, R-squared: 6.6 %


In [7]:
rfe.n_features_to_select = r_squared_scores.index(max(r_squared_scores))+1
rfe.fit(x_train, y_train)
x_train_selected = rfe.transform(x_train)
x_test_selected = rfe.transform(x_test)
model.fit(x_train_selected, y_train) 
y_pred = model.predict(x_test_selected)

print("Train")
eval_model(y_train, model.predict(x_train_selected))
print()
print("Test")
eval_model(y_test, model.predict(x_test_selected))

Train
R^2 : 0.0582
MAE : 3.7907
RMSE : 30.1262

Test
R^2 : 0.0721
MAE : 3.4807
RMSE : 22.8906


In [8]:
import plotly.graph_objects as go
import numpy as np

true_values = np.array(y_test).flatten()
predicted_values = y_pred.flatten()

fig = go.Figure(data=go.Scatter(x=true_values, y=predicted_values, mode='markers'))
fig.update_layout(
    title='True Values vs. Predicted Values',
    xaxis_title='True Values',
    yaxis_title='Predicted Values',
    width=700,
    height=700,
)

fig.update_xaxes(range = [10,60])
fig.update_yaxes(range = [10,60])
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1}])

fig.show()