In [1]:
import pandas as pd

# Load the prostate cancer dataset from the repository-relative path.
data = pd.read_csv(filepath_or_buffer="datasets/prostate_cancer.txt")

# Bare expression so the notebook renders the frame (pandas truncates the
# display of long frames to the first and last rows).
data

Unnamed: 0,id,lcavol,lweight,age,lbph,svi,lcp,gleason,pgg45,lpsa,train
0,1,-0.579818,2.769459,50,-1.386294,0,-1.386294,6,0,-0.430783,T
1,2,-0.994252,3.319626,58,-1.386294,0,-1.386294,6,0,-0.162519,T
2,3,-0.510826,2.691243,74,-1.386294,0,-1.386294,7,20,-0.162519,T
3,4,-1.203973,3.282789,58,-1.386294,0,-1.386294,6,0,-0.162519,T
4,5,0.751416,3.432373,62,-1.386294,0,-1.386294,6,0,0.371564,T
...,...,...,...,...,...,...,...,...,...,...,...
92,93,2.830268,3.876396,68,-1.386294,1,1.321756,7,60,4.385147,T
93,94,3.821004,3.896909,44,-1.386294,1,2.169054,7,40,4.684443,T
94,95,2.907447,3.396185,52,-1.386294,1,2.463853,7,10,5.143124,F
95,96,2.882564,3.773910,68,1.558145,1,1.558145,7,80,5.477509,T


In [2]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97 entries, 0 to 96
Data columns (total 11 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       97 non-null     int64  
 1   lcavol   97 non-null     float64
 2   lweight  97 non-null     float64
 3   age      97 non-null     int64  
 4   lbph     97 non-null     float64
 5   svi      97 non-null     int64  
 6   lcp      97 non-null     float64
 7   gleason  97 non-null     int64  
 8   pgg45    97 non-null     int64  
 9   lpsa     97 non-null     float64
 10  train    97 non-null     object 
dtypes: float64(5), int64(5), object(1)
memory usage: 8.5+ KB


In [3]:
# Feature matrix: everything except the row identifier (`id`), the regression
# target (`lpsa`) and the `train` column (T/F flags; presumably a predefined
# split marker -- verify against the dataset description).
x = data.drop(columns=["id", "lpsa", "train"])

# Regression target.
y = data["lpsa"]

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Hold out 30% of the rows for evaluation; the fixed seed makes the random
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [5]:
from sklearn.linear_model import LinearRegression

# Baseline: ordinary least squares fitted on the training split.
model = LinearRegression()
model.fit(x_train, y_train)

# Predict on the held-out split only; all metrics below are test-set metrics.
y_pred = model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)   # mean squared error
mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
r2 = r2_score(y_test, y_pred)              # coefficient of determination

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")

MSE: 0.4612959596185864
MAS: 0.530383981112604
R^2: 0.6039185108581981


In [6]:
from sklearn.linear_model import Ridge

# Ridge regression: least squares with an L2 penalty on the coefficients.
# `alpha` is the penalty strength and the hyperparameter to tune; explore it on
# a coarse grid first (1, 10, 100, ...) and then refine around the best value.
# NOTE(review): the features are passed in unscaled, and the penalty depends on
# feature scale -- confirm that skipping standardization is intended.
# NOTE(review): if alpha was chosen by comparing scores on x_test, the metrics
# printed below are optimistic; a validation split or cross-validation on the
# training data would avoid that.
ridge_model = Ridge(alpha=18.3)
ridge_model.fit(x_train, y_train)

# Predict on the held-out split only; all metrics below are test-set metrics.
y_pred = ridge_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)   # mean squared error
mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
r2 = r2_score(y_test, y_pred)              # coefficient of determination

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")

MSE: 0.4434714440600844
MAS: 0.4880888311977972
R^2: 0.6192231336679883


In [7]:
from sklearn.linear_model import Lasso

# Lasso regression: least squares with an L1 penalty on the coefficients.
# `alpha` is the penalty strength and the hyperparameter to tune. The value
# used here (0.02) lies below a 0.1, 0.2, 0.3, ... grid, so the search should
# also cover smaller values (e.g. 0.01, 0.02, 0.05, ...).
# NOTE(review): the features are passed in unscaled, and the penalty depends on
# feature scale -- confirm that skipping standardization is intended.
# NOTE(review): if alpha was chosen by comparing scores on x_test, the metrics
# printed below are optimistic; a validation split or cross-validation on the
# training data would avoid that.
lasso_model = Lasso(alpha=0.02)
lasso_model.fit(x_train, y_train)

# Predict on the held-out split only; all metrics below are test-set metrics.
y_pred = lasso_model.predict(x_test)

mse = mean_squared_error(y_test, y_pred)   # mean squared error
mae = mean_absolute_error(y_test, y_pred)  # mean absolute error
r2 = r2_score(y_test, y_pred)              # coefficient of determination

print(f"MSE: {mse}")
print(f"MAE: {mae}")
print(f"R^2: {r2}")

MSE: 0.44911791467233325
MAS: 0.5180574554138159
R^2: 0.6143749175892173


---
## Results (R2 score):
- **LinearRegression:**   0.6039185108581981
- **Ridge(alpha=18.3):**  0.6192231336679883
- **Lasso(alpha=0.02):**  0.6143749175892173