# Modeling & predictions

### Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
#from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
#from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
import xgboost as xgb
#from sklearn.utils import resample
from sklearn.metrics import r2_score

### Load data

In [89]:
prep_X_train = pd.read_csv('data/preprocessed_X_train.csv')
prep_X_train_scaled = pd.read_csv('data/preprocessed_X_train_scaled.csv')

prep_y_train = pd.read_csv('data/preprocessed_y_train.csv')

prep_X_test = pd.read_csv('data/preprocessed_X_test.csv')
prep_X_test_scaled = pd.read_csv('data/preprocessed_X_test_scaled.csv')
orig_X_test = pd.read_csv('data/test.csv')

### Choose only the features with variability in the values
i.e. if only few observations with a different value, the feature is not included.
No a result improving effect.

In [90]:
#prep_X_train_scaled = prep_X_train_scaled.iloc[:, np.r_[2:11,12:15,16,20,22:30,31]].copy()

#prep_X_test_scaled = prep_X_test_scaled.iloc[:,np.r_[2:11,12:15,16,20,22:30,31]].copy()

### Remove rows with high leverage point feature values based on z value
This one does not have an improving effect either...

In [79]:
from scipy.stats import zscore

# Compute the Z-scores for each data point
z_scores = zscore(prep_X_train_scaled)

# Define a threshold for Z-scores (e.g., 3 standard deviations)
threshold = 3.5

# Create a boolean mask for outliers: FALSE = outlier
outliers_mask_z = (abs(z_scores) < threshold).all(axis=1)

# Drop rows with at least one outlier
df_no_outliers_z = prep_X_train_scaled[outliers_mask_z]

# Get the indices of the rows without outliers in df_X
indices_no_outliers_z = df_no_outliers_z.index

# Filter df_y based on the indices
df_y_no_outliers_z = prep_y_train.loc[indices_no_outliers_z]

print(prep_X_train_scaled.shape)
print(df_no_outliers_z.shape)
print(df_y_no_outliers_z.shape)

(27147, 23)
(25048, 23)
(25048, 1)


### XGBoost algorithm tested

No better results than SVR. 

In [91]:
from sklearn.metrics import mean_squared_error

# Assuming you have your features in X_train and continuous labels in y_train
X_train, X_test, y_train, y_test = train_test_split(prep_X_train_scaled, prep_y_train, test_size=0.2, random_state=42)
#X_train, X_test, y_train, y_test = train_test_split(df_no_outliers_z, df_y_no_outliers_z, test_size=0.2, random_state=42) #where high leverage points removed.

# Create DMatrix objects for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)
dtestreal = xgb.DMatrix(prep_X_test_scaled)

# Set XGBoost parameters for regression
params = {
    'objective': 'reg:squarederror',  # Regression task
    'eval_metric': 'rmse',             # Root Mean Squared Error
    'max_depth': 4,
    'learning_rate': 0.1,
    'gamma': 0.1,                       # Regularization parameter
    'subsample': 0.8,                   # Subsample ratio of the training instance
    'colsample_bytree': 0.8,            # Subsample ratio of columns when constructing each tree
    'alpha': 10,                       # L1 regularization term on weights
    'lambda': 0.1                       # L2 regularization term on weights
}

# Train the XGBoost regression model with early stopping
num_rounds = 1000
model = xgb.train(params, dtrain, num_rounds, evals=[(dtest, 'eval')], early_stopping_rounds=20)

# Make predictions on the test set
y_pred = model.predict(dtest)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
#print(f"Root Mean Squared Error (RMSE): {rmse}")
r2 = r2_score(y_test, y_pred)
print(f'R^2 Score = {r2:.4f}')

predictions = model.predict(dtestreal)

results = pd.DataFrame()
results['Id'] = orig_X_test['Id']
results['target'] = predictions

results.head()

results.to_csv('data/results.csv', index=False)

[0]	eval-rmse:2.06673
[1]	eval-rmse:1.95671
[2]	eval-rmse:1.86181
[3]	eval-rmse:1.78120
[4]	eval-rmse:1.70826


[5]	eval-rmse:1.64681
[6]	eval-rmse:1.60321
[7]	eval-rmse:1.56594
[8]	eval-rmse:1.52073
[9]	eval-rmse:1.48123
[10]	eval-rmse:1.45468
[11]	eval-rmse:1.42597
[12]	eval-rmse:1.40132
[13]	eval-rmse:1.38240
[14]	eval-rmse:1.36647
[15]	eval-rmse:1.34682
[16]	eval-rmse:1.33089
[17]	eval-rmse:1.31719
[18]	eval-rmse:1.30679
[19]	eval-rmse:1.29710
[20]	eval-rmse:1.28837
[21]	eval-rmse:1.28075
[22]	eval-rmse:1.27380
[23]	eval-rmse:1.26573
[24]	eval-rmse:1.25979
[25]	eval-rmse:1.25350
[26]	eval-rmse:1.24879
[27]	eval-rmse:1.24352
[28]	eval-rmse:1.23929
[29]	eval-rmse:1.23476
[30]	eval-rmse:1.23116
[31]	eval-rmse:1.22809
[32]	eval-rmse:1.22446
[33]	eval-rmse:1.22068
[34]	eval-rmse:1.21761
[35]	eval-rmse:1.21496
[36]	eval-rmse:1.21245
[37]	eval-rmse:1.20980
[38]	eval-rmse:1.20789
[39]	eval-rmse:1.20408
[40]	eval-rmse:1.20200
[41]	eval-rmse:1.19976
[42]	eval-rmse:1.19771
[43]	eval-rmse:1.19440
[44]	eval-rmse:1.19252
[45]	eval-rmse:1.19077
[46]	eval-rmse:1.18982
[47]	eval-rmse:1.18876
[48]	eval-rmse:1