# Modeling & predictions

### Import libraries

In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.model_selection import cross_val_predict #cross_val_score, KFold
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.dummy import DummyRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
#from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso
#from sklearn.utils import resample
from sklearn.metrics import r2_score

### Load data

In [10]:
# Training features: the preprocessed matrix and its scaled counterpart
# (files are read in the order listed).
prep_X_train, prep_X_train_scaled = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/preprocessed_X_train.csv',
        'data/preprocessed_X_train_scaled.csv',
    )
)

# Training target, stored in its own file.
prep_y_train = pd.read_csv('data/preprocessed_y_train.csv')

# Test features in both variants, plus the original test file
# (its 'Id' column is used later to label the predictions).
prep_X_test, prep_X_test_scaled, orig_X_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/preprocessed_X_test.csv',
        'data/preprocessed_X_test_scaled.csv',
        'data/test.csv',
    )
)

### Drop all rows with at least one outlier (outlier defined by the quantile-based IQR rule; e.g. 2149 rows remain out of 27147)

In [11]:
# IQR rule applied column by column to the scaled training features:
# a value counts as an outlier when it lies more than 1.5 * IQR below the
# first quartile or more than 1.5 * IQR above the third quartile of its column.
Q1 = prep_X_train_scaled.quantile(0.25)
Q3 = prep_X_train_scaled.quantile(0.75)
IQR = Q3 - Q1

# True for every row that holds an outlier in at least one column
# (too-low and too-high values are checked separately, then combined).
outliers_mask = (
    (prep_X_train_scaled < Q1 - 1.5 * IQR).any(axis=1)
    | (prep_X_train_scaled > Q3 + 1.5 * IQR).any(axis=1)
)

# Keep only the rows that are outlier-free in every column.
df_no_outliers = prep_X_train_scaled.loc[~outliers_mask]

# Row labels of the surviving feature rows ...
indices_no_outliers = df_no_outliers.index

# ... which select the matching target rows by label.
df_y_no_outliers = prep_y_train.loc[indices_no_outliers]

# Shapes, one per line: all rows, filtered features, filtered targets.
print(prep_X_train_scaled.shape, df_no_outliers.shape, df_y_no_outliers.shape, sep='\n')


(27147, 32)
(2149, 32)
(2149, 1)


### Drop all rows with at least one outlier (outlier defined by z-score)

In [51]:
from scipy.stats import zscore

# Column-wise z-scores of the scaled training features
# (zscore standardises along axis 0 by default).
z_scores = zscore(prep_X_train_scaled)

# A value is treated as an outlier when its |z-score| reaches this many
# standard deviations.
threshold = 4

# Boolean row mask: TRUE = keep the row, FALSE = row contains an outlier.
# NOTE: despite its name, this mask marks the rows WITHOUT outliers.
# It is phrased as "no value reaches the threshold" instead of "all values are
# below the threshold": both agree on finite z-scores, but a NaN z-score
# (e.g. from a zero-variance column) compares False either way, and the old
# form therefore discarded every single row in that situation.
outliers_mask_z = ~(abs(z_scores) >= threshold).any(axis=1)

# Drop rows with at least one outlier
df_no_outliers_z = prep_X_train_scaled[outliers_mask_z]

# Row labels of the rows that survived the filter
indices_no_outliers_z = df_no_outliers_z.index

# Select the matching target rows by label
df_y_no_outliers_z = prep_y_train.loc[indices_no_outliers_z]

# Shapes: all rows, filtered features, filtered targets
print(prep_X_train_scaled.shape)
print(df_no_outliers_z.shape)
print(df_y_no_outliers_z.shape)

(27147, 32)
(25166, 32)
(25166, 1)


## Compare basic versions of different models using cross validation

In [29]:
def test_basic_versions_of_models(X, y, random_state, cv=5):
    """Print a cross-validated R^2 score for several untuned baseline regressors.

    Each regressor is evaluated with ``cross_val_predict``; the R^2 score is
    then computed once on the pooled out-of-fold predictions (it is not an
    average of per-fold scores, so it can differ from ``cross_val_score``).

    Parameters
    ----------
    X : feature matrix passed to ``cross_val_predict``.
    y : target values passed to ``cross_val_predict`` and ``r2_score``.
    random_state : seed forwarded to the regressors that accept one
        (Random Forest and Lasso).
    cv : cross-validation setting forwarded to ``cross_val_predict``.
        Defaults to 5, the value that was previously hard-coded.

    Returns
    -------
    None. One line per model is printed.
    """
    regressors = {
        'Dummy Model': DummyRegressor(strategy='mean'),
        'OLS Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(random_state=random_state),
        'SVR': SVR(),
        'Lasso': Lasso(random_state=random_state),
    }

    for name, regressor in regressors.items():
        # Out-of-fold prediction for every row, then a single pooled score.
        y_cv_pred = cross_val_predict(regressor, X, y, cv=cv)
        r2 = r2_score(y, y_cv_pred)
        print(f'{name}: R^2 Score = {r2:.4f}')

### Without standard scaler (CV with k=5)

In [14]:
#test_basic_versions_of_models(X=prep_X_train, y=prep_y_train.values.ravel(), random_state=42)

### With standard scaler (CV with k=5)

In [40]:
# Alternative inputs tried with this cell (disabled): all scaled rows, and the IQR-filtered rows.
#test_basic_versions_of_models(X=prep_X_train_scaled, y=prep_y_train.values.ravel(), random_state=42)
#test_basic_versions_of_models(X=df_no_outliers, y=df_y_no_outliers.values.ravel(), random_state=42) #Outliers IQR

# Active run: the z-score-filtered training rows and their flattened targets.
test_basic_versions_of_models(
    X=df_no_outliers_z,
    y=df_y_no_outliers_z.to_numpy().ravel(),
    random_state=42,
)

Dummy Model: R^2 Score = -0.0195
OLS Linear Regression: R^2 Score = -2458631957274887070941184.0000
Random Forest: R^2 Score = 0.6903
SVR: R^2 Score = 0.7247
Lasso: R^2 Score = 0.2457


## Predict on actual test data

### SVR basic version

In [15]:
# SVR with default hyperparameters, trained on all scaled training rows.
model = SVR()
model.fit(prep_X_train_scaled, prep_y_train.values.ravel())

# Predict the target for the scaled test features.
predictions = model.predict(prep_X_test_scaled)

# Pair each prediction with the Id taken from the original test file.
results = pd.DataFrame({'Id': orig_X_test['Id'], 'target': predictions})

results.head()

Unnamed: 0,Id,target
0,1000001,1.100978
1,1000002,-1.457233
2,1000003,-1.999204
3,1000004,-3.772044
4,1000005,-4.5086


### SVR trained without IQR outliers (n = 2149) → Kaggle R2 = 0.37

In [42]:
# SVR with default hyperparameters, trained only on the IQR-filtered rows.
model = SVR()
model.fit(df_no_outliers, df_y_no_outliers.values.ravel())

# Predict the target for the scaled test features.
predictions = model.predict(prep_X_test_scaled)

# Pair each prediction with the Id taken from the original test file.
results = pd.DataFrame({'Id': orig_X_test['Id'], 'target': predictions})

results.head()

Unnamed: 0,Id,target
0,1000001,-2.348107
1,1000002,-4.411795
2,1000003,-3.748738
3,1000004,-4.478076
4,1000005,-4.823865


### SVR 
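- no outliers [zscore < 2], n = 16586 → Kaggle R2 = 0.65782
- zscore < 2.3, n = 13243 → Kaggle R2 = 0.65538
- zscore < 1.8, n = 24249 → Kaggle R2 = 0.66257

TODO: the row counts n shrink as the threshold grows, although a looser cut-off should keep more rows; re-check which threshold each line belongs to.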

Basic R2 = 0.66329 (66.329 %)

In [49]:
# SVR with default hyperparameters, trained only on the z-score-filtered rows.
model = SVR()
model.fit(df_no_outliers_z, df_y_no_outliers_z.values.ravel())

# Predict the target for the scaled test features.
predictions = model.predict(prep_X_test_scaled)

# Pair each prediction with the Id taken from the original test file.
results = pd.DataFrame({'Id': orig_X_test['Id'], 'target': predictions})

results.head()

Unnamed: 0,Id,target
0,1000001,0.59486
1,1000002,-2.498649
2,1000003,-1.945012
3,1000004,-3.633043
4,1000005,-4.48892


In [50]:
# Write the prediction table (Id, target) to CSV without the index column.
# `results` is reassigned by every prediction cell above, so the file holds
# the output of whichever of those cells was executed last.
results.to_csv('data/results.csv', index=False)