In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LassoCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler, QuantileTransformer
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

from sklearnex import patch_sklearn
patch_sklearn()

## Utility functions

In [None]:
# vif calculation
def calc_vif(X):
    """Return a table with the variance inflation factor of every column of X.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its values are passed to statsmodels'
        ``variance_inflation_factor`` one column position at a time.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``variables`` (the column labels of X) and ``VIF``
        (the factor computed for the column at the same position).
    """
    n_columns = X.shape[1]
    factors = []
    for position in range(n_columns):
        factors.append(variance_inflation_factor(X.values, position))

    return pd.DataFrame({"variables": X.columns, "VIF": factors})

# delete correlated columns
def correlation_remover(dataset, threshold):
    """Drop columns that are strongly correlated with an earlier, kept column.

    Columns are visited in order. A column is discarded as soon as its
    absolute correlation with any preceding column that has not itself been
    discarded reaches ``threshold``.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input table; it is not modified.
    threshold : float
        Absolute correlation at or above which the later column is removed.

    Returns
    -------
    tuple
        ``(reduced, removed)`` where ``reduced`` is a new DataFrame without
        the discarded columns and ``removed`` is the set of their labels.
    """
    strength = dataset.corr().abs()
    labels = list(strength.columns)
    removed = set()

    for row, label in enumerate(labels):
        for col in range(row):
            # Only columns that are still kept may knock out a later one.
            if labels[col] in removed:
                continue
            if strength.iat[row, col] >= threshold:
                removed.add(label)
                # One strong partner is enough; further matches change nothing.
                break

    reduced = dataset.drop(columns=list(removed))
    return reduced, removed

 ## Load the Dataset

In [None]:
# Read the feature table and the target table from disk.
X = pd.read_csv('X_train.csv')
y = pd.read_csv('y_train.csv')

# Side-by-side frame (target columns first, then the features), built while
# both tables still contain their id column.
df = pd.concat([y, X], axis=1)

# drop the id columns
X, y = (frame.drop(columns='id') for frame in (X, y))

## Preprocessing

Since the data was artificially augmented and altered, it contains all-zero columns as well as duplicates.
Those were found by using simple pandas functions.
Median imputation was found to work best here, and it is a lot faster than KNN imputation on this dataset, which has a very high-dimensional feature space.

In [None]:
# drop zero cols
X_selection = X.drop(columns=['x193', 'x297', 'x339', 'x629'])
# drop duplicated rows; the surviving rows keep their original index labels
X_selection = X_selection.drop_duplicates()
# Keep the target aligned with the surviving feature rows. Without this,
# every dropped row leaves X_selection shorter than y, so the later
# correlation step pairs the wrong rows and train_test_split fails on
# inconsistent sample counts.
y = y.loc[X_selection.index]
indexes = X_selection.columns
# Median imputation of the missing values. The original index is passed on
# so that X_selection and y keep sharing the same row labels.
imp = SimpleImputer(strategy='median').fit(X_selection)
X_selection = pd.DataFrame(
    imp.transform(X_selection), columns=indexes, index=X_selection.index)


## Feature Selection

The feature selection is actually rather simple, and there are many other possibilities, such as ANOVA, to select the best features.
It is known that the dataset originally had only about 200 features, but the data now contains 800, so we have to find those which were artificially added.
One could use the feature importances of a random forest or the coefficients of Lasso, but those methods performed worse than simply removing the features which correlate
only weakly with the target. To find a good threshold, the performance of out-of-the-box scikit-learn models was measured while varying the threshold.
The results are plotted below.

![](correlation_analysis/corr_vs_score_knn.png)
![](correlation_analysis/corr_vs_score_lasso.png)
![](correlation_analysis/corr_vs_score_sgd.png)
![](correlation_analysis/corr_vs_score_svr.png)

Most algorithms had a peak around a correlation threshold of 0.1, except KNN. Hence, a value around 0.1 was chosen for feature selection.

In [None]:
# Calculate the correlation of every feature with y and keep only the
# features whose absolute correlation reaches the threshold.
# NOTE(review): the correlations use all rows, i.e. they are computed before
# the train/test split further down, so the held-out rows take part in the
# feature selection.
corr_scores = []

# Pearson correlation of each feature column with the first target column,
# obtained in one call instead of building a two-column frame and a full
# correlation matrix per feature.
target_corr = X_selection.corrwith(y.iloc[:, 0])

# corr_y holds one [column position, correlation] pair per feature.
corr_y = [[idx, value] for idx, value in enumerate(target_corr)]
high_corr_idx = [idx for idx, value in corr_y if abs(value) >= 0.1]

print("Selected", len(high_corr_idx), "features.")
X_selection = X_selection.iloc[:, high_corr_idx]
# Among the kept features, drop the later column of any pair whose absolute
# correlation is at least 0.98.
X_selection, removed_cols = correlation_remover(X_selection, 0.98)

One can look at the VIF values, which look okay.

In [None]:
# Compute the VIF table of the selected features and display it with the
# largest factors first.
vif = calc_vif(X_selection)
vif.sort_values(by='VIF', ascending=False)

## Model Training

The dataset is split into 80% train and 20% test set.

In [None]:
# Hold out 20% of the rows for testing; the seed is fixed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X_selection,
    y['y'],
    test_size=0.2,
    random_state=11,
)

The model is composed of SVR, GBR and an extra-trees regressor, stacked by the sklearn StackingRegressor.
The hyperparameters are found with a GridSearch. Note that the initial grids were a lot bigger and only the best ranges are kept.

### SVR

The support vector regressor was found to perform very well with the standard RBF kernel and mean-variance scaling.

In [None]:
# Standardise the features, then fit a support vector regressor.
svr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svr', SVR()),
])

# Candidate values: C in 10^0..10^3, epsilon in 10^-6..10^0, both gamma heuristics.
param_grid = [dict(
    svr__C=np.logspace(0, 3, 4),
    svr__epsilon=np.logspace(-6, 0, 7),
    svr__gamma=['auto', 'scale'],
)]

# 5-fold grid search scored by R^2, using all available cores.
svr_search = GridSearchCV(
    estimator=svr_pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
)
svr_search.fit(X_train, y_train)
print(svr_search.best_params_)

# Keep the best pipeline found by the search and report its R^2 on the
# held-out split.
svr_model = svr_search.best_estimator_
print(r2_score(y_test, svr_model.predict(X_test)))


### GBR

The gradient boosting regressor adds another approach and should make the total model more robust.
The quantile scaler was used, with adapted quantile sizes to fit the dataset better.
This was found to work better than the StandardScaler, probably because boosting is known to be sensitive to outliers.
The quantile size value was found empirically.

In [None]:
# Number of CV folds, used both for sizing the quantiles and for the search.
cv_val = 5
# Larger candidate for the number of quantiles: 2/cv_val of the training rows,
# truncated to an integer.
n_qantiles_full = int(X_train.shape[0] * 2 / cv_val)

# Map the features to a normal distribution via quantiles, then fit gradient
# boosting with a fixed seed.
gb_model = Pipeline(steps=[
    ('scaler', QuantileTransformer(output_distribution="normal")),
    ('gb', GradientBoostingRegressor(random_state=42)),
])

param_grid = [dict(
    gb__n_estimators=[1000],
    gb__min_samples_split=[2, 3],
    gb__min_samples_leaf=[2, 3],
    gb__learning_rate=[0.1],
    gb__max_depth=[3, 4],
    gb__max_features=['sqrt'],
    # Try half of the quantile budget as well as the full budget.
    scaler__n_quantiles=[int(n_qantiles_full / 2), n_qantiles_full],
)]

search = GridSearchCV(gb_model, param_grid, cv=cv_val, n_jobs=-1)
search.fit(X_train, y_train)
print(search.best_params_)

# Continue with the best pipeline and report its R^2 on the held-out split.
gb_model = search.best_estimator_
score = r2_score(y_test, gb_model.predict(X_test))
print(score)

### Extra Trees

In [None]:
# Tune an extra-trees regressor; it is the third base model of the ensemble.
param_grid = [{
    'estimator__n_estimators': [1000],
    # 1.0 means "use every feature at each split". That is what 'auto' stood
    # for on regressors; 'auto' itself was deprecated in scikit-learn 1.1 and
    # removed in 1.3, where it makes the search fail.
    'estimator__max_features': [1.0, 'log2', 'sqrt']
}]

adb_model = Pipeline([
    ('scaler', StandardScaler()),
    # Fixed seed so that the search and the reported score are reproducible,
    # in line with the gradient boosting model.
    ('estimator', ExtraTreesRegressor(random_state=42))
])

# 5-fold grid search on all cores; verbose=1 prints the search progress.
search = GridSearchCV(adb_model, param_grid, cv=5, n_jobs=-1, verbose=1)
search.fit(X_train, y_train)
print(search.best_params_)
# Continue with the best pipeline and report its R^2 on the held-out split.
adb_model = search.best_estimator_

score = r2_score(y_test, adb_model.predict(X_test))
print(score)

### Ensemble

An ensemble should make the model more robust and perform better in general. They are also very often amongst the top models in Kaggle.
Before fitting the final Regressor, the models are trained on the whole available data.

In [None]:
# Refit every tuned base model on all available rows (train and test split).
# NOTE(review): StackingRegressor presumably clones its estimators and refits
# the clones itself, in which case these fits do not carry over - confirm.
for base_model in (svr_model, gb_model, adb_model):
    base_model.fit(X_selection, y['y'])

# (name, model) pairs in the form expected by StackingRegressor.
estimators = list(zip(
    ('knn', 'svr', 'gb'),
    (adb_model, svr_model, gb_model),
))


Fitting the stacking regression model with LassoCV, which has built-in cross validation.

In [None]:
# Meta-learner: a one-step pipeline holding a cross-validated Lasso.
final_pipeline = Pipeline(steps=[('model', LassoCV())])

# Stack the base models under the meta-learner and fit on the training split.
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=final_pipeline,
    n_jobs=-1,
)
reg.fit(X_train, y_train)


In [None]:
# R^2 of the stacked model on the held-out split.
ensemble_predictions = reg.predict(X_test)
score = r2_score(y_test, ensemble_predictions)
print(score)

As one can see, the ensemble model performs better than the individual models and was found to be more robust too.
Before predicting on test data, the model should be trained on the whole available data. Then the test data must be processed in the same way.