In [1]:
from functools import partial

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest, GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import normalize, MinMaxScaler, StandardScaler
from sklearn.impute import SimpleImputer, KNNImputer
import sklearn.metrics as skm
from sklearn.feature_selection import VarianceThreshold, RFECV, mutual_info_regression, SelectPercentile, SelectKBest, f_regression, RFE, SelectFromModel
from sklearn.svm import SVR
from sklearn.decomposition import PCA, KernelPCA
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor,ExtraTreesRegressor
from sklearn.neighbors import LocalOutlierFactor
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

In [118]:
# .585
X = pd.read_csv("X_train.csv").drop("id", axis=1)
y = pd.read_csv("y_train.csv").drop("id", axis=1)
pipe_pre = Pipeline([
    ('s1', SimpleImputer(strategy='median')),
    ('s2', MinMaxScaler()),
    ('s4', SelectKBest(score_func=f_regression)),
    ('s5', LGBMRegressor(n_jobs=1))
])
grid = {
    's4__k': [100],
    's5__num_leaves': [24],
    's5__subsample_for_bin': [80],
    's5__min_child_samples': [1],
    's5__max_depth': [13],
    's5__colsample_bytree': [0.45],
    's5__n_estimators': [500],
    's5__learning_rate': [0.1],
}
estimator = GridSearchCV(pipe_pre, grid, cv=5, n_jobs=16, scoring="r2").fit(X, y)

In [None]:
#this one
final = pipe_pre = Pipeline([
    ('s1', SimpleImputer(strategy='median')),
    ('s2', MinMaxScaler()),
    ('s4', SelectKBest(score_func=f_regression, k=100)),
    ('s5', LGBMRegressor(n_jobs=1, min_child_samples=1, max_depth=13, colsample_bytree=0.45, n_estimators=500, learning_rate=0.1, num_leaves=24, subsample_for_bin=80))
])
X_test = pd.read_csv("X_test.csv").drop("id", axis=1)
final = final.fit(X, y)
pd.DataFrame(final.predict(X_test)).to_csv("rofl.csv", index_label='id', header=['y'])

In [None]:
#Not this one, experimenting
X = pd.read_csv("X_train.csv").drop("id", axis=1)
y = pd.read_csv("y_train.csv").drop("id", axis=1)
inner_estim = ExtraTreesRegressor(max_depth=15, min_samples_leaf=4, min_samples_split=8, n_estimators=150)
pipe_pre = Pipeline([
    ('s1', SimpleImputer(strategy='median')),
    ('s2', MinMaxScaler()),
    ('s4', SelectFromModel(inner_estim, threshold=-np.inf)),
    ('s5', LGBMRegressor(n_jobs=1))
])
grid = {
    's4__max_features': [60,70,80],
    's5__num_leaves': [23,24,25],
    's5__subsample_for_bin': [80],
    's5__min_child_samples': [1,2],
    's5__max_depth': [11,13,14],
    's5__colsample_bytree': [0.3,0.45,0.5],
    's5__n_estimators': [500],
    's5__learning_rate': [0.1],
}
estimator = GridSearchCV(pipe_pre, grid, cv=5, n_jobs=8, scoring="r2").fit(X, y)

In [416]:

#   best score .56 with tuning
#   variancethreshold useless
#   IterativeImputer useless
X = pd.read_csv("X_train.csv").drop("id", axis=1)
y = pd.read_csv("y_train.csv").drop("id", axis=1)
pipe_pre = Pipeline([
    ('s1', SimpleImputer(strategy='median')),
    ('s2', MinMaxScaler()),
    ('s4', SelectKBest(score_func=mutual_info_regression)),
    ('s5', GradientBoostingRegressor(n_estimators=300))
])
grid = {
    's4__k': [50],
    's5__max_depth': [11],
    's5__min_samples_split': [12],
    's5__min_samples_leaf': [7]
}
estimator = GridSearchCV(pipe_pre, grid, cv=5, n_jobs=8, scoring="r2").fit(X, y)