In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import time, os
from sklearn.metrics import mean_squared_error

## DANE

In [2]:
# Load the training data: the feature matrix and the 'Expected' target column.
X = pd.read_csv('X_train.csv', header=0)
y_frame = pd.read_csv('y_train.csv', header=0)
y = y_frame['Expected']

In [3]:
# Hold out 10% of the training data; X_test / y_test serve as the validation set.
# A fixed random_state keeps the split (and every score computed on it)
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## BASELINE model

In [4]:
%%time
base = GradientBoostingRegressor()
base.fit(X_train,y_train)
mean_squared_error(y_test, base.predict(X_test))
# 0.31999768147616214

CPU times: user 3min 2s, sys: 16.9 ms, total: 3min 2s
Wall time: 3min 2s


0.3972373393951361

## Scaling data

In [5]:
%%time
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)
X = sc_X.transform(X)

CPU times: user 876 ms, sys: 852 ms, total: 1.73 s
Wall time: 1.74 s


In [6]:
# Disabled exploratory cell: if re-enabled it fits a PCA on the whole of X,
# projects X onto the components and shows the explained-variance ratio of
# each component as a percentage.
# %%time
# pca = PCA()
# pca.fit(X)
# pca_data = pca.transform(X)
# pca.explained_variance_ratio_ * 100

## GradientBoosting

In [None]:
%%time
#siatka parametrów do modelu
parameters = {
    'pca__n_components': [50, 100, 200, None],
    'gradientboosting__learning_rate': [0.05, 0.1],
    'gradientboosting__n_estimators': [100, 200, 500],
    'gradientboosting__subsample': [0.8, 1.0],
    'gradientboosting__max_depth': [3, 5],
    'gradientboosting__min_samples_split': [2, 4],
    'gradientboosting__min_samples_leaf': [1, 2],
    'gradientboosting__max_features': [1.0, 'sqrt'],
}
# model
pipeline = Pipeline([
    ('pca', PCA()),
    ('gradientboosting', GradientBoostingRegressor())
])

# cross walidacja
start = time.time()
reg = GridSearchCV(estimator=pipeline, param_grid=parameters, scoring='neg_root_mean_squared_error',
                   n_jobs=os.cpu_count(), verbose=4, cv=3)
reg.fit(X_train, y_train)

Fitting 3 folds for each of 576 candidates, totalling 1728 fits
[CV 1/3] END gradientboosting__learning_rate=0.05, gradientboosting__max_depth=3, gradientboosting__max_features=1.0, gradientboosting__min_samples_leaf=1, gradientboosting__min_samples_split=2, gradientboosting__n_estimators=100, gradientboosting__subsample=0.8, pca__n_components=100;, score=-0.604 total time=  17.6s
[CV 2/3] END gradientboosting__learning_rate=0.05, gradientboosting__max_depth=3, gradientboosting__max_features=1.0, gradientboosting__min_samples_leaf=1, gradientboosting__min_samples_split=2, gradientboosting__n_estimators=100, gradientboosting__subsample=0.8, pca__n_components=200;, score=-0.603 total time=  32.7s
[CV 2/3] END gradientboosting__learning_rate=0.05, gradientboosting__max_depth=3, gradientboosting__max_features=1.0, gradientboosting__min_samples_leaf=1, gradientboosting__min_samples_split=2, gradientboosting__n_estimators=100, gradientboosting__subsample=1.0, pca__n_components=100;, score=-0

In [None]:
# Validation score of the grid-search result on the held-out split.
# This is MSE (comparable with the baseline cell), whereas the search itself
# optimises negative RMSE.
y_pred_val = reg.predict(X_test)
mean_squared_error(y_test, y_pred_val)

In [12]:
# Load the competition features.
X_konkursowe_raw = pd.read_csv('X_test.csv', header=0)
# Scaled copy, for models that were trained on standardized features.
X_konkursowe = sc_X.transform(X_konkursowe_raw)
# `base` was fitted before the scaling cell, i.e. on raw features, so it has to
# predict on raw features too; feeding it the scaled matrix would evaluate its
# split thresholds on a different feature scale.
prediction = base.predict(X_konkursowe_raw)
# Write the submission file: an 'Id' index column plus the 'Expected' predictions.
pd.DataFrame(data=prediction, columns=['Expected']).to_csv('kaggle_predicion.csv', index_label='Id')