In [20]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load in

import numpy as np
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
#import xgboost as xgb
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import r2_score
from sklearn import manifold

In [30]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Meta-transformer that passes an estimator's predictions on as features.

    ``fit`` trains the wrapped estimator; ``transform`` returns the input
    matrix with the estimator's predictions (and, for classifiers exposing
    ``predict_proba``, the class probabilities) prepended as extra columns.

    Parameters
    ----------
    estimator : object
        Estimator providing ``fit`` and ``predict``.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on ``X`` / ``y`` and return ``self``."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return ``X`` with synthetic prediction columns stacked on the left.

        Column layout: ``[predict | predict_proba (classifiers only) | X]``.
        """
        X = check_array(X)
        model = self.estimator

        # Class probabilities are only added for classifiers that expose them.
        exposes_proba = isinstance(model, ClassifierMixin) and hasattr(model, 'predict_proba')
        if exposes_proba:
            features = np.hstack((model.predict_proba(X), X))
        else:
            features = np.copy(X)

        # The point prediction always becomes the first column.
        prediction_column = np.reshape(model.predict(X), (-1, 1))
        return np.hstack((prediction_column, features))


# Load the train and test CSV files (paths are relative to the working directory).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Integer-encode every object-typed column. Each encoder is fitted on the
# train and test values together so both frames share a single mapping.
object_columns = [name for name in train.columns if train[name].dtype == 'object']
for name in object_columns:
    train_values = list(train[name].values)
    test_values = list(test[name].values)
    encoder = LabelEncoder().fit(train_values + test_values)
    train[name] = encoder.transform(train_values)
    test[name] = encoder.transform(test_values)



# Shared settings for the feature-extraction steps below.
n_comp = 12
n_neighbors = 30

# Every transformer is fitted on the training features only (target removed);
# build that frame once instead of re-dropping "y" before each fit.
train_features = train.drop(["y"], axis=1)

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_features)
tsvd_results_test = tsvd.transform(test)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_features)
pca2_results_test = pca.transform(test)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_features)
ica2_results_test = ica.transform(test)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_features)
grp_results_test = grp.transform(test)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_features)
srp_results_test = srp.transform(test)

# Isomap embedding (3 components). n_neighbors is passed by keyword: the
# manifold constructors are keyword-only in current scikit-learn releases, so
# the positional form raises a TypeError there.
clf = manifold.Isomap(n_neighbors=n_neighbors, n_components=3)
X_iso = clf.fit_transform(train_features)
X_iso_t = clf.transform(test)

# Locally linear embedding (3 components), same keyword-only consideration.
clf = manifold.LocallyLinearEmbedding(n_neighbors=n_neighbors, n_components=3,
                                      method='standard')
X_lle = clf.fit_transform(train_features)
X_lle_t = clf.transform(test)

#clf = manifold.MDS(n_components=n_comp, n_init=1, max_iter=100)
#X_mds = clf.fit_transform(train.drop(["y"], axis=1))
#X_mds_t = clf.transform(test)

# Save the column list before adding the decomposition components.
# Iterating train.columns keeps the frame's own column order; a set difference
# would yield an arbitrary order that can differ from one kernel session to
# the next, which makes the downstream feature matrices non-reproducible.
usable_columns = [col for col in train.columns if col != 'y']

# (prefix, train matrix, test matrix) for every projection that has n_comp columns.
projections = (
    ('pca_', pca2_results_train, pca2_results_test),
    ('ica_', ica2_results_train, ica2_results_test),
    ('tsvd_', tsvd_results_train, tsvd_results_test),
    ('grp_', grp_results_train, grp_results_test),
    ('srp_', srp_results_train, srp_results_test),
)
# The manifold embeddings are only attached when n_comp < 4.
# NOTE(review): with n_comp = 12 this guard is never true, so the Isomap/LLE
# results are computed but unused. Confirm whether `i < 4` was intended.
manifold_embeddings = (
    ('iso_', X_iso, X_iso_t),
    ('lle_', X_lle, X_lle_t),
)
if n_comp < 4:
    active_blocks = projections + manifold_embeddings
else:
    active_blocks = projections

# Append decomposition components to datasets, one column per component.
for i in range(1, n_comp + 1):
    for prefix, train_block, test_block in active_blocks:
        train[prefix + str(i)] = train_block[:, i - 1]
        test[prefix + str(i)] = test_block[:, i - 1]

In [54]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
# MLPRegressor / RandomForestRegressor must be imported here: this cell uses
# them, and a fresh top-to-bottom run would otherwise fail with a NameError.
from sklearn.neural_network import MLPRegressor

# Shuffle the rows with a fixed seed so the cross-validation folds (which
# depend on row order) are the same on every fresh run.
train = train.sample(frac=1.0, random_state=420)
y_train = train['y'].values
X_train_cv = train.drop('y', axis=1)

# Gradient-boosting settings, reused by later cells. The 'loss' entry is left
# out on purpose: least squares is the default, and the old 'ls' alias is
# rejected by current scikit-learn releases.
params = {'n_estimators': 600, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01}

# Score each candidate model with cross-validated R2, one array per model.
candidates = [
    MLPRegressor(hidden_layer_sizes=(50,), activation="relu"),
    GradientBoostingRegressor(**params),
    RandomForestRegressor(),
    LassoLarsCV(),
]
for clf in candidates:
    print(cross_val_score(clf, X_train_cv, y_train, scoring="r2", n_jobs=3))

[ 0.33140762  0.21845297  0.07507684]
[ 0.57778623  0.49634811  0.57653935]
[ 0.47880328  0.40876998  0.49613045]
[ 0.57069378  0.48050593  0.55661164]


In [55]:

# Keep the test IDs for the submission file.
id_test = test['ID'].values

# finaltrainset / finaltestset hold only the columns listed in usable_columns
# (they do not contain the PCA, SVD, ... component columns).
finaltrainset = train[usable_columns].values
finaltestset = test[usable_columns].values

# Fit a gradient-boosting model on those columns and predict the test set.
clf = GradientBoostingRegressor(**params).fit(finaltrainset, y_train)
y_pred = clf.predict(finaltestset)

# Write the predictions as a two-column (ID, y) CSV without the index.
sub = pd.DataFrame({'ID': id_test, 'y': y_pred})
sub.to_csv('submission.csv', index=False)

## Currently not working very well

In [48]:
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor

# Two-stage stack: the gradient-boosting prediction is prepended to the
# features by StackingEstimator, then LassoLarsCV is fitted on the result.
boosting_stage = StackingEstimator(estimator=GradientBoostingRegressor(**params))
stacked_pipeline = make_pipeline(boosting_stage, LassoLarsCV())

stacked_pipeline.fit(finaltrainset, y_train)
y_pred = stacked_pipeline.predict(finaltestset)

# Assemble the (ID, y) submission frame; it is written to disk in a later cell.
sub = pd.DataFrame({'ID': id_test, 'y': y_pred})

  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))
  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))


In [49]:
# Write the current `sub` frame to submission.csv without the index column.
sub.to_csv(path_or_buf='submission.csv', index=False)

# Not tried yet

In [None]:
#usable_columns = list(set(train.columns) - set(['y']))
from sklearn.ensemble import GradientBoostingRegressor

# Test IDs for the submission, the target vector, and the target mean.
id_test = test['ID'].values
y_train = train['y'].values
y_mean = y_train.mean()

# finaltrainset and finaltestset are used only by the stacked model; they are
# restricted to usable_columns and so do not contain the PCA, SVD, ... arrays.
finaltrainset, finaltestset = (frame[usable_columns].values for frame in (train, test))


# Train the xgb model then predict the test data.
# The imports live in this cell because nothing earlier in the notebook brings
# `xgb` or `mean_squared_error` into scope (the top-level xgboost import is
# commented out), so the cell would otherwise stop with a NameError.
import xgboost as xgb
from sklearn.metrics import mean_squared_error

# NOTE(review): 'n_trees' does not look like a documented XGBoost parameter;
# the number of boosting rounds is set by num_boost_round below. Confirm.
xgb_params = {
    'n_trees': 520,
    'eta': 0.0045,
    'max_depth': 4,
    'subsample': 0.93,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'base_score': y_mean, # base prediction = mean(target)
    'silent': 1
}
# NOTE: Make sure that the class is labeled 'class' in the data file

# Training features are every column except the target.
X_train_full = train.drop('y', axis=1)
dtrain = xgb.DMatrix(X_train_full, y_train)
dtest = xgb.DMatrix(test)

num_boost_rounds = 1250
# train model (silent is overridden to 0 for this call)
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_rounds)

# Reference gradient-boosting model. 'loss' is omitted because least squares is
# the default and the old 'ls' alias is rejected by current scikit-learn.
params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
          'learning_rate': 0.01}
clf = GradientBoostingRegressor(**params)

clf.fit(X_train_full, y_train)
# No held-out split exists in this notebook (y_test / X_test were never
# defined), so the error reported here is the in-sample (training) MSE.
mse = mean_squared_error(y_train, clf.predict(X_train_full))
print("MSE: %.4f" % mse)

y_pred = model.predict(dtest)

# Train the stacked models then predict the test data.
# Stage 1 prepends a LassoLarsCV prediction, stage 2 prepends a
# gradient-boosting prediction, and a final LassoLarsCV is fitted on the result.
# NOTE(review): LassoLarsCV's `normalize` argument appears to have been removed
# from recent scikit-learn releases; confirm against the installed version.
lasso_stage = StackingEstimator(estimator=LassoLarsCV(normalize=True))
boosting_stage = StackingEstimator(
    estimator=GradientBoostingRegressor(
        learning_rate=0.001,
        loss="huber",
        max_depth=3,
        max_features=0.55,
        min_samples_leaf=18,
        min_samples_split=14,
        subsample=0.7,
    )
)
stacked_pipeline = make_pipeline(lasso_stage, boosting_stage, LassoLarsCV())

stacked_pipeline.fit(finaltrainset, y_train)
results = stacked_pipeline.predict(finaltestset)

# Blend weights for the two models. The same pair is used for the train-set
# diagnostic and for the submission, so the printed R2 describes exactly the
# blend that is written to disk.
XGB_WEIGHT = 0.75
STACK_WEIGHT = 0.25

# R2 score on the entire train data when averaging (in-sample diagnostic).
print('R2 score on train data:')
train_blend = model.predict(dtrain) * XGB_WEIGHT + stacked_pipeline.predict(finaltrainset) * STACK_WEIGHT
print(r2_score(y_train, train_blend))

# Average the predictions of both models on the test data, then save to CSV.
sub = pd.DataFrame()
sub['ID'] = id_test
sub['y'] = y_pred * XGB_WEIGHT + results * STACK_WEIGHT
sub.to_csv('stacked-models.csv', index=False)


# Any results you write to the current directory are saved as output.