In [188]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.exceptions import ConvergenceWarning
from sklearn import model_selection, preprocessing, metrics
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import ElasticNetCV, LassoLarsCV, OrthogonalMatchingPursuitCV
from sklearn.ensemble import GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import time
import xgboost as xgb
import csv
import pickle
import warnings
# Ignore ConvergenceWarning (imported from sklearn.exceptions above).
warnings.filterwarnings("ignore", category=ConvergenceWarning)

# `sns` was used here without seaborn ever being imported, so this cell raised
# a NameError on a fresh kernel. Import it before its first use.
import seaborn as sns

color = sns.color_palette()

In [175]:
# Base directory of the data set and the sub-directory holding the raw CSVs.
DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data'
RAW_DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/raw_data'

# Input files, all located under RAW_DATA_PATH.
TRAIN_DATA, TEST_DATA, SAMPLE_SUBMISSION = (
    os.path.join(RAW_DATA_PATH, file_name)
    for file_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Output directories, both located under DATA_PATH.
SUBMISSION_PATH, MODELS_PATH = (
    os.path.join(DATA_PATH, dir_name)
    for dir_name in ('submissions', 'models')
)

In [176]:
# Read the train, test and sample-submission CSV files into DataFrames.
train_df, test_df, sample_submission_df = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_DATA, TEST_DATA, SAMPLE_SUBMISSION)
)

In [177]:
# Preprocess data

# Candidate feature columns in DataFrame order: everything except 'ID' and the
# target 'y'. The original loop iterated over `train_columns` before that name
# was assigned, so this cell failed with a NameError on a fresh kernel.
feature_columns = [c for c in train_df.columns if c not in ('ID', 'y')]

# Features with a single distinct value in the training set are dropped from
# both frames so that train and test keep the same feature columns.
constant_columns = [c for c in feature_columns
                    if len(np.unique(train_df[c])) == 1]
train_df = train_df.drop(constant_columns, axis=1)
test_df = test_df.drop(constant_columns, axis=1)

# Built with a comprehension instead of list(set(...)): set iteration order is
# not stable between kernel sessions, which made the feature order (and with
# it any model that subsamples features) irreproducible.
train_columns = [c for c in feature_columns if c not in constant_columns]

# Integer-encode the categorical columns. The encoder is fitted on the union
# of train and test values so that transforming either frame never meets an
# unseen label.
for f in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    lbl = preprocessing.LabelEncoder()
    train_values = set(train_df[f].values)
    test_values = set(test_df[f].values)
    all_values = list(train_values | test_values)
    lbl.fit(all_values)
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))

print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

train_df.shape (4209, 378)
test_df.shape (4209, 377)


In [178]:
n_comp = 12

# Fit every decomposition on one fixed set of input columns. Columns produced
# by a previous run of this cell are excluded, so re-running the cell no longer
# feeds earlier components back into the decompositions. Selecting the same
# column list from both frames also guarantees train and test are aligned.
component_prefixes = ('pca_', 'ica_', 'tsvd_', 'grp_', 'srp_')
base_columns = [c for c in train_df.columns
                if c != 'y' and not c.startswith(component_prefixes)]
# NOTE(review): as with the original `train_df.drop(["y"], axis=1)`, the `ID`
# column is still part of the decomposition input - confirm this is intended.
train_base = train_df[base_columns]
test_base = test_df[base_columns]

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_base)
tsvd_results_test = tsvd.transform(test_base)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_base)
pca2_results_test = pca.transform(test_base)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_base)
ica2_results_test = ica.transform(test_base)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_base)
grp_results_test = grp.transform(test_base)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_base)
srp_results_test = srp.transform(test_base)

# Append decomposition components to datasets.
# The order (pca, ica, tsvd, grp, srp for each component index) matches the
# original cell, so the resulting column order is unchanged.
decomposed = (
    ('pca', pca2_results_train, pca2_results_test),
    ('ica', ica2_results_train, ica2_results_test),
    ('tsvd', tsvd_results_train, tsvd_results_test),
    ('grp', grp_results_train, grp_results_test),
    ('srp', srp_results_train, srp_results_test),
)
for i in range(n_comp):
    for prefix, train_components, test_components in decomposed:
        column_name = prefix + '_' + str(i)
        train_df[column_name] = train_components[:, i]
        test_df[column_name] = test_components[:, i]

print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

train_df.shape (4209, 438)
test_df.shape (4209, 437)


In [190]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    """Transformer that prepends a wrapped estimator's output to the features.

    `transform` returns the input matrix with the estimator's prediction as
    the first column. For classifiers that expose `predict_proba`, the class
    probabilities are inserted between the prediction and the original
    features.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        """Fit the wrapped estimator on (X, y) and return this transformer."""
        self.estimator.fit(X, y, **fit_params)
        return self

    def transform(self, X):
        """Return X augmented with the wrapped estimator's predictions.

        Column layout of the result: [prediction | probabilities | X], where
        the probabilities block is only present for classifiers that have a
        `predict_proba` method.
        """
        X = check_array(X)
        wrapped = self.estimator

        # Add class probabilities as synthetic features (classifiers only).
        has_probabilities = (isinstance(wrapped, ClassifierMixin)
                             and hasattr(wrapped, 'predict_proba'))
        if has_probabilities:
            features = np.hstack((wrapped.predict_proba(X), X))
        else:
            features = np.copy(X)

        # Add the prediction itself as the leading synthetic feature.
        prediction_column = np.reshape(wrapped.predict(X), (-1, 1))
        return np.hstack((prediction_column, features))

# K-fold cross-validation of the stacked pipeline. One fitted pipeline per fold
# is pickled to MODELS_PATH as "<model_id>.stacked".
NUM_FOLDS = 10
kf = model_selection.KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}
# Column 0 holds the train R^2 and column 1 the validation R^2 of each fold.
# Zero-initialised: np.ndarray(shape=...) hands back uninitialised memory.
r2_scores = np.zeros(shape=(NUM_FOLDS, 2), dtype=np.float32)
model_ids = []
# The XGBoost training cell rebinds `model_ids` to a new list, so keep a
# dedicated reference to the ids of the pipelines trained here.
stacked_model_ids = model_ids

Y = train_df['y'].values
X = train_df[train_columns]
Y_mean = np.mean(Y)

print('X.shape', X.shape)
print('Y.shape', Y.shape)

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X_stacked, val_X_stacked = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y_stacked, val_Y_stacked = Y[train_idxs], Y[val_idxs]

    # Model ids are derived from the wall clock at one-second resolution.
    model_id = "model-" + str(int(time.time()))
    model_ids.append(model_id)
    print('Training', model_id)    
    
    stacked_pipeline = make_pipeline(
        StackingEstimator(estimator=LassoLarsCV(normalize=True)),
        # random_state added: with subsample < 1 and max_features < 1 the
        # boosting stage was otherwise different on every run.
        StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", n_estimators=500, max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.9, random_state=42)),
        #ExtraTreesRegressor(n_estimators=500, max_features=0.55, max_depth=3, min_samples_leaf=18, min_samples_split=14, n_jobs=-1),
        LassoLarsCV())

    stacked_pipeline.fit(train_X_stacked, train_Y_stacked)
    # Use a context manager so the file is flushed and closed before the next
    # fold (the original left the handle open).
    with open(os.path.join(MODELS_PATH, model_id + ".stacked"), "wb") as model_file:
        pickle.dump(stacked_pipeline, model_file)
    train_Y_stacked_pred = stacked_pipeline.predict(train_X_stacked)
    train_r2_score = metrics.r2_score(train_Y_stacked, train_Y_stacked_pred)
    val_Y_stacked_pred = stacked_pipeline.predict(val_X_stacked)
    val_r2_score = metrics.r2_score(val_Y_stacked, val_Y_stacked_pred)

    r2_scores[fold, 0] = train_r2_score
    r2_scores[fold, 1] = val_r2_score
    print('  train r2', train_r2_score, 'val r2', val_r2_score)
    print('  Saved', model_id)
    
print('train r2', np.mean(r2_scores[:, 0]),
      'val r2', np.mean(r2_scores[:, 1]))

X.shape (4209, 376)
Y.shape (4209,)
Fold 0
Training model-1497972393


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.57218156037 val r2 0.632454676266
  Saved model-1497972393
Fold 1
Training model-1497972404


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.578769953073 val r2 0.565390879065
  Saved model-1497972404
Fold 2
Training model-1497972416


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.608849804974 val r2 0.343535825287
  Saved model-1497972416
Fold 3
Training model-1497972427


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.577769593788 val r2 0.586632366997
  Saved model-1497972427
Fold 4
Training model-1497972438


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.578320073321 val r2 0.596821003434
  Saved model-1497972438
Fold 5
Training model-1497972449


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.574669294919 val r2 0.607040216929
  Saved model-1497972449
Fold 6
Training model-1497972460


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.572300491893 val r2 0.645721620161
  Saved model-1497972460
Fold 7
Training model-1497972472


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.575280559157 val r2 0.601266582266
  Saved model-1497972472
Fold 8
Training model-1497972483


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.573303090482 val r2 0.63519611272
  Saved model-1497972483
Fold 9
Training model-1497972494


  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))


  train r2 0.578229764318 val r2 0.570065756566
  Saved model-1497972494
train r2 0.578967 val r2 0.578413


In [170]:
def xgb_r2_score(preds, dtrain):
    """Return a (name, value) pair holding the negated R^2 of `preds`.

    `dtrain` must provide `get_label()`. This helper is not passed to
    `model.fit` below, which evaluates with eval_metric='rmse'.
    NOTE(review): the reported name is 'rmse' although the value is -R^2.
    """
    labels = dtrain.get_label()
    return 'rmse', -1.0 * metrics.r2_score(labels, preds)


# K-fold cross-validation of the XGBoost regressor. One fitted model per fold
# is pickled to MODELS_PATH as "<model_id>.xgb".
NUM_FOLDS = 10
kf = model_selection.KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}
# Column 0 holds the train R^2 and column 1 the validation R^2 of each fold.
# Zero-initialised: np.ndarray(shape=...) hands back uninitialised memory.
r2_scores = np.zeros(shape=(NUM_FOLDS, 2), dtype=np.float32)
model_ids = []
# The stacking training cell rebinds `model_ids` as well, so keep a dedicated
# reference to the ids of the models trained here.
xgb_model_ids = model_ids

Y = train_df['y'].values
X = train_df.drop(["ID", "y"], axis=1)
Y_mean = np.mean(Y)

print('X.shape', X.shape)
print('Y.shape', Y.shape)

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y, val_Y = Y[train_idxs], Y[val_idxs]

    # Model ids are derived from the wall clock at one-second resolution.
    model_id = "model-" + str(int(time.time()))
    model_ids.append(model_id)
    print('Training', model_id)

    model = xgb.XGBRegressor(max_depth = 10,
                        gamma=0.5,
                        objective="reg:linear",
                        n_estimators=1000,
                        learning_rate=0.005,
                        nthread=12,
                        subsample=0.8,
                        colsample_bytree=0.70,
                        colsample_bylevel=0.70,
                        #base_score=Y_mean,
                        seed=42,
                        silent=True)

    model.fit(train_X, train_Y, eval_set=[(train_X, train_Y), (val_X, val_Y)], verbose=False, eval_metric='rmse', early_stopping_rounds=50)
    evals_result = model.evals_result()
    # Use a context manager so the file is flushed and closed before the next
    # fold (the original left the handle open).
    with open(os.path.join(MODELS_PATH, model_id + ".xgb"), "wb") as model_file:
        pickle.dump(model, model_file)
    train_Y_pred = model.predict(train_X)
    train_r2_score = metrics.r2_score(train_Y, train_Y_pred)
    val_Y_pred = model.predict(val_X)
    val_r2_score = metrics.r2_score(val_Y, val_Y_pred)
    r2_scores[fold, 0] = train_r2_score
    r2_scores[fold, 1] = val_r2_score
    print('  train r2', train_r2_score, 'val r2', val_r2_score)
    print('  Saved', model_id)
    
print('train r2', np.mean(r2_scores[:, 0]),
      'val r2', np.mean(r2_scores[:, 1]))

X.shape (4209, 424)
Y.shape (4209,)
Fold 0
Training model-1497928853
  train r2 0.834918220471 val r2 0.587470460682
  Saved model-1497928853
Fold 1
Training model-1497928859
  train r2 0.831633303508 val r2 0.53356097117
  Saved model-1497928859
Fold 2
Training model-1497928865
  train r2 0.858805314039 val r2 0.317516438311
  Saved model-1497928865
Fold 3
Training model-1497928872
  train r2 0.841224381545 val r2 0.553905277812
  Saved model-1497928872
Fold 4
Training model-1497928878
  train r2 0.836197535809 val r2 0.588333206693
  Saved model-1497928878
Fold 5
Training model-1497928883
  train r2 0.842227217629 val r2 0.609209220178
  Saved model-1497928883
Fold 6
Training model-1497928889
  train r2 0.837264033881 val r2 0.624121124766
  Saved model-1497928889
Fold 7
Training model-1497928895
  train r2 0.838075084721 val r2 0.594774446669
  Saved model-1497928895
Fold 8
Training model-1497928900
  train r2 0.835875943723 val r2 0.612757486423
  Saved model-1497928900
Fold 9
Trai

In [171]:
# Avg xgb models predictions

# Folds whose models take part in the average.
# NOTE(review): fold 2 is left out, presumably because of its low validation
# R^2 - confirm.
folds_used = [0, 1, 3, 4, 5, 6, 7, 8, 9]

# `model_ids` is rebound by both training cells, so it holds the ids of
# whichever one ran last. Prefer a dedicated `xgb_model_ids` list when the
# XGBoost training cell provides one.
fold_model_ids = globals().get('xgb_model_ids', model_ids)

# The feature matrix is the same for every fold, so build it once.
test_X = test_df.drop(['ID'], axis=1)
# One column of predictions per averaged fold (sized from the fold list rather
# than a hard-coded 9).
test_Y_pred = np.zeros(shape=(test_df.shape[0], len(folds_used)), dtype=np.float32)

for pred_count, fold in enumerate(folds_used):
    model_id = fold_model_ids[fold]
    # Only unpickle files written by this notebook: pickle.load can execute
    # arbitrary code from an untrusted file.
    with open(os.path.join(MODELS_PATH, model_id + '.xgb'), 'rb') as model_file:
        model = pickle.load(model_file)
    test_Y_pred[:, pred_count] = model.predict(test_X)

test_Y_avg = np.mean(test_Y_pred, axis=1)
test_Y_avg.shape

(4209,)

In [169]:
# Avg stacked models predictions

# Folds whose models take part in the average.
# NOTE(review): fold 2 is left out, presumably because of its low validation
# R^2 - confirm.
folds_used = [0, 1, 3, 4, 5, 6, 7, 8, 9]

# `model_ids` is rebound by the XGBoost training cell, which sits between the
# stacking cell and this one, so it may no longer name any ".stacked" file.
# Prefer a dedicated `stacked_model_ids` list when the stacking cell provides
# one.
fold_model_ids = globals().get('stacked_model_ids', model_ids)

test_X = test_df[train_columns]
# One column of predictions per averaged fold (sized from the fold list rather
# than a hard-coded 9).
test_Y_pred = np.zeros(shape=(test_df.shape[0], len(folds_used)), dtype=np.float32)

for pred_count, fold in enumerate(folds_used):
    model_id = fold_model_ids[fold]
    # Only unpickle files written by this notebook: pickle.load can execute
    # arbitrary code from an untrusted file.
    with open(os.path.join(MODELS_PATH, model_id + '.stacked'), 'rb') as model_file:
        model = pickle.load(model_file)
    test_Y_pred[:, pred_count] = model.predict(test_X)

test_Y_avg_stacked = np.mean(test_Y_pred, axis=1)
test_Y_avg_stacked.shape

(4209,)

In [127]:
# Display the current model ids as one comma-separated string.
str.join(", ", model_ids)

'model-1497917928, model-1497917949, model-1497917969, model-1497917990, model-1497918010, model-1497918031, model-1497918052, model-1497918072, model-1497918093, model-1497918114'

In [172]:
# Weighted average of predictions:
# 75% averaged XGBoost predictions, 25% averaged stacked-pipeline predictions.
test_Y = 0.75 * test_Y_avg + 0.25 * test_Y_avg_stacked
test_Y.shape

(4209,)

In [173]:
# Create submission file

# NOTE(review): the submission uses the stacked-only average; the blended
# `test_Y` computed in the "Weighted average of predictions" cell is never
# used. Switch to `test_Y` here if the blend is what should be submitted.
submission_predictions = test_Y_avg_stacked

# Copy the ID column so that adding `y` does not write into a slice of
# `test_df` (this is what raised the SettingWithCopyWarning).
submission_df = test_df[['ID']].copy()
submission_df['y'] = submission_predictions.tolist()

# Build the path once: the original called time.time() twice, so the printed
# path could differ from the file actually written.
submission_file = os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv')
submission_df.to_csv(submission_file, index=False)
print('Generated submission ', submission_file)

Generated submission  /kaggle/dev/mercedes-benz-greener-manufacturing-data/submissions/submission-1497928914.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
