In [72]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing, metrics
from sklearn.decomposition import PCA, FastICA
from sklearn.decomposition import TruncatedSVD
from sklearn.random_projection import GaussianRandomProjection
from sklearn.random_projection import SparseRandomProjection
from sklearn.base import BaseEstimator,TransformerMixin, ClassifierMixin
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.utils import check_array
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
import time
import xgboost as xgb
import csv
import pickle
color = sns.color_palette()

In [73]:
RAW_DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/raw_data'
DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data'
TRAIN_DATA = os.path.join(RAW_DATA_PATH, 'train.csv')
TEST_DATA = os.path.join(RAW_DATA_PATH, 'test.csv')
SAMPLE_SUBMISSION = os.path.join(RAW_DATA_PATH, 'sample_submission.csv')
SUBMISSION_PATH = os.path.join(DATA_PATH, 'submissions')
MODELS_PATH = os.path.join(DATA_PATH, 'models')

In [74]:
train_df = pd.read_csv(TRAIN_DATA)
test_df = pd.read_csv(TEST_DATA)
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION)

In [75]:
# Preprocess data
usable_columns = list(set(train_df.columns) - set(['ID', 'y']))

for column in usable_columns:
    cardinality = len(np.unique(train_df[column]))
    if cardinality == 1:
        train_df.drop(column, axis=1, inplace=True)
        test_df.drop(column, axis=1, inplace=True)

for f in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    lbl = preprocessing.LabelEncoder()
    train_values = set(train_df[f].values)
    test_values = set(test_df[f].values)
    all_values = list(train_values | test_values)
    lbl.fit(all_values) 
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))
    
print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

train_df.shape (4209, 366)
test_df.shape (4209, 365)


In [61]:
n_comp = 12

# tSVD
tsvd = TruncatedSVD(n_components=n_comp, random_state=420)
tsvd_results_train = tsvd.fit_transform(train_df.drop(["y"], axis=1))
tsvd_results_test = tsvd.transform(test_df)

# PCA
pca = PCA(n_components=n_comp, random_state=420)
pca2_results_train = pca.fit_transform(train_df.drop(["y"], axis=1))
pca2_results_test = pca.transform(test_df)

# ICA
ica = FastICA(n_components=n_comp, random_state=420)
ica2_results_train = ica.fit_transform(train_df.drop(["y"], axis=1))
ica2_results_test = ica.transform(test_df)

# GRP
grp = GaussianRandomProjection(n_components=n_comp, eps=0.1, random_state=420)
grp_results_train = grp.fit_transform(train_df.drop(["y"], axis=1))
grp_results_test = grp.transform(test_df)

# SRP
srp = SparseRandomProjection(n_components=n_comp, dense_output=True, random_state=420)
srp_results_train = srp.fit_transform(train_df.drop(["y"], axis=1))
srp_results_test = srp.transform(test_df)

# Append decomposition components to datasets
for i in range(0, n_comp):
    train_df['pca_' + str(i)] = pca2_results_train[:, i]
    test_df['pca_' + str(i)] = pca2_results_test[:, i]

    train_df['ica_' + str(i)] = ica2_results_train[:, i]
    test_df['ica_' + str(i)] = ica2_results_test[:, i]

    train_df['tsvd_' + str(i)] = tsvd_results_train[:, i]
    test_df['tsvd_' + str(i)] = tsvd_results_test[:, i]

    train_df['grp_' + str(i)] = grp_results_train[:, i]
    test_df['grp_' + str(i)] = grp_results_test[:, i]

    train_df['srp_' + str(i)] = srp_results_train[:, i]
    test_df['srp_' + str(i)] = srp_results_test[:, i]

print('train_df.shape', train_df.shape)
print('test_df.shape', test_df.shape)

train_df.shape (4209, 426)
test_df.shape (4209, 425)


In [80]:
class StackingEstimator(BaseEstimator, TransformerMixin):
    
    def __init__(self, estimator):
        self.estimator = estimator

    def fit(self, X, y=None, **fit_params):
        self.estimator.fit(X, y, **fit_params)
        return self
    def transform(self, X):
        X = check_array(X)
        X_transformed = np.copy(X)
        # add class probabilities as a synthetic feature
        if issubclass(self.estimator.__class__, ClassifierMixin) and hasattr(self.estimator, 'predict_proba'):
            X_transformed = np.hstack((self.estimator.predict_proba(X), X))

        # add class prodiction as a synthetic feature
        X_transformed = np.hstack((np.reshape(self.estimator.predict(X), (-1, 1)), X_transformed))

        return X_transformed
    
Y = train_df['y'].values
X = train_df.drop(["ID", "y"], axis=1)
test_X = test_df.drop(['ID'], axis=1)

stacked_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StackingEstimator(estimator=GradientBoostingRegressor(learning_rate=0.001, loss="huber", max_depth=3, max_features=0.55, min_samples_leaf=18, min_samples_split=14, subsample=0.7)),
    LassoLarsCV())

stacked_pipeline.fit(X, Y)
stacked_Y = stacked_pipeline.predict(test_X)
stacked_Y

  g1 = arrayfuncs.min_pos((C - Cov) / (AA - corr_eq_dir + tiny))
  g2 = arrayfuncs.min_pos((C + Cov) / (AA + corr_eq_dir + tiny))




array([  79.3508126 ,   94.28268476,   79.0642661 , ...,   94.24218367,
        111.95202541,   94.27743469])

In [77]:
NUM_FOLDS = 10
kf = model_selection.KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}
r2_scores = np.ndarray(shape=[NUM_FOLDS, 2], dtype=np.float32)
model_ids = []

Y = train_df['y'].values
X = train_df.drop(["ID", "y"], axis=1)
Y_mean = np.mean(Y)

print('X.shape', X.shape)
print('Y.shape', Y.shape)

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y, val_Y = Y[train_idxs], Y[val_idxs]

    def xgb_r2_score(preds, dtrain):
        labels = dtrain.get_label()
        return 'rmse', -1.0 * metrics.r2_score(labels, preds)

    model_id = "model-" + str(int(time.time()))
    model_ids.append(model_id)
    print('Training', model_id)

    model = xgb.XGBRegressor(max_depth = 10,
                        gamma=0.5,
                        objective="reg:linear",
                        n_estimators=1000,
                        learning_rate=0.005,
                        nthread=12,
                        subsample=0.8,
                        colsample_bytree=0.70,
                        colsample_bylevel=0.70,
                        #base_score=Y_mean,
                        seed=42,
                        silent=True)

    model.fit(train_X, train_Y, eval_set=[(train_X, train_Y), (val_X, val_Y)], verbose=False, eval_metric='rmse', early_stopping_rounds=50)
    evals_result = model.evals_result()
    #pickle.dump(model, open(os.path.join(MODELS_PATH, model_id + "-evals-result.pk"), "wb"))
    pickle.dump(model, open(os.path.join(MODELS_PATH, model_id + ".xgb"), "wb"))
    #best_val_error_idx = np.argmin(evals_result['validation_1']['rmse'])
    #best_val_error = evals_result['validation_1']['rmse'][best_val_error_idx]
    #best_train_error = evals_result['validation_0']['rmse'][best_val_error_idx]
    #best_errors[model_id] = (best_train_error, best_val_error)
    #print('  Best train err', best_train_error, 'val err', best_val_error)
    
    train_Y_pred = model.predict(train_X)
    train_r2_score = metrics.r2_score(train_Y, train_Y_pred)

    val_Y_pred = model.predict(val_X)
    val_r2_score = metrics.r2_score(val_Y, val_Y_pred)
    r2_scores[fold, 0] = train_r2_score
    r2_scores[fold, 1] = val_r2_score
    print('  train r2', train_r2_score, 'val r2', val_r2_score)
    print('  Saved', model_id)
    
# min_error_model_id = None
# min_val_error = 1e15
# sum_train_error = 0.0
# sum_val_error = 0.0
# for key, val in best_errors.items():
#     sum_train_error += val[0]
#     sum_val_error += val[1]
#     if val[1] < min_val_error:
#         min_val_error = val[1]
#         min_error_model_id = key

print(#'Avg model', min_error_model_id,
      #'train err', sum_train_error/5.0,
      #'val err', sum_val_error/5.0,
      'train r2', np.mean(r2_scores[:, 0]),
      'val r2', np.mean(r2_scores[:, 1]))
# print('Best model', min_error_model_id,
#       'train err', best_errors[min_error_model_id][0],
#       'val err', best_errors[min_error_model_id][1])

X.shape (4209, 364)
Y.shape (4209,)
Fold 0
Training model-1497903797
  train r2 0.76487599465 val r2 0.59014905394
  Saved model-1497903797
Fold 1
Training model-1497903801
  train r2 0.765837247672 val r2 0.557062811841
  Saved model-1497903801
Fold 2
Training model-1497903805
  train r2 0.783658136477 val r2 0.319068676081
  Saved model-1497903805
Fold 3
Training model-1497903809
  train r2 0.75962719781 val r2 0.558103926927
  Saved model-1497903809
Fold 4
Training model-1497903813
  train r2 0.759906895093 val r2 0.595836607858
  Saved model-1497903813
Fold 5
Training model-1497903818
  train r2 0.765403682035 val r2 0.6078119607
  Saved model-1497903818
Fold 6
Training model-1497903822
  train r2 0.757555020526 val r2 0.6250280859
  Saved model-1497903822
Fold 7
Training model-1497903826
  train r2 0.767562517384 val r2 0.580738767038
  Saved model-1497903826
Fold 8
Training model-1497903831
  train r2 0.765616775204 val r2 0.611494164006
  Saved model-1497903831
Fold 9
Training m

In [78]:
test_Y_pred = np.zeros(shape=(test_df.shape[0], 9), dtype=np.float32)

pred_count = 0
for fold in [0, 1, 3, 4, 5, 6, 7, 8, 9]:
    model_id = model_ids[fold]
    model = pickle.load(open(os.path.join(MODELS_PATH, model_id + '.xgb'), 'rb'))
    test_X = test_df.drop(['ID'], axis=1)
    test_Y_pred[:, pred_count] = model.predict(test_X)
    pred_count+=1

test_Y_avg = np.mean(test_Y_pred, axis=1)
test_Y_avg.shape

(4209,)

In [85]:
", ".join(model_ids)

'model-1497903797, model-1497903801, model-1497903805, model-1497903809, model-1497903813, model-1497903818, model-1497903822, model-1497903826, model-1497903831, model-1497903835'

In [82]:
test_Y = test_Y_avg * 0.5 + stacked_Y * 0.5
test_Y.shape

(4209,)

In [83]:
submission_df = test_df[['ID']]
submission_df['y'] = test_Y.tolist()
submission_df.to_csv(os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'), index=False)
print('Generated submission ', os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'))

Generated submission  /kaggle/dev/mercedes-benz-greener-manufacturing-data/submissions/submission-1497903881.csv


Exception ignored in: <bound method DMatrix.__del__ of <xgboost.core.DMatrix object at 0x7f320205a2b0>>
Traceback (most recent call last):
  File "/kaggle/dev/ashish/mercedes-benz-greener-manufacturing/merc/lib/python3.5/site-packages/xgboost/core.py", line 324, in __del__
    _check_call(_LIB.XGDMatrixFree(self.handle))
AttributeError: 'DMatrix' object has no attribute 'handle'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
