In [2]:
%matplotlib inline

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn import model_selection, preprocessing, metrics
import matplotlib.pyplot as plt
import glob
import re
import time
import seaborn as sns
import xgboost as xgb
import csv
import pickle
color = sns.color_palette()

In [3]:
RAW_DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data/raw_data'
DATA_PATH = '/kaggle/dev/mercedes-benz-greener-manufacturing-data'
TRAIN_DATA = os.path.join(RAW_DATA_PATH, 'train.csv')
TEST_DATA = os.path.join(RAW_DATA_PATH, 'test.csv')
SAMPLE_SUBMISSION = os.path.join(RAW_DATA_PATH, 'sample_submission.csv')
SUBMISSION_PATH = os.path.join(DATA_PATH, 'submissions')
MODELS_PATH = os.path.join(DATA_PATH, 'models')

In [4]:
train_df = pd.read_csv(TRAIN_DATA)
test_df = pd.read_csv(TEST_DATA)
sample_submission_df = pd.read_csv(SAMPLE_SUBMISSION)

In [5]:
# Preprocess data
usable_columns = list(set(train_df.columns) - set(['ID', 'y']))

for column in usable_columns:
    cardinality = len(np.unique(train_df[column]))
    if cardinality == 1:
        train_df.drop(column, axis=1, inplace=True)
        test_df.drop(column, axis=1, inplace=True)

for f in ["X0", "X1", "X2", "X3", "X4", "X5", "X6", "X8"]:
    lbl = preprocessing.LabelEncoder()
    train_values = set(train_df[f].values)
    test_values = set(test_df[f].values)
    all_values = list(train_values | test_values)
    lbl.fit(all_values) 
    train_df[f] = lbl.transform(list(train_df[f].values))
    test_df[f] = lbl.transform(list(test_df[f].values))
    
train_df.head()

Unnamed: 0,ID,y,X0,X1,X2,X3,X4,X5,X6,X8,...,X375,X376,X377,X378,X379,X380,X382,X383,X384,X385
0,0,130.81,37,23,20,0,3,27,9,14,...,0,0,1,0,0,0,0,0,0,0
1,6,88.53,37,21,22,4,3,31,11,14,...,1,0,0,0,0,0,0,0,0,0
2,7,76.26,24,24,38,2,3,30,9,23,...,0,0,0,0,0,0,1,0,0,0
3,9,80.62,24,21,38,5,3,30,11,4,...,0,0,0,0,0,0,0,0,0,0
4,13,78.02,24,23,38,5,3,14,3,13,...,0,0,0,0,0,0,0,0,0,0


In [6]:
NUM_FOLDS = 10
kf = model_selection.KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
best_errors = {}
r2_scores = np.ndarray(shape=[NUM_FOLDS], dtype=np.float32)
model_ids = []

Y = train_df['y'].values
X = train_df.drop(["ID", "y"], axis=1)

print('X.shape', X.shape)
print('Y.shape', Y.shape)

for fold, (train_idxs, val_idxs) in enumerate(kf.split(X)):
    print('Fold', fold)
    
    train_X, val_X = X.iloc[train_idxs], X.iloc[val_idxs]
    train_Y, val_Y = Y[train_idxs], Y[val_idxs]

    def xgb_r2_score(preds, dtrain):
        labels = dtrain.get_label()
        return 'rmse', -1.0 * metrics.r2_score(labels, preds)

    model_id = "model-" + str(int(time.time()))
    model_ids.append(model_id)
    print('Training', model_id)

    model = xgb.XGBRegressor(max_depth = 10,
                        gamma=0.5,
                        objective="reg:linear",
                        n_estimators=1000,
                        learning_rate=0.005,
                        nthread=12,
                        subsample=0.8,
                        colsample_bytree=0.70,
                        colsample_bylevel=0.70,
                        seed=42,
                        silent=True)

    model.fit(train_X, train_Y, eval_set=[(train_X, train_Y), (val_X, val_Y)], verbose=False, eval_metric='rmse', early_stopping_rounds=50)
    evals_result = model.evals_result()
    #pickle.dump(model, open(os.path.join(MODELS_PATH, model_id + "-evals-result.pk"), "wb"))
    pickle.dump(model, open(os.path.join(MODELS_PATH, model_id + ".xgb"), "wb"))
    best_val_error_idx = np.argmin(evals_result['validation_1']['rmse'])
    best_val_error = evals_result['validation_1']['rmse'][best_val_error_idx]
    best_train_error = evals_result['validation_0']['rmse'][best_val_error_idx]
    best_errors[model_id] = (best_train_error, best_val_error)
    print('  Best train err', best_train_error, 'val err', best_val_error)
    val_Y_pred = model.predict(val_X)
    val_r2_score = metrics.r2_score(val_Y, val_Y_pred)
    r2_scores[fold] = val_r2_score
    print('  Val R2', val_r2_score)
    print('  Saved', model_id)
    
min_error_model_id = None
min_val_error = 1e15
sum_train_error = 0.0
sum_val_error = 0.0
for key, val in best_errors.items():
    sum_train_error += val[0]
    sum_val_error += val[1]
    if val[1] < min_val_error:
        min_val_error = val[1]
        min_error_model_id = key

print('Avg model', min_error_model_id,
      'train err', sum_train_error/5.0,
      'val err', sum_val_error/5.0,
      'val r2', np.mean(r2_scores))
print('Best model', min_error_model_id,
      'train err', best_errors[min_error_model_id][0],
      'val err', best_errors[min_error_model_id][1])

X.shape (4209, 364)
Y.shape (4209,)
Fold 0
Training model-1497900553
  Best train err 6.284662 val err 7.666833
  Val R2 0.59014905394
  Saved model-1497900553
Fold 1
Training model-1497900558
  Best train err 6.194374 val err 8.608378
  Val R2 0.557062811841
  Saved model-1497900558
Fold 2
Training model-1497900562
  Best train err 5.762289 val err 12.330914
  Val R2 0.319068676081
  Saved model-1497900562
Fold 3
Training model-1497900566
  Best train err 6.233227 val err 8.207364
  Val R2 0.558103926927
  Saved model-1497900566
Fold 4
Training model-1497900570
  Best train err 6.201195 val err 8.215694
  Val R2 0.595836607858
  Saved model-1497900570
Fold 5
Training model-1497900574
  Best train err 6.179834 val err 7.445438
  Val R2 0.6078119607
  Saved model-1497900574
Fold 6
Training model-1497900578
  Best train err 6.348969 val err 7.448643
  Val R2 0.6250280859
  Saved model-1497900578
Fold 7
Training model-1497900582
  Best train err 6.230577 val err 7.87656
  Val R2 0.5807387

In [7]:
test_Y_pred = np.zeros(shape=(test_df.shape[0], 9), dtype=np.float32)

pred_count = 0
for fold in [0, 1, 3, 4, 5, 6, 7, 8, 9]:
    model_id = model_ids[fold]
    model = pickle.load(open(os.path.join(MODELS_PATH, model_id + '.xgb'), 'rb'))
    test_X = test_df.drop(['ID'], axis=1)
    test_Y_pred[:, pred_count] = model.predict(test_X)
    pred_count+=1

test_Y_avg = np.mean(test_Y_pred, axis=1)
test_Y_avg.shape

(4209,)

In [8]:
submission_df = test_df[['ID']]
submission_df['y'] = test_Y_avg.tolist()
submission_df.to_csv(os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'), index=False)
print('Generated submission ', os.path.join(SUBMISSION_PATH, 'submission-' + str(int(time.time())) + '.csv'))

Generated submission  /kaggle/dev/mercedes-benz-greener-manufacturing-data/submissions/submission-1497900612.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
