In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
from math import expm1, log1p, sqrt
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, GridSearchCV, train_test_split, cross_val_score
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.ensemble import RandomForestRegressor
from sklearn import model_selection
from sklearn import ensemble

import matplotlib.pylab as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

In [2]:
n_folds = 5

In [3]:
X = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [4]:
X.drop('ID', axis=1, inplace=True)
test_id = test.pop('ID')
X['target'] = X['target'].apply(lambda x: log1p(x))

In [5]:
y = X.pop('target')

In [6]:
cols_to_remove = []
for col in X.columns:
    if X[col].std() == 0: 
        cols_to_remove.append(col)
        
# remove constant columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True)
# remove constant columns in the test set
test.drop(cols_to_remove, axis=1, inplace=True)

print("Removed `{}` Constant Columns\n".format(len(cols_to_remove)))

Removed `256` Constant Columns



In [7]:
cols_to_remove = []
cols_scaned = []
dups = {}

columns = X.columns
for i in range(len(columns) - 1):
    v = X[columns[i]].values
    dup_cols = []
    for j in range(i + 1, len(columns)):
        if np.array_equal(v, X[columns[j]].values):
            cols_to_remove.append(columns[j])
            if columns[j] not in cols_scaned:
                dup_cols.append(columns[j]) 
                cols_scaned.append(columns[j])
                dups[columns[i]] = dup_cols
                
# remove duplicate columns in the training set
X.drop(cols_to_remove, axis=1, inplace=True) 
# remove duplicate columns in the testing set
test.drop(cols_to_remove, axis=1, inplace=True)

print("Removed `{}` Duplicate Columns\n".format(len(dups)))

Removed `4` Duplicate Columns



In [8]:
NUM_OF_FEATURES = 1000
def rmsle(y, pred):
    return np.sqrt(np.mean(np.power(y - pred, 2)))

x1, x2, y1, y2 = model_selection.train_test_split(X, y, test_size=0.20, random_state=5)
model = ensemble.RandomForestRegressor(n_jobs=-1, random_state=7)
model.fit(x1, y1)
print(rmsle(y2, model.predict(x2)))

col = pd.DataFrame({'importance': model.feature_importances_, 'feature': X.columns}).sort_values(
    by=['importance'], ascending=[False])[:NUM_OF_FEATURES]['feature'].values
X = X[col]
test = test[col]

1.5367277914935245


In [9]:
def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X)
    rmse= np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [10]:
from sklearn import random_projection
from sklearn import random_projection
from sklearn.preprocessing import scale, MinMaxScaler
import gc
import itertools
from copy import deepcopy
import time

ntrain = len(X)
ntest = len(test)
tmp = pd.concat([X, test]) # RandomProjection
weight = ((X != 0).sum() / len(X)).values # Non-zero count / total count per column

tmp_train = X[X != 0]
tmp_test = test[test != 0]
X["weight_count"] = (tmp_train * weight).sum(axis=1)
test["weight_count"] = (tmp_test * weight).sum(axis=1)
X["count_not0"] = (X != 0).sum(axis=1)
test["count_not0"] = (test != 0).sum(axis=1)
X["sum"] = X.sum(axis=1)
test["sum"] = test.sum(axis=1)
X["var"] = tmp_train.var(axis=1)
test["var"] = tmp_test.var(axis=1)
X["median"] = tmp_train.median(axis=1)
test["median"] = tmp_test.median(axis=1)
X["mean"] = tmp_train.mean(axis=1)
test["mean"] = tmp_test.mean(axis=1)
X["std"] = tmp_train.std(axis=1)
test["std"] = tmp_test.std(axis=1)
X["max"] = tmp_train.max(axis=1)
test["max"] = tmp_test.max(axis=1)
X["min"] = tmp_train.min(axis=1)
test["min"] = tmp_test.min(axis=1)
del(tmp_train)
del(tmp_test)

# train data is valid , test data has nan and infinite
tmp = pd.DataFrame(np.nan_to_num(tmp))
# Go through the columns one at a time (can't do it all at once for this dataset)
total_df = deepcopy(tmp)      
print('np.any(np.isnan(total_df)', np.any(np.isnan(total_df)))
print('np.all(np.isfinite(total_df)', np.all(np.isfinite(total_df)))

# Mean-variance scale all columns excluding 0-values'
print('total_df.columns:',total_df.columns) 
columnsCount = len(total_df.columns)
for col in total_df.columns:
    # Detect outliers in this column
    data = total_df[col].values
    data_mean, data_std = np.mean(data), np.std(data)
    cut_off = data_std * 3
    lower, upper = data_mean - cut_off, data_mean + cut_off
    outliers = [x for x in data if x < lower or x > upper]
    
    # If there are crazy high values, do a log-transform
    if len(outliers) > 0:
        non_zero_idx = data != 0
        total_df.loc[non_zero_idx, col] = np.log(data[non_zero_idx])
    
    # Scale non-zero column values
    nonzero_rows = total_df[col] != 0
    if  np.isfinite(total_df.loc[nonzero_rows, col]).all():
        total_df.loc[nonzero_rows, col] = scale(total_df.loc[nonzero_rows, col])
        if  np.isfinite(total_df[col]).all():
            # Scale all column values
            total_df[col] = scale(total_df[col])
    gc.collect()

NUM_OF_COM = 100 #need tuned
transformer = random_projection.SparseRandomProjection(n_components = NUM_OF_COM)
RP = transformer.fit_transform(tmp)
rp = pd.DataFrame(RP)
columns = ["RandomProjection{}".format(i) for i in range(NUM_OF_COM)]
rp.columns = columns

rp_train = rp[:ntrain]
rp_test = rp[ntrain:]
rp_test.index = test.index

#concat RandomProjection and raw data
X = pd.concat([X, rp_train],axis=1)
test = pd.concat([test, rp_test],axis=1)

del(rp_train)
del(rp_test)

np.any(np.isnan(total_df) False
np.all(np.isfinite(total_df) True
total_df.columns: RangeIndex(start=0, stop=1000, step=1)


In [11]:
print(X.shape)
print(test.shape)

(4459, 1109)
(49343, 1109)


In [12]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.054, gamma=0.4, 
                             learning_rate=0.01, max_depth=8, 
                             min_child_weight=5, n_estimators=1000,
                             reg_alpha=1e-05, reg_lambda=0.8571,
                             subsample=0.6, random_state =7,
                             nthread=4)

In [30]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 1.3471 (0.0563)



In [13]:
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=144,
                              learning_rate=0.005, n_estimators=720, max_depth=13,
                              metric='rmse', is_training_metric=True,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.9)

In [None]:
score = rmsle_cv(model_lgb)
print("LGB score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

In [14]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y, eval_metric='rmse'):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y, eval_metric=eval_metric)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [15]:
averaged_models = AveragingModels(models = (model_xgb, model_lgb))

In [18]:
score = rmsle_cv(averaged_models)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Averaged base models score: 1.3424 (0.0559)



In [16]:
submit = pd.DataFrame()
submit['ID'] = test_id

In [17]:
averaged_models.fit(X, y, eval_metric='rmse')
predictions = averaged_models.predict(test)

In [18]:
submit['target'] = [expm1(x) for x in predictions]
submit.to_csv('my_XGB_prediction.csv', index=False)