In [81]:
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from scipy.stats import wilcoxon

In [82]:
def read_data(filename):
    with open(filename, "r") as file:
        lines = list(file)
        x = np.zeros((len(lines), 51))
        y = np.zeros((len(lines), 1))
        for i in range(len(lines)):
            split_row = lines[i].split('\t')
            y[i] = float(float(split_row[0]))
            for j in range(1, len(split_row)):
                x[i, j - 1] = float(split_row[j])
        return x, y

In [27]:
X_train, y_train = read_data("features.txt")
X_test, y_test = read_data("featuresTest.txt")
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

(12465, 51) (12465, 1)
(46596, 51) (46596, 1)


In [65]:
print(X_train[:5])

[[2.57727e-01 2.15909e-02 1.71299e-01 1.00000e+00 1.00000e+00 1.00000e+00
  1.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 4.31373e-01 9.35065e-01 2.08333e-02 7.08240e-02
  1.00000e+00 0.00000e+00 3.13726e-01 1.00000e+00 1.00000e+00 0.00000e+00
  9.37724e-01 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 5.66038e-02 0.00000e+00 0.00000e+00 1.00000e+00 7.39290e-01
  1.00000e+00 5.05391e-04 8.85819e-01 1.72727e-04 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00 1.53262e-01 5.78118e-01
  2.22098e-01 1.00000e+00 4.37450e-01]
 [4.24438e-01 1.64384e-01 5.72649e-01 1.00000e+00 1.00000e+00 0.00000e+00
  1.00000e+00 0.00000e+00 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00
  0.00000e+00 0.00000e+00 3.60784e-01 5.12195e-01 4.47049e-02 7.17587e-02
  1.00000e+00 1.00000e+00 3.21569e-01 1.00000e+00 1.00000e+00 0.00000e+00
  9.41214e-01 0.00000e+00 1.00000e+00 0.00000e+00 0.00000e+00 0.00000e+00

In [66]:
def create_model():
    return XGBRegressor(
        n_estimators=500, 
        learning_rate=0.02, 
        max_depth=5,
        eta=1,
        subsample=0.8,
        reg_lambda=0,
        reg_alpha=1,
        n_jobs=4)


def train_test_model(X_train, X_test, y_train, y_test, feature_to_drop=None):
    """Train the model and return RMSE on {X_test, y_test}."""
    if feature_to_drop is not None:
        X_train = np.delete(X_train, [feature_to_drop], axis=1)
        X_test = np.delete(X_test, [feature_to_drop], axis=1)
    model = create_model()
    model.fit(X_train, y_train)
    predicted = model.predict(X_test)
    return np.sqrt(mean_squared_error(y_test, predicted))

In [67]:
rmse = train_test_model(X_train, X_test, y_train, y_test)
print('Base model')
print('\nRMSE: ', rmse)

Base model

RMSE:  0.07528854867017966


In [75]:
features = [1,16,36,50]  # feature indicies to explore

In [76]:
for feature_ind in features:
    rmse = train_test_model(X_train, X_test, y_train, y_test, feature_to_drop=feature_ind)
    print('\nRMSE without feature {}: {}'.format(feature_ind, rmse))


RMSE without feature 1: 0.07532142885524475

RMSE without feature 16: 0.07535249274398034

RMSE without feature 36: 0.07530117237606908

RMSE without feature 50: 0.07527984128430162


In [72]:
X_all = X_train  # np.concatenate([X_test, X_train], axis=0)
y_all = y_train  # np.concatenate([y_test, y_train], axis=0)

print(X_all.shape, y_all.shape)

(12465, 51) (12465, 1)


In [88]:
SPLIT_COUNT = 10

splits = []
for _ in range(SPLIT_COUNT):
    splits.append(train_test_split(X_all, y_all, test_size=0.5))

In [89]:
rmses = []
for split in splits:
    X_all_train, X_all_test, y_all_train, y_all_test = split
    rmse = train_test_model(X_all_train, X_all_test, y_all_train, y_all_test)
    rmses.append(rmse)

for feature_ind in features:
    rmses_dropped = []
    for split in splits:
        X_all_train, X_all_test, y_all_train, y_all_test = split
        rmse = train_test_model(X_all_train, X_all_test, y_all_train, y_all_test, feature_to_drop=feature_ind)
        rmses_dropped.append(rmse)
    
    print('\nFeature', feature_ind)
    print('RMSE mean with:', sum(rmses) / SPLITS)
    print('RMSE mean without:', sum(rmses_dropped) / SPLITS)
    statistic, p_value = wilcoxon(rmses, rmses_dropped)
    print('p-value for hypothesis of having the same mean: {}', p_value)


Feature 1
RMSE mean with: 0.07662974838134605
RMSE mean without: 0.07668008911312142
p-value for hypothesis of having the same mean: {} 0.02841686417486375

Feature 16
RMSE mean with: 0.07662974838134605
RMSE mean without: 0.07662815787030242
p-value for hypothesis of having the same mean: {} 0.7212766990291557

Feature 36
RMSE mean with: 0.07662974838134605
RMSE mean without: 0.07665913320630996
p-value for hypothesis of having the same mean: {} 0.1688069535565081

Feature 50
RMSE mean with: 0.07662974838134605
RMSE mean without: 0.0765399537097601
p-value for hypothesis of having the same mean: {} 0.016604878103722735
