In [1]:
import os
import numpy as np
import pandas as pd
import sklearn
import collections
from functools import partial
from sklearn.model_selection import cross_val_score
from hyperopt import fmin, tpe, hp, rand, Trials, space_eval
from xgboost import XGBRegressor

In [None]:
# Squared-error XGBoost regressor with a fixed seed; the bare name on the last
# line displays the estimator's configuration as the cell output.
xgb = XGBRegressor(random_state=42, objective='reg:squarederror')
xgb

In [None]:
# NOTE(review): `print_out` is not defined in any visible cell, and `train` is
# only loaded in a later cell, so this call presumably fails on a fresh
# Restart & Run All -- confirm where `print_out` is supposed to come from.
print_out(train)

In [None]:
# Count how often each P_ID occurs in `train` and print the ones that occur
# more than once.
repeat = collections.Counter(train['P_ID'])
for pid in (key for key, occurrences in repeat.items() if occurrences > 1):
    print(pid)

In [7]:
def impute_encoder(data):
    """Impute missing values and frequency-encode low-cardinality columns.

    Each column is handled according to the number of distinct values that
    ``pd.unique`` reports for it (a missing value counts as one distinct
    value):

    * fewer than 100 distinct values: missing entries are filled with the
      most frequent value, then the column is replaced by ``<col>_encode``,
      which holds the share of rows carrying each value;
    * otherwise: missing entries are filled with the column mean and the
      column keeps its original name (so it must support ``mean()``).

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. Encoded columns are appended after the
        remaining original columns, in the order they were processed.

    Notes
    -----
    Frequencies are computed from ``data`` itself, so encoding two tables
    separately gives each its own frequencies, and a column can end up
    encoded in one table but mean-imputed in the other if their
    cardinalities differ.
    """
    # Snapshot the names: columns are added and dropped while looping.
    for col in list(data.columns):
        if len(pd.unique(data[col])) < 100:
            counts = data[col].value_counts()
            if counts.empty:
                # The column has no observed value at all, so there is no
                # mode to impute with (``counts.index[0]`` would raise
                # IndexError). The encoded column is left entirely missing.
                data[col + '_encode'] = float('nan')
            else:
                # Assign the result back instead of calling
                # ``data[col].fillna(..., inplace=True)``: that chained form
                # acts on a temporary object and does not update ``data``
                # when pandas Copy-on-Write is active.
                data[col] = data[col].fillna(counts.index[0])
                fe = data.groupby(col).size() / len(data)
                data[col + '_encode'] = data[col].map(fe)
            data.drop(columns=[col], inplace=True)
        else:
            data[col] = data[col].fillna(data[col].mean())
    return data

def xgb_objective(parameter, feature, label):
    """Return the 5-fold cross-validated mean absolute error for one trial.

    Parameters
    ----------
    parameter : dict
        Keyword arguments for ``XGBRegressor`` (one sample from the search
        space). Entries here override the defaults set below.
    feature, label
        Passed to ``cross_val_score`` as the design matrix and the target.

    Returns
    -------
    float
        Mean absolute error averaged over the 5 folds (lower is better).
    """
    # Start from the fixed defaults and let the sampled parameters override
    # them. Passing the defaults as explicit keywords next to ``**parameter``
    # raised "got multiple values for keyword argument" as soon as the search
    # space contained ``learning_rate``, ``objective`` or ``n_jobs``.
    model_kwargs = {'n_jobs': -1, 'objective': 'reg:squarederror', 'learning_rate': 0.005}
    model_kwargs.update(parameter)
    scores = cross_val_score(XGBRegressor(**model_kwargs), feature, label, cv = 5,
                             scoring = 'neg_mean_absolute_error', n_jobs = -1)
    # The scorer yields negated MAE; flip the sign to get a loss to minimize.
    loss = -scores.mean()
    return loss

In [3]:
# Hyperopt search space; the keys are XGBRegressor keyword names.
#   n_estimators     : one of 10, 20, ..., 200
#   max_depth        : one of 10, 11, ..., 29
#   subsample        : uniform on [0.8, 1]
#   gamma            : [0.5, 1] with step 0.05
#   colsample_bytree : [0.5, 1] with step 0.05
xgb_parameter = {
    'n_estimators': hp.choice('n_estimators', np.linspace(10, 200, num=20, dtype=np.int32)),
    'max_depth': hp.choice('max_depth', np.arange(10, 30, dtype=int)),
    'subsample': hp.uniform('subsample', 0.8, 1),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.5, 1, 0.05),
}

In [None]:
# xgb_parameter ={
#         'max_depth': hp.choice('max_depth', np.arange(10, 30, dtype = int)),
#         'min_child_weight': hp.quniform ('min_child', 1, 20, 1),
#         'subsample': hp.uniform ('subsample', 0.8, 1),
#         'n_estimators' : hp.choice('n_estimators', np.arange(1000, 10000, 100, dtype=int)),
#         'learning_rate' : hp.quniform('learning_rate', 0.025, 0.5, 0.025),
#         'gamma' : hp.quniform('gamma', 0.5, 1, 0.05),
#         'colsample_bytree' : hp.quniform('colsample_bytree', 0.5, 1, 0.05)
#     }

In [8]:
targets = ['CIEX', 'CIEY', 'CIEX_DIFF', 'CIEY_DIFF']
# Columns that are excluded from the model inputs in both tables.
excluded_columns = ['id', 'Date', 'LOG ID', 'Sub. ID', '4-2_D', '4-2_M']
data_path = '/home/motionlab/Desktop/weitai/project/smart_manufacturing/csv_file'
train = pd.read_csv(os.path.join(data_path, 'train.csv'))
test = pd.read_csv(os.path.join(data_path, 'test.csv'))
# Keep a single row per P_ID.
train.drop_duplicates(subset = ['P_ID'], inplace = True)
train_features = train.drop(excluded_columns + targets, axis = 1)
test_features = test.drop(excluded_columns, axis = 1)
encode_train_features = impute_encoder(train_features)
encode_test_features = impute_encoder(test_features)

# The final models are fitted and applied on bare arrays, so the two tables
# must agree column by column. impute_encoder decides per table whether a
# column is encoded, so a mismatch is possible and would otherwise go
# unnoticed.
train_columns = list(encode_train_features.columns)
test_columns = list(encode_test_features.columns)
if train_columns != test_columns:
    raise ValueError(
        'Encoded train/test feature columns differ or are ordered differently: '
        'only in train {}, only in test {}'.format(
            [c for c in train_columns if c not in test_columns],
            [c for c in test_columns if c not in train_columns]))

train_matrix = np.array(encode_train_features)
test_matrix = np.array(encode_test_features)
predictions = []
for target in targets:
    train_label = np.array(train[target])
    objective = partial(xgb_objective, feature = encode_train_features, label = train_label)
    min_objective = fmin(fn = objective, space = xgb_parameter, algo = tpe.suggest, max_evals = 15, trials = Trials())
    # space_eval turns fmin's result into concrete parameter values.
    best_parameter = space_eval(xgb_parameter, min_objective)
    print('Target is : ', target)
    print("best rf estimate parameters" , best_parameter)
    print("=============================")
    # Pass the tuned values by name. The recorded output shows space_eval
    # returning keys alphabetically, not in declaration order, so unpacking
    # them by position only worked by coincidence.
    # NOTE(review): xgb_objective scores candidates with learning_rate=0.005
    # while the final fit uses 0.01 -- confirm which one is intended.
    model_kwargs = {'learning_rate': 0.01, 'n_jobs': -1}
    model_kwargs.update(best_parameter)
    best_xgb = XGBRegressor(**model_kwargs)
    # Train
    best_xgb.fit(train_matrix, train_label)
    # Predict
    predictions.append(best_xgb.predict(test_matrix))

# One column per target, sized by the actual number of test rows rather than
# a hard-coded 5723. Cast to float64 to match the dtype the previous
# zeros-based concatenation produced.
predict_data = np.column_stack(predictions).astype(np.float64)
predict_dataframe = pd.DataFrame(predict_data, columns = targets)

100%|██████████| 15/15 [00:09<00:00,  1.58trial/s, best loss: 0.737166854623865]
Target is :  CIEX
best rf estimate parameters {'colsample_bytree': 0.6000000000000001, 'gamma': 0.8, 'max_depth': 16, 'n_estimators': 190, 'subsample': 0.8373402879792827}
100%|██████████| 15/15 [00:06<00:00,  2.41trial/s, best loss: 0.9104901689688102]
Target is :  CIEY
best rf estimate parameters {'colsample_bytree': 0.8, 'gamma': 0.55, 'max_depth': 29, 'n_estimators': 200, 'subsample': 0.9363554792080997}
100%|██████████| 15/15 [00:04<00:00,  3.70trial/s, best loss: 0.19114650891148766]
Target is :  CIEX_DIFF
best rf estimate parameters {'colsample_bytree': 0.8500000000000001, 'gamma': 0.8500000000000001, 'max_depth': 25, 'n_estimators': 190, 'subsample': 0.8407445217938291}
100%|██████████| 15/15 [00:04<00:00,  3.09trial/s, best loss: 0.1914232833578148]
Target is :  CIEY_DIFF
best rf estimate parameters {'colsample_bytree': 0.8, 'gamma': 0.55, 'max_depth': 16, 'n_estimators': 190, 'subsample': 0.93420

In [6]:
# Display the predictions (one column per target) as the cell output.
predict_dataframe

Unnamed: 0,CIEX,CIEY,CIEX_DIFF,CIEY_DIFF
0,2.746166,1.937995,0.086999,0.071941
1,2.746166,1.937995,0.086999,0.071941
2,2.746166,1.937995,0.086999,0.071941
3,2.746166,1.937995,0.086999,0.071941
4,2.746166,1.937995,0.086999,0.071941
...,...,...,...,...
5718,0.713830,4.168368,0.086999,0.071941
5719,0.713830,4.168368,0.086999,0.071941
5720,0.713830,4.168368,0.086999,0.071941
5721,0.713830,4.168368,0.086999,0.071941


In [7]:
# Put the test-set ids next to the predicted targets (rows are matched on
# index) and show the resulting table.
submmit = pd.concat(objs = [test['id'], predict_dataframe], axis = 'columns')
submmit

Unnamed: 0,id,CIEX,CIEY,CIEX_DIFF,CIEY_DIFF
0,2,2.746166,1.937995,0.086999,0.071941
1,4,2.746166,1.937995,0.086999,0.071941
2,5,2.746166,1.937995,0.086999,0.071941
3,7,2.746166,1.937995,0.086999,0.071941
4,10,2.746166,1.937995,0.086999,0.071941
...,...,...,...,...,...
5718,10674,0.713830,4.168368,0.086999,0.071941
5719,10675,0.713830,4.168368,0.086999,0.071941
5720,10676,0.713830,4.168368,0.086999,0.071941
5721,10680,0.713830,4.168368,0.086999,0.071941


In [8]:
# Write the submission table to disk without the DataFrame index.
submmit.to_csv(
    path_or_buf = '/home/motionlab/Desktop/weitai/project/test result/xgb_first_trial.csv',
    index = False,
)