In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
# import xgboost as xgb
# import catboost as ctb
import lightgbm as gbm
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif']=['SimHei']   # 用黑体显示中文
plt.rcParams['axes.unicode_minus']=False     # 正常显示负号

In [2]:
path = 'input/'

train_sales_data = pd.read_csv(path + '/Train/train_sales_data.csv')
train_search_data = pd.read_csv(path + '/Train/train_search_data.csv')
train_user_reply_data = pd.read_csv(path + '/Train/train_user_reply_data.csv')

test = pd.read_csv(path + '/evaluation_public.csv')

data = pd.concat([train_sales_data, test], ignore_index=True)
data = data.merge(train_search_data, 'left', on=['province', 'adcode', 'model', 'regYear', 'regMonth'])
data = data.merge(train_user_reply_data, 'left', on=['model', 'regYear', 'regMonth'])

data['label'] = data['salesVolume']
data['id'] = data['id'].fillna(0).astype(int)
del data['salesVolume'], data['forecastVolum']
data['bodyType'] = data['model'].map(train_sales_data.drop_duplicates('model').set_index('model')['bodyType'])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  if __name__ == '__main__':


In [3]:
for i in ['bodyType', 'model']:
    data[i] = data[i].map(dict(zip(data[i].unique(), range(data[i].nunique()))))

data['mt'] = (data['regYear'] - 2016) * 12 + data['regMonth']

# 'popularity', 'carCommentVolum', 'newsReplyVolum','label'/

shift_feat = []

data['model_adcode'] = data['adcode'] + data['model']
data['model_adcode_mt'] = data['model_adcode'] * 100 + data['mt']
for i in [11]:
    i = i + 1
    shift_feat.append('shift_model_adcode_mt_label_{0}'.format(i))
    data['model_adcode_mt_{0}'.format(i)] = data['model_adcode_mt'] + i
    data_last = data[~data.label.isnull()].set_index('model_adcode_mt_{0}'.format(i))
    data['shift_model_adcode_mt_label_{0}'.format(i)] = data['model_adcode_mt'].map(data_last['label'])

num_feat = ['regYear'] + shift_feat
cate_feat = ['adcode', 'bodyType', 'model', 'regMonth']

features = num_feat + cate_feat

# data['n_label'] = data['label'] / data.groupby('model')['label'].transform('mean')
train_idx = (data['mt'] <= 20)

valid_idx = (data['mt'].between(21, 24))

test_idx = (data['mt'] > 24)

data['model_weight'] = data.groupby('model')['label'].transform('mean')
data['n_label'] = data['label'] / data['model_weight']

train_x = data[train_idx][features]
train_y = data[train_idx]['n_label']

valid_x = data[valid_idx][features]
valid_y = data[valid_idx]['n_label']

# test_x = data[test_idx][features]



In [4]:
lgb_model = lgb.LGBMRegressor(
    num_leaves=32, reg_alpha=1, reg_lambda=0.1, objective='mse',
    max_depth=-1, learning_rate=0.05, min_child_samples=5, random_state=np.random.randint(1000),
    n_estimators=5000, subsample=0.8, colsample_bytree=0.8,
)

lgb_model.fit(train_x, train_y, eval_set=[
    (valid_x, valid_y),
], categorical_feature=cate_feat, early_stopping_rounds=100, verbose=100)

data['pred_label'] = lgb_model.predict(data[features]) * data['model_weight']



New categorical_feature is ['adcode', 'bodyType', 'model', 'regMonth']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 100 rounds.
[100]	valid_0's l2: 0.281905
[200]	valid_0's l2: 0.248519
[300]	valid_0's l2: 0.242133
Early stopping, best iteration is:
[298]	valid_0's l2: 0.241591


In [5]:
def score(data, pred='pred_label', label='label', group='model'):
    data['pred_label'] = data['pred_label'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
    data_agg = data.groupby('model').agg({
        pred: list,
        label: [list, 'mean'],

    }).reset_index()

    data_agg.columns = ['_'.join(col).strip() for col in data_agg.columns]
    nrmse_score = []
    for raw in data_agg[['{0}_list'.format(pred), '{0}_list'.format(label), '{0}_mean'.format(label)]].values:
        nrmse_score.append(
            mse(raw[0], raw[1]) ** 0.5 / raw[2]
        )
    print(1 - np.mean(nrmse_score))
    return 1 - np.mean(nrmse_score)

In [5]:
test_idx.shape

(36960,)

In [7]:
data.shape

(36960, 19)

In [8]:
data.head()

Unnamed: 0,adcode,bodyType,id,model,province,regMonth,regYear,popularity,carCommentVolum,newsReplyVolum,label,mt,model_adcode,model_adcode_mt,model_adcode_mt_12,shift_model_adcode_mt_label_12,model_weight,n_label,pred_label
0,310000,0,0,0,上海,1,2016,1479.0,11.0,106.0,292.0,1,310000,31000001,31000013,,444.518939,0.65689,351.316062
1,530000,0,0,0,云南,1,2016,1594.0,11.0,106.0,466.0,1,530000,53000001,53000013,,444.518939,1.048324,326.658092
2,150000,0,0,0,内蒙古,1,2016,1479.0,11.0,106.0,257.0,1,150000,15000001,15000013,,444.518939,0.578153,234.748291
3,110000,0,0,0,北京,1,2016,2370.0,11.0,106.0,408.0,1,110000,11000001,11000013,,444.518939,0.917846,583.015924
4,510000,0,0,0,四川,1,2016,3562.0,11.0,106.0,610.0,1,510000,51000001,51000013,,444.518939,1.37227,555.529154


In [7]:
# best_score = score(data[valid_idx])
lgb_model.n_estimators = 666

lgb_model.fit(data[~test_idx][features], data[~test_idx]['n_label'], categorical_feature=cate_feat)
data['forecastVolum'] = lgb_model.predict(data[features]) * data['model_weight']
sub = data[test_idx][['id']]
sub['forecastVolum'] = data[test_idx]['forecastVolum'].apply(lambda x: 0 if x < 0 else x).round().astype(int)
sub.to_csv(path + 'sub/sub.csv', index=False)

New categorical_feature is ['adcode', 'bodyType', 'model', 'regMonth']
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


FileNotFoundError: [Errno 2] No such file or directory: 'input/sub/sub.csv'

In [8]:
sub.to_csv(path + 'sub/sub.csv', index=False)

In [9]:
sub.head()

Unnamed: 0,id,forecastVolum
31680,1,258
31681,2,393
31682,3,188
31683,4,345
31684,5,424
