In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgb

from tqdm import tqdm
import gc
import warnings
# Silence warnings so cell output stays readable.
# NOTE: the blanket filter on the second line hides every warning category,
# which makes the FutureWarning-specific filter above it redundant.
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')
# Use fully qualified option names: the short patterns 'max_columns' / 'max_rows'
# also match the styler.render.* options in newer pandas and raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# 读取数据

In [2]:
data_path = '../../input/Round1/'
merge_keys = ['province', 'adcode', 'model', 'regYear', 'regMonth']

# The user-reply table (train_user_reply_data.csv) is not loaded here.
train_sales_data = pd.read_csv(data_path + 'train_sales_data.csv', encoding='utf-8')
train_search_data = pd.read_csv(data_path + 'train_search_data.csv', encoding='utf-8')
evaluation_public = pd.read_csv(data_path + 'evaluation_public.csv', encoding='utf-8')

# Attach the search columns to every sales row (default inner join on the key columns).
train_sales_data = pd.merge(train_sales_data, train_search_data, on=merge_keys)

# 固定车型和省份顺序-样本顺序

In [3]:
# Fixed iteration order of car-model ids. The feature builders loop over this
# list (outer loop) and `provinces` (inner loop), so it determines sample order;
# the position in this list is also used as the integer code for 'model'.
cars = ['f8a6975573af1b33', '2a2ab41f8f6ff1cb', 'd4efbebb087fd03f', '3e21824be728cbec',
        'ea489c253676aafc', '6155b214590c66e6', 'fc32b1a017b34efe', '9c1c7ee8ebdda299',
        'fde95ea242abd896', '7a7885e2d7c00bcf', '7245e0ee27b195cd', 'b25c4e2e3856af22',
        '7aab7fca2470987e', 'feabbf46658382b9', '04e66e578f653ab9', '5d7fb682edd0f937',
        'b4be3a4917289c82', '54fc07138d70374c', 'ef76a85c4b39f693', 'bb9fbec9a2833839',
        '3c974920a76ac9c1', '212083a9246d2fd3', '4f79773e600518a6', 'af6f4f548684e14d',
        '936168bd4850913d', 'cd5841d44fd7625e', '0797526c057dcf5b', 'a207df29ec9583f0',
        '3d7554f1f56dd664', '7023efdab9cedc03', 'da457d15788fe8ee', '12f8b7e14947c34d',
        '28e29f2c03dcd84c', '63065128401bb3ff', 'a432c483b5beb856', '37aa9169b575ef79',
        '17bc272c93f19d56', '61e73e32ad101892', '4a103c30d593fbbe', '2d0d2c3403909fdb',
        '6858d6dfe680bdf7', '17363f08d683d52b', '346393c2c6305fb1', '5b1c11c3efed5312',
        '97f15de12cfabbd5', 'a9a43d1a7ecbe75d', '7cf283430b3b5e38', 'c6833cb891626c17',
        'a28bb927b6fcb33c', 'dff803b4024d261d', '02aab221aabc03b9', 'f5d69960089c3614',
        '06880909932890ca', '79de4e4b24c35b04', 'd0f245b8781e3631', 'c06a2a387c0ee510',
        'cc21c7e91a3b5a0c', 'f270f6a489c6a9d7', '8c915fe4632fb9fa', 'c6cd4e0e073f5ac2']

# Fixed iteration order of province names (inner loop of the feature builders).
provinces = ['浙江', '福建', '四川', '陕西', '安徽', '湖南', '广东', '云南', '上海', '山东',
             '湖北', '黑龙江', '江苏', '广西', '内蒙古', '辽宁', '北京', '重庆', '河北', '山西',
             '江西', '河南']

# 评估函数

In [4]:
def metrics(y_true, y_pred, model):
    """Return 1 minus the mean per-model normalised RMSE.

    Args:
        y_true: actual sales volumes, one per sample.
        y_pred: predicted sales volumes; negative values are replaced by
            their absolute value before scoring.
        model: car-model identifier of each sample, used as the grouping key.

    Returns:
        ``1 - mean_over_models(RMSE(model) / mean(actual sales of model))``.

    Side effects:
        Prints the normalised RMSE of every model, in group order.
    """
    frame = pd.DataFrame({'model': model, 'salesVolume': y_true, 'label': y_pred})
    frame['label'] = frame['label'].abs()

    per_model = []
    for _, group in frame.groupby('model'):
        actual = np.asarray(group['salesVolume'])
        forecast = np.asarray(group['label'])
        squared_error = (actual - forecast) ** 2
        nrmse = np.sqrt(np.sum(squared_error) / len(actual)) / np.mean(actual)
        per_model.append(nrmse)
        print(nrmse)
    return 1 - (sum(per_model) / len(per_model))

# 构造训练/测试样本（键列与标签）

In [5]:
def get_train_feature(windows_size, before, sales_data=None, car_list=None, province_list=None):
    """Build the labelled training rows for every (car model, province) series.

    For each series the label of a row is the ``salesVolume`` found
    ``windows_size`` rows later; rows at positions ``before`` up to (but not
    including) ``24 - windows_size`` are kept, which assumes 24 rows per series.

    Args:
        windows_size: forecast horizon in rows (e.g. 1, 10 in the first cell).
        before: number of leading rows to skip in each series.
        sales_data: source frame with 'model', 'province' and 'salesVolume'
            columns; defaults to the notebook-level ``train_sales_data``.
        car_list: car-model ids to iterate (outer loop); defaults to ``cars``.
        province_list: provinces to iterate (inner loop); defaults to ``provinces``.

    Returns:
        DataFrame with the source columns plus 'label', ordered by car then
        province, with a fresh 0..n-1 index. Empty if there is nothing to iterate.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if sales_data is None:
        sales_data = train_sales_data
    if car_list is None:
        car_list = cars
    if province_list is None:
        province_list = provinces

    parts = []
    for car in car_list:
        for province in province_list:
            in_series = (sales_data['model'] == car) & (sales_data['province'] == province)
            # Explicit copy: the 'label' column must not be written onto a slice.
            part = sales_data[in_series].copy()
            part['label'] = part['salesVolume'].shift(-windows_size)
            parts.append(part.iloc[before: 24 - windows_size])

    if not parts:
        return pd.DataFrame()
    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(parts, axis=0, ignore_index=True)


def get_test_feature(windows_size, before, sales_data=None, car_list=None, province_list=None):
    """Return the last row of every (car model, province) series.

    A 'label' column is added the same way as in the training builder
    (``salesVolume`` shifted by ``-windows_size``); for the last row of a
    series that value is NaN whenever ``windows_size`` is positive.

    Args:
        windows_size: forecast horizon in rows, used only for the label shift.
        before: unused; kept so the signature matches the training builder.
        sales_data: source frame with 'model', 'province' and 'salesVolume'
            columns; defaults to the notebook-level ``train_sales_data``.
        car_list: car-model ids to iterate (outer loop); defaults to ``cars``.
        province_list: provinces to iterate (inner loop); defaults to ``provinces``.

    Returns:
        DataFrame with one row per series, ordered by car then province, with a
        fresh 0..n-1 index. Empty if there is nothing to iterate.
    """
    # Fall back to the notebook globals so existing two-argument calls keep working.
    if sales_data is None:
        sales_data = train_sales_data
    if car_list is None:
        car_list = cars
    if province_list is None:
        province_list = provinces

    parts = []
    for car in car_list:
        for province in province_list:
            in_series = (sales_data['model'] == car) & (sales_data['province'] == province)
            # Explicit copy: the 'label' column must not be written onto a slice.
            part = sales_data[in_series].copy()
            part['label'] = part['salesVolume'].shift(-windows_size)
            # The final row of the series is the one used for forecasting.
            parts.append(part.iloc[-1:])

    if not parts:
        return pd.DataFrame()
    # One concat at the end instead of growing the frame inside the loop.
    return pd.concat(parts, axis=0, ignore_index=True)

# 特征提取

In [6]:
def get_basic_feature(windows_size, before, data_set_name,
                      sales_data=None, car_list=None, province_list=None):
    """Build lag, difference and rolling-history features for every series.

    Each (car model, province) series must contain exactly 24 rows, because the
    Spring Festival columns are assigned from fixed 24-element lists.

    Columns produced per series, in this order after the source columns:
      * 'salesVolume' and 'popularity' replaced by their natural log
      * 'is_spring_festival', 'distance_spring_festival'
      * for lag 1 .. before-1: salesVolume_{lag}, salesVolume_diff_{lag},
        salesVolume_qoq_{lag}, popularity_{lag}, popularity_diff_{lag},
        popularity_hb_{lag}
      * for lag 1 .. before-2: salesVolume_diff2_{lag}
      * max/min/aver/var/pth of the last 7 log-sales values and of their
        consecutive differences (0 while fewer than 7 values are available)
    'popularity' itself is dropped from the result.

    Args:
        windows_size: forecast horizon in rows; only affects the train slice.
        before: number of leading rows skipped for training; also bounds the lags.
        data_set_name: 'train' keeps rows ``before`` .. ``24 - windows_size - 1``;
            any other value keeps only the last row of each series.
        sales_data: source frame; defaults to the notebook-level ``train_sales_data``.
        car_list: car-model ids to iterate (outer loop); defaults to ``cars``.
        province_list: provinces to iterate (inner loop); defaults to ``provinces``.

    Returns:
        DataFrame of all series concatenated with a fresh 0..n-1 index.

    Side effects:
        Prints the head of the resulting frame.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if sales_data is None:
        sales_data = train_sales_data
    if car_list is None:
        car_list = cars
    if province_list is None:
        province_list = provinces

    # Per-row calendar values for the 24 rows of a series.
    # NOTE(review): presumably Jan 2016 .. Dec 2017 in order -- confirm against the data.
    spring_festival_flag = [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                            1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
    spring_festival_distance = [1, 0, 1, 2, 3, 4, 5, 5, 4, 3, 2, 1,
                                0, 1, 2, 3, 4, 5, 6, 6, 5, 4, 3, 2]
    history_window = 7

    def value_range(values):
        # Spread between the largest and smallest value.
        return np.max(values) - np.min(values)

    window_stats = [('max', np.max), ('min', np.min), ('aver', np.average),
                    ('var', np.var), ('pth', value_range)]

    parts = []
    for car in car_list:
        for province in province_list:
            in_series = (sales_data['model'] == car) & (sales_data['province'] == province)
            part = sales_data[in_series].copy()

            # Log-transform popularity and sales volume.
            part['popularity'] = np.log(part['popularity'])
            part['salesVolume'] = np.log(part['salesVolume'])

            # Spring Festival marker features.
            part['is_spring_festival'] = spring_festival_flag
            part['distance_spring_festival'] = spring_festival_distance

            # Lagged value, lag-difference and ratio to the lagged value.
            for lag in range(1, before):
                tag = str(lag)
                part['salesVolume_' + tag] = part['salesVolume'].shift(lag)
                part['salesVolume_diff_' + tag] = part['salesVolume'].diff(lag)
                part['salesVolume_qoq_' + tag] = part['salesVolume'] / part['salesVolume_' + tag]

                part['popularity_' + tag] = part['popularity'].shift(lag)
                part['popularity_diff_' + tag] = part['popularity'].diff(lag)
                part['popularity_hb_' + tag] = part['popularity'] / part['popularity_' + tag]

            # Second-order difference: row-to-row change of each lag-difference.
            for lag in range(1, before - 1):
                tag = str(lag)
                part['salesVolume_diff2_' + tag] = part['salesVolume_diff_' + tag].diff(1)

            # Rolling history: the current row and the 6 rows before it. Rows that
            # do not yet have a full window get an empty window (statistic = 0).
            sales_log = part['salesVolume'].tolist()
            windows = [
                sales_log[pos - history_window + 1: pos + 1] if pos >= history_window - 1 else []
                for pos in range(len(sales_log))
            ]
            window_diffs = [np.diff(window) for window in windows]
            for stat_name, stat_func in window_stats:
                part['salesVolume_his_' + stat_name] = [
                    stat_func(window) if len(window) else 0 for window in windows]
                part['salesVolume_his_diff_' + stat_name] = [
                    stat_func(diffs) if len(diffs) else 0 for diffs in window_diffs]

            # Train / test row selection.
            if data_set_name == 'train':
                part = part.iloc[before: 24 - windows_size]
            else:
                part = part.iloc[-1:]

            parts.append(part.drop(columns=['popularity']))

    # One concat at the end instead of growing the frame inside the loop.
    features = pd.concat(parts, axis=0, ignore_index=True) if parts else pd.DataFrame()
    print(features.head())
    return features

# Begin

In [7]:
# Accumulator for the per-month forecasts: each training cell below appends its `submit` frame.
test_prob_collection = pd.DataFrame()

# Model-LightGBM - 一月

In [8]:
# Horizon of 1 row: features of month t are paired with the sales of month t + 1.
size, pre = 1, 10  # 4
train_feature = get_train_feature(size, pre)  # rows [10: 23] of each series are used for training
test_feature = get_test_feature(size, pre)    # row 24 (the last one) is the forecasting input

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  'salesVolume'

# Positional indices of the categorical columns in the final feature matrix.
# Per the printed column order these are: adcode, model, bodyType, regYear,
# regMonth, is_spring_festival, distance_spring_festival.
categorial_name = [0, 1, 2, 3, 4, 6, 7]
drop_cols = ['salesVolume', 'popularity']

# train: key + label skeleton joined with the engineered features
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')
print('train_feature.columns:', train_feature.columns)

# test
temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')
print('test_feature.columns:', test_feature.columns)

# Copy so the assignments below write to an independent frame, not a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 1
###############################

# Hold out the most recent labelled month (2017-11) as the validation set.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 11)].index)
held_out = set(test_index)  # set membership keeps the split linear in the number of rows
train_index = [row for row in range(len(train_feature)) if row not in held_out]

# Raw model ids of each split, captured before 'model' is integer-encoded.
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Integer-encode the string categoricals.
model_set = {car: code for code, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# Training target: log2(sales) + 1 (inverted after prediction with 2 ** (y - 1)).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2016        11     6.543912   
1       浙江  330000  f8a6975573af1b33    Sedan     2016        12     7.282074   
2       浙江  330000  f8a6975573af1b33    Sedan     2017         1     6.622736   
3       浙江  330000  f8a6975573af1b33    Sedan     2017         2     6.220590   
4       浙江  330000  f8a6975573af1b33    Sedan     2017         3     6.659294   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         2       6.752270   
1                   0                         1       6.543912   
2                   1                         0       7.282074   
3                   0                         1       6.622736   
4                   0                         2       6.220590   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0           -0.208359           0.96914

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2017        12     7.357556   
1       福建  350000  f8a6975573af1b33    Sedan     2017        12     6.605298   
2       四川  510000  f8a6975573af1b33    Sedan     2017        12     6.864848   
3       陕西  610000  f8a6975573af1b33    Sedan     2017        12     5.480639   
4       安徽  340000  f8a6975573af1b33    Sedan     2017        12     6.834109   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         2       7.110696   
1                   0                         2       6.469250   
2                   0                         2       6.535241   
3                   0                         2       5.187386   
4                   0                         2       6.493754   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0            0.246860           1.03471

In [9]:
# LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'rmse',
    'metric': ['rmse'],   # 'l2', 'binary_logloss',
    'learning_rate': 0.03,
    'num_leaves': 2 ** 5 - 1,    # 2 ** 5 - 1
    # 'min_child_samples': 100,
    'max_depth': 6,    # 6
    'subsample': 0.8,   # 0.8
    'subsample_freq': 5,
    'colsample_bytree': 0.8,
    'seed': 2020,
    'nthread': -1,
    'verbose': 1,
}

# Categorical columns are declared on the Dataset; the validation set inherits
# them through `reference`.
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorial_name)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# Early stopping / periodic logging via callbacks: the `early_stopping_rounds`
# and `verbose_eval` keywords of lgb.train are not available in LightGBM 4.
# `log_evaluation` is the newer name of `print_evaluation`; use whichever exists.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
# num_boost_round: 5000   early stopping patience: 100
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=[lgb_eval],
                   callbacks=[lgb.early_stopping(100), log_every(100)])

# feature importance
importance = module.feature_importance()
print('importance:\n', importance)

# Validation score on the original sales scale (inverse of log2(x) + 1).
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test - 1)
nrmse = metrics(y_true, val, val_model)

# Refit on train + validation rows with a slightly larger round budget.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all, categorical_feature=categorial_name)
model = lgb.train(params,
                  lgb_data,
                  num_boost_round=iters)

# Predict from a plain array, the same input type the model was trained on.
predict = model.predict(test_feature.values)
predict = 2 ** (predict - 1)
print('predict:\n', predict)

# `metrics` returns 1 - mean NRMSE (higher is better), not an RMSE.
print('model train over, validation score (1 - mean NRMSE):', nrmse)
submit = submit.assign(forecastVolum=predict)
# Replace, rather than duplicate, this month's rows if the cell is re-run.
if not test_prob_collection.empty:
    same_target = ((test_prob_collection['regYear'] == submit['regYear'].iloc[0])
                   & (test_prob_collection['regMonth'] == submit['regMonth'].iloc[0]))
    test_prob_collection = test_prob_collection[~same_target]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)
print('train_feature.shape: ', train_feature.shape)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.504302
[200]	valid_0's rmse: 0.475746
[300]	valid_0's rmse: 0.464901
[400]	valid_0's rmse: 0.458845
[500]	valid_0's rmse: 0.456934
[600]	valid_0's rmse: 0.454988
[700]	valid_0's rmse: 0.454449
[800]	valid_0's rmse: 0.454944
[900]	valid_0's rmse: 0.454174
Early stopping, best iteration is:
[879]	valid_0's rmse: 0.453522
importance:
 [1922 3730   80  192 2038  737  214  211  302  172  208  134  302  264
  275  361  338  151  210  225  229  309  264  108  173  172  184  199
  270   81  156  131  207  246  214   81  155  165  177  259  233   88
  147  121  194  239  209   83  159  152  264  231  182  111  121  139
  372  370  254  112  167  135  372  340  337  399  357  303  250  395
  129  247  129  174  266   46  223  182  220  198]
0.15913383310606155
0.246715670421317
0.3307019994369855
0.18918198580209203
0.36206224847378665
0.8080661576809964
0.23674459385868812
0.18041021154792256
0.205359916412832

# Model-LightGBM - 二月

In [10]:
# Horizon of 2 rows: features of month t are paired with the sales of month t + 2.
# `drop_cols` is defined in the January cell above.
size, pre = 2, 9  # 4
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# Key + label skeletons joined with the engineered features.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Copy so the assignments below write to an independent frame, not a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 2
######################################

# Hold out the most recent labelled month (2017-10) as the validation set.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 10)].index)
held_out = set(test_index)  # set membership keeps the split linear in the number of rows
train_index = [row for row in range(len(train_feature)) if row not in held_out]

# Raw model ids of each split, captured before 'model' is integer-encoded.
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Integer-encode the string categoricals.
model_set = {car: code for code, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# Training target: log2(sales) + 1 (inverted after prediction with 2 ** (y - 1)).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2016        10     6.752270   
1       浙江  330000  f8a6975573af1b33    Sedan     2016        11     6.543912   
2       浙江  330000  f8a6975573af1b33    Sedan     2016        12     7.282074   
3       浙江  330000  f8a6975573af1b33    Sedan     2017         1     6.622736   
4       浙江  330000  f8a6975573af1b33    Sedan     2017         2     6.220590   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         3       6.714171   
1                   0                         2       6.752270   
2                   0                         1       6.543912   
3                   1                         0       7.282074   
4                   0                         1       6.622736   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0            0.038100           1.00567

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2017        12     7.357556   
1       福建  350000  f8a6975573af1b33    Sedan     2017        12     6.605298   
2       四川  510000  f8a6975573af1b33    Sedan     2017        12     6.864848   
3       陕西  610000  f8a6975573af1b33    Sedan     2017        12     5.480639   
4       安徽  340000  f8a6975573af1b33    Sedan     2017        12     6.834109   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         2       7.110696   
1                   0                         2       6.469250   
2                   0                         2       6.535241   
3                   0                         2       5.187386   
4                   0                         2       6.493754   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0            0.246860           1.03471

In [11]:
# LightGBM model (same hyper-parameters as the January cell)
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2020,
      'nthread': -1,
      'verbose': 1,
}

# Categorical columns are declared on the Dataset; the validation set inherits
# them through `reference`. `categorial_name` is defined in the January cell.
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorial_name)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# Early stopping / periodic logging via callbacks: the `early_stopping_rounds`
# and `verbose_eval` keywords of lgb.train are not available in LightGBM 4.
# `log_evaluation` is the newer name of `print_evaluation`; use whichever exists.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=[lgb_eval],
                   callbacks=[lgb.early_stopping(100), log_every(100)])

# Validation score on the original sales scale (inverse of log2(x) + 1).
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test - 1)
nrmse = metrics(y_true, val, val_model)

# Refit on train + validation rows with a slightly larger round budget.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all, categorical_feature=categorial_name)
model = lgb.train(params, lgb_data, num_boost_round=iters)

# Predict from a plain array, the same input type the model was trained on.
predict = model.predict(test_feature.values)
predict = 2 ** (predict - 1)
print('predict: \n', predict)

# `metrics` returns 1 - mean NRMSE (higher is better), not an RMSE.
print('model train over, validation score (1 - mean NRMSE):', nrmse)
submit = submit.assign(forecastVolum=predict)
# Replace, rather than duplicate, this month's rows if the cell is re-run.
if not test_prob_collection.empty:
    same_target = ((test_prob_collection['regYear'] == submit['regYear'].iloc[0])
                   & (test_prob_collection['regMonth'] == submit['regMonth'].iloc[0]))
    test_prob_collection = test_prob_collection[~same_target]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.596677
[200]	valid_0's rmse: 0.55588
[300]	valid_0's rmse: 0.553743
[400]	valid_0's rmse: 0.54934
[500]	valid_0's rmse: 0.549843
Early stopping, best iteration is:
[428]	valid_0's rmse: 0.548657
0.243072706825977
0.2017772084770516
0.2998721359007381
0.2984598844671088
0.42708531238293623
0.9905714835675781
0.20331963145040813
0.2196452993475898
0.18737377006853437
0.18009488823803552
0.3730129451297239
0.2695050484794701
0.17142473299620248
0.43746710929608373
0.21484132495173355
0.2438503456792253
0.12257853867978169
0.1695201667151035
0.12765516034406632
0.45474869891142916
0.27623107737074704
0.37104190934172543
0.09195179891973009
0.40949779696318545
0.16983607541837475
0.5536869964835384
0.3062229445631245
0.27762187915960884
0.08689149266092204
0.25696968346258314
0.41503710147042944
0.150999392224026
0.2575159445750681
0.12046548941399783
0.6698127255667251
0.3428090759566444
0.298455947556435

# Model-LightGBM - 三月

In [12]:
# Horizon of 3 rows: features of month t are paired with the sales of month t + 3.
# `drop_cols` is defined in the January cell above.
size, pre = 3, 8   # 5
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# Key + label skeletons joined with the engineered features.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Copy so the assignments below write to an independent frame, not a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 3
##############################

# Hold out the most recent labelled month (2017-09) as the validation set.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 9)].index)
held_out = set(test_index)  # set membership keeps the split linear in the number of rows
train_index = [row for row in range(len(train_feature)) if row not in held_out]

# Raw model ids of each split, captured before 'model' is integer-encoded.
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Integer-encode the string categoricals.
model_set = {car: code for code, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# Training target: log2(sales) + 1 (inverted after prediction with 2 ** (y - 1)).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2016         9     6.714171   
1       浙江  330000  f8a6975573af1b33    Sedan     2016        10     6.752270   
2       浙江  330000  f8a6975573af1b33    Sedan     2016        11     6.543912   
3       浙江  330000  f8a6975573af1b33    Sedan     2016        12     7.282074   
4       浙江  330000  f8a6975573af1b33    Sedan     2017         1     6.622736   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         4       6.610696   
1                   0                         3       6.714171   
2                   0                         2       6.752270   
3                   0                         1       6.543912   
4                   1                         0       7.282074   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0            0.103474           1.01565

In [13]:
# LightGBM model
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2020,
      'nthread': -1,
      'verbose': 1,
}

# Categorical columns are declared on the Dataset; the validation set inherits
# them through `reference`. `categorial_name` is defined in the January cell.
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorial_name)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# Early stopping / periodic logging via callbacks: the `early_stopping_rounds`
# and `verbose_eval` keywords of lgb.train are not available in LightGBM 4.
# `log_evaluation` is the newer name of `print_evaluation`; use whichever exists.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=[lgb_eval],
                   callbacks=[lgb.early_stopping(100), log_every(100)])

# Validation score on the original sales scale (inverse of log2(x) + 1).
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test - 1)
nrmse = metrics(y_true, val, val_model)

# Refit on train + validation rows with a slightly larger round budget.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all, categorical_feature=categorial_name)
model = lgb.train(params, lgb_data, num_boost_round=iters)

# Predict from a plain array, the same input type the model was trained on.
predict = model.predict(test_feature.values)
predict = 2 ** (predict - 1)
print('predict:\n', predict)

# `metrics` returns 1 - mean NRMSE (higher is better), not an RMSE.
print('model train over, validation score (1 - mean NRMSE):', nrmse)
submit = submit.assign(forecastVolum=predict)
# Replace, rather than duplicate, this month's rows if the cell is re-run.
if not test_prob_collection.empty:
    same_target = ((test_prob_collection['regYear'] == submit['regYear'].iloc[0])
                   & (test_prob_collection['regMonth'] == submit['regMonth'].iloc[0]))
    test_prob_collection = test_prob_collection[~same_target]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.636188
[200]	valid_0's rmse: 0.614706
[300]	valid_0's rmse: 0.612224
[400]	valid_0's rmse: 0.605734
[500]	valid_0's rmse: 0.608614
Early stopping, best iteration is:
[406]	valid_0's rmse: 0.605348
0.17176226507120473
0.2695058375361292
0.3670427195705863
0.2480186927907422
0.487756359090314
1.0261872886939045
0.2557075692071053
0.2208736594732273
0.2875924897672297
0.21144535209082144
0.445673850840031
0.22786025733331286
0.1705048418162528
0.3965518214936857
0.23754053342581624
0.2724556421632539
0.16794127273355583
0.20353815274359413
0.2615753173803801
0.49106703657833845
0.21781538874431075
0.3274828635223032
0.11036839197496423
0.41270326526398166
0.2605744395181962
0.5765364758890045
0.2935559292620728
0.23359502840969687
0.12383728025749352
0.4555915647038619
0.2748764528093025
0.13009834007826085
0.3272515191345883
0.15296169670632326
0.9967163977630485
0.4803777837011554
0.11629958768907386
0

# Model-LightGBM - 四月

In [14]:
# Horizon of 4 rows: features of month t are paired with the sales of month t + 4.
# `drop_cols` is defined in the January cell above.
size, pre = 4, 7   # 5
train_feature = get_train_feature(size, pre)
test_feature = get_test_feature(size, pre)

cols = ['province', 'adcode', 'model', 'regYear', 'regMonth', 'bodyType']   #  , 'salesVolume'

# Key + label skeletons joined with the engineered features.
temp_train = get_basic_feature(size, pre, 'train')
train_feature = train_feature.drop(drop_cols, axis=1).merge(temp_train, on=cols, how='left')

temp_test = get_basic_feature(size, pre, 'test')
test_feature = test_feature.drop(drop_cols, axis=1).merge(temp_test, on=cols, how='left')

# Copy so the assignments below write to an independent frame, not a slice of test_feature.
submit = test_feature[['province', 'adcode', 'model']].copy()
submit['regYear'] = 2018
submit['regMonth'] = 4
###############################

# Hold out the most recent labelled month (2017-08) as the validation set.
test_index = list(train_feature[(train_feature['regYear'] == 2017) & (train_feature['regMonth'] == 8)].index)
held_out = set(test_index)  # set membership keeps the split linear in the number of rows
train_index = [row for row in range(len(train_feature)) if row not in held_out]

# Raw model ids of each split, captured before 'model' is integer-encoded.
train_model = train_feature['model'].values[train_index]   # model
val_model = train_feature['model'].values[test_index]

# Integer-encode the string categoricals.
model_set = {car: code for code, car in enumerate(cars)}
body_type_codes = {'Hatchback': 0, 'MPV': 1, 'SUV': 2, 'Sedan': 3}
train_feature['bodyType'] = train_feature['bodyType'].map(body_type_codes)
train_feature['model'] = train_feature['model'].map(model_set)
test_feature['bodyType'] = test_feature['bodyType'].map(body_type_codes)
test_feature['model'] = test_feature['model'].map(model_set)

train_label = train_feature[['label']].copy()
train_feature = train_feature.drop(['province', 'label'], axis=1)
test_feature = test_feature.drop(['province', 'label'], axis=1)

# Training target: log2(sales) + 1 (inverted after prediction with 2 ** (y - 1)).
train_label['log'] = np.log2(train_label['label']) + 1
x_train = train_feature.values[train_index]
y_train = train_label['log'].values[train_index]
x_test = train_feature.values[test_index]
y_test = train_label['log'].values[test_index]

  province  adcode             model bodyType  regYear  regMonth  salesVolume  \
0       浙江  330000  f8a6975573af1b33    Sedan     2016         8     6.610696   
1       浙江  330000  f8a6975573af1b33    Sedan     2016         9     6.714171   
2       浙江  330000  f8a6975573af1b33    Sedan     2016        10     6.752270   
3       浙江  330000  f8a6975573af1b33    Sedan     2016        11     6.543912   
4       浙江  330000  f8a6975573af1b33    Sedan     2016        12     7.282074   

   is_spring_festival  distance_spring_festival  salesVolume_1  \
0                   0                         5       6.419995   
1                   0                         4       6.610696   
2                   0                         3       6.714171   
3                   0                         2       6.752270   
4                   0                         1       6.543912   

   salesVolume_diff_1  salesVolume_qoq_1  popularity_1  popularity_diff_1  \
0            0.190701           1.02970

In [15]:
# LightGBM model (same hyper-parameters as the January cell)
params = {
      'boosting_type': 'gbdt',
      'objective': 'rmse',
      'metric': ['rmse'],   # 'l2', 'binary_logloss',
      'learning_rate': 0.03,
      'num_leaves': 2 ** 5 - 1,
      # 'min_child_samples': 100,
      'max_depth': 6,
      'subsample': 0.8,
      'subsample_freq': 5,
      'colsample_bytree': 0.8,
      'seed': 2020,
      'nthread': -1,
      'verbose': 1,
}

# Categorical columns are declared on the Dataset; the validation set inherits
# them through `reference`. `categorial_name` is defined in the January cell.
lgb_train = lgb.Dataset(x_train, y_train, categorical_feature=categorial_name)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

# Early stopping / periodic logging via callbacks: the `early_stopping_rounds`
# and `verbose_eval` keywords of lgb.train are not available in LightGBM 4.
# `log_evaluation` is the newer name of `print_evaluation`; use whichever exists.
log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation
module = lgb.train(params,
                   lgb_train,
                   num_boost_round=5000,
                   valid_sets=[lgb_eval],
                   callbacks=[lgb.early_stopping(100), log_every(100)])

# Validation score on the original sales scale (inverse of log2(x) + 1).
val = module.predict(x_test, num_iteration=module.best_iteration)
val = 2 ** (val - 1)
y_true = 2 ** (y_test - 1)
nrmse = metrics(y_true, val, val_model)

# Refit on train + validation rows with a slightly larger round budget.
iters = module.best_iteration + 100
train_all = np.vstack((x_train, x_test))
label_all = np.hstack((y_train, y_test))
lgb_data = lgb.Dataset(train_all, label_all, categorical_feature=categorial_name)
model = lgb.train(params, lgb_data, num_boost_round=iters)

# Predict from a plain array, the same input type the model was trained on.
predict = model.predict(test_feature.values)
predict = 2 ** (predict - 1)
print(predict)

# `metrics` returns 1 - mean NRMSE (higher is better), not an RMSE.
print('model train over, validation score (1 - mean NRMSE):', nrmse)
submit = submit.assign(forecastVolum=predict)
# Replace, rather than duplicate, this month's rows if the cell is re-run.
if not test_prob_collection.empty:
    same_target = ((test_prob_collection['regYear'] == submit['regYear'].iloc[0])
                   & (test_prob_collection['regMonth'] == submit['regMonth'].iloc[0]))
    test_prob_collection = test_prob_collection[~same_target]
test_prob_collection = pd.concat([test_prob_collection, submit], axis=0, ignore_index=True)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's rmse: 0.67011
[200]	valid_0's rmse: 0.642214
[300]	valid_0's rmse: 0.642017
[400]	valid_0's rmse: 0.632225
Early stopping, best iteration is:
[360]	valid_0's rmse: 0.631175
0.4008416718770208
0.2625600761820513
0.36931393180376465
0.13038779458818095
0.36837418823763113
1.06934505409548
0.17395422815370856
0.2125370836645495
0.4010984247740295
0.15951282998036076
0.5042040593581459
0.20817938050019824
0.17007036539674703
0.4762809378842139
0.35663846145029643
0.2736807995027956
0.13587220283132942
0.24805838092304067
0.22007602665495413
0.5245971249998665
0.26421761593430076
0.3146734014202273
0.1847290862338108
0.3667345567553267
0.42554240420665174
0.518637884609816
0.30139366964392716
0.22198420900631077
0.17294340966059743
0.47064043600841293
0.20382256801171791
0.15569665044472228
0.4210336883284066
0.22791233527804056
1.0007138661970951
0.6912741037651658
0.15127812070384683
0.5534039025340067
0.2803756

In [16]:
# Sanity check: (rows, columns) of the most recently built training matrix.
train_feature.shape

(17160, 59)

In [17]:
# Preview the accumulated per-month forecasts.
test_prob_collection.head()

Unnamed: 0,province,adcode,model,regYear,regMonth,forecastVolum
0,浙江,330000,f8a6975573af1b33,2018,1,879.539082
1,福建,350000,f8a6975573af1b33,2018,1,470.442071
2,四川,510000,f8a6975573af1b33,2018,1,658.857897
3,陕西,610000,f8a6975573af1b33,2018,1,167.458367
4,安徽,340000,f8a6975573af1b33,2018,1,653.136139


In [18]:
# Attach the forecasts to the evaluation template and round them to whole units.
merge_keys = ['province', 'adcode', 'model', 'regYear', 'regMonth']
test_prob_collection.index = range(len(test_prob_collection))

# Each key must carry exactly one forecast, otherwise the left merge would
# silently duplicate evaluation rows.
assert not test_prob_collection.duplicated(merge_keys).any(), 'duplicate forecasts for the same key'

# Drop forecast columns already present (the template's own 'forecastVolum', or
# leftovers from a previous run of this cell) so the merge yields a single
# 'forecastVolum' column and the cell can be re-run safely.
stale_cols = [col for col in evaluation_public.columns if col.startswith('forecastVolum')]
evaluation_public = evaluation_public.drop(columns=stale_cols).merge(
    test_prob_collection, on=merge_keys, how='left')

# Fail with a clear message when an evaluation row has no forecast.
missing = int(evaluation_public['forecastVolum'].isnull().sum())
assert missing == 0, '%d evaluation rows have no forecast' % missing

evaluation_public['forecastVolum'] = evaluation_public['forecastVolum'].round().astype(int)
evaluation_public['forecastVolum'].mean()

468.8901515151515

In [19]:
# Write the submission file: only the id and the rounded forecast, without the row index.
evaluation_public.to_csv('../../sub/sub_method_one.csv',
                         columns=['id', 'forecastVolum'],
                         encoding='utf-8',
                         index=False)

In [20]:
# Mean forecast per target month. The earlier describe() call was removed: its
# result was discarded, since only the last expression of a cell is displayed.
evaluation_public.groupby(['regMonth'], as_index=False)['forecastVolum'].mean()

Unnamed: 0,regMonth,forecastVolum
0,1,557.691667
1,2,363.236364
2,3,475.643939
3,4,478.988636
