In [1]:
from time import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error as mse
import lightgbm as lgb

from tqdm import tqdm
import gc
import warnings
# Ignore FutureWarning specifically.
warnings.simplefilter(action='ignore', category=FutureWarning)
# Blanket filter: ignores every warning category, which makes the line above redundant.
# NOTE(review): this also hides pandas' SettingWithCopyWarning for the rest of the notebook.
warnings.filterwarnings('ignore')

# 读取数据

In [2]:
# Directory holding the input CSV files (relative to the notebook's working directory).
data_path = '../../input/Round1/'

# Read the three input tables with identical read options.
train_sales_data, train_search_data, test_data = [
    pd.read_csv(data_path + file_name, encoding='utf-8')
    for file_name in ('train_sales_data.csv', 'train_search_data.csv', 'evaluation_public.csv')
]

# Stack the sales rows and the evaluation rows, then left-join the search table on the shared keys.
data = pd.concat([train_sales_data, test_data], ignore_index=True).merge(
    train_search_data,
    on=['province', 'adcode', 'model', 'regYear', 'regMonth'],
    how='left',
)

# Expose the target as `label`, then drop the raw target column and the submission column.
data['label'] = data['salesVolume']
data = data.drop(columns=['salesVolume', 'forecastVolum'])
gc.collect()

0

In [3]:
# Rows without an id (presumably the training rows -- confirm) get id 0 so the column can be int.
data['id'] = data['id'].fillna(0).astype(int)

# Fill bodyType on every row from the model -> bodyType pairs found in the sales table.
model_to_body_type = train_sales_data.drop_duplicates('model').set_index('model')['bodyType']
data['bodyType'] = data['model'].map(model_to_body_type)

# Label encoding: replace each distinct value by an integer code in order of first appearance.
for column in ['bodyType', 'model']:
    codes = dict(zip(data[column].unique(), range(data[column].nunique())))
    data[column] = data[column].map(codes)

In [4]:
data['seq'] = (data['regYear'] - 2016) * 12 + data['regMonth'] # running month index: January 2016 -> 1, January 2017 -> 13, ...

In [5]:
# Sanity check: distinct month indices present in the combined frame.
data['seq'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

In [6]:
# Sanity check: distinct adcode values present in the combined frame.
data['adcode'].unique()

array([310000, 530000, 150000, 110000, 510000, 340000, 370000, 140000,
       440000, 450000, 320000, 360000, 130000, 410000, 330000, 420000,
       430000, 350000, 210000, 500000, 610000, 230000], dtype=int64)

In [7]:
# Sanity check: integer codes assigned to `model` by the label-encoding step.
data['model'].unique()

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59], dtype=int64)

In [8]:
# Sanity check: integer codes assigned to `bodyType` by the label-encoding step.
data['bodyType'].unique()

array([0, 1, 2, 3], dtype=int64)

In [9]:
# Composite integer keys built by arithmetic packing.
# adcode + model identifies a (province, model) pair; it is collision-free as long as every
# adcode is a multiple of 10000 and every model code is below 10000 (the displayed values
# above are multiples of 10000 and codes 0..59).
data['model_adcode'] = data['adcode'] + data['model']
# Two trailing decimal digits hold seq; seq is at most 28 here and the shift features add at
# most 12, so the packed value never spills into the model_adcode digits.
data['model_adcode_seq'] = data['model_adcode'] * 100 + data['seq']

# (adcode, month) key used for the per-adcode grouped shift features.
data['adcode_seq'] = data['adcode'] * 100 + data['seq']
# (model, month) key used for the per-model grouped shift features.
data['model_seq'] = data['model'] * 10000 + data['seq']

In [10]:
# Train on log1p(label); the submission cell inverts this with np.expm1.
# NOTE: this cell is not idempotent -- running it twice applies log1p twice.
data['label'] = np.log1p(data['label'])

In [11]:
# Column inventory of the combined frame before feature engineering.
data.columns

Index(['province', 'adcode', 'model', 'bodyType', 'regYear', 'regMonth', 'id',
       'popularity', 'label', 'seq', 'model_adcode', 'model_adcode_seq',
       'adcode_seq', 'model_seq'],
      dtype='object')

# 评分函数

In [12]:
def metrics(y_true, y_pred, model):
    """Score predictions as 1 minus the mean per-model normalised RMSE.

    Args:
        y_true: True sales volumes, one per row.
        y_pred: Predicted sales volumes, one per row; negative values are
            replaced by their absolute value before scoring.
        model: Model identifier for each row; rows are grouped by this value.

    Returns:
        ``1 - mean(NRMSE_m)`` where, for each model ``m``,
        ``NRMSE_m = sqrt(mean((true - pred) ** 2)) / mean(true)``.

    Side effects:
        Prints each model's NRMSE while iterating.
    """
    frame = pd.DataFrame({'model': model, 'salesVolume': y_true, 'label': y_pred})
    frame['label'] = frame['label'].abs()

    nrmse_total = 0
    n_models = 0
    for _, group in frame.groupby('model'):
        actual = group['salesVolume'].values
        predicted = group['label'].values
        squared_error = (actual - predicted) ** 2
        nrmse = np.sqrt(np.sum(squared_error) / len(actual)) / np.mean(actual)
        nrmse_total += nrmse
        n_models += 1
        print(nrmse)
    return 1 - (nrmse_total / n_models)

# 特征工程

## 获取时移特征

In [13]:
def get_time_shift_feature(Data, month):
    """Build lag features of `label` (and `popularity`) per model/adcode series.

    For every lag ``j`` in 1..12 the value observed ``j`` months earlier in the
    same ``model_adcode`` series is looked up through the packed key
    ``model_adcode_seq`` (``key + j`` of the source row equals ``key`` of the
    target row). Only rows whose ``label`` is not null serve as lookup sources.

    Args:
        Data: DataFrame containing at least the columns 'adcode', 'bodyType',
            'id', 'model', 'regMonth', 'regYear', 'label', 'seq',
            'model_adcode', 'model_adcode_seq', 'adcode_seq', 'model_seq' and
            'popularity'. It is not modified.
        month: Forecast step. The popularity lags are only produced when
            ``month == 1``.

    Returns:
        A new DataFrame holding the columns listed above (any other input
        column, e.g. 'province', is dropped) followed by ``shift_label_j`` and,
        for ``month == 1``, ``shift_popularity_j`` for ``j`` in 1..12.
        ``model_adcode_seq`` must be unique among the non-null-label rows,
        otherwise the lookup raises.
    """
    base_columns = ['adcode', 'bodyType', 'id', 'model', 'regMonth', 'regYear', 'label',
                    'seq', 'model_adcode', 'model_adcode_seq', 'adcode_seq', 'model_seq',
                    'popularity']
    # Explicit copy: the new columns are written to an independent frame, never to a
    # selection of the caller's DataFrame.
    data = Data[base_columns].copy()

    # Rows with a known label are the only lookup sources; this set does not change
    # inside the loop, so compute it once.
    observed = data[data['label'].notnull()]
    observed_keys = observed['model_adcode_seq'].values

    for j in range(1, 13):
        # Index the source values by the key of the row j months later.
        shifted_keys = observed_keys + j
        label_lookup = pd.Series(observed['label'].values, index=shifted_keys)
        data['shift_label_{}'.format(j)] = data['model_adcode_seq'].map(label_lookup)
        # Popularity lags are only used when predicting the first month.
        if month == 1:
            popularity_lookup = pd.Series(observed['popularity'].values, index=shifted_keys)
            data['shift_popularity_{}'.format(j)] = data['model_adcode_seq'].map(popularity_lookup)
    return data

## 获取组合时移特征

In [14]:
def get_group_shift_feature(data, group_feature):
    """Add lagged group totals of `label` for a packed (entity, month) key.

    `label` is summed per distinct value of ``group_feature`` (a group total is
    NaN if any member label is NaN). For every lag ``j`` in 1..12 the total of
    the group whose key is ``j`` smaller is attached as
    ``'<group_feature>_shift_<j>'``. Groups with a NaN total are never used as
    lookup sources.

    Args:
        data: DataFrame with a `label` column and the ``group_feature`` column.
            It is not modified.
        group_feature: Name of the packed key column, e.g. 'adcode_seq' or
            'model_seq'.

    Returns:
        ``data`` left-merged with the 12 shift columns (the merge produces a
        fresh frame with a new default index).
    """
    # One row per group key with its label total; skipna=False keeps groups that
    # contain unknown labels as NaN instead of silently summing the known part.
    totals = (
        data.groupby(by=[group_feature])['label']
        .apply(lambda labels: labels.sum(skipna=False))
        .reset_index()
    )

    observed = totals[totals['label'].notnull()]
    observed_keys = observed[group_feature].values
    observed_totals = observed['label'].values

    shifted = totals[[group_feature]].copy()
    for j in range(1, 13):
        # Index each known total by the key of the group j months later.
        lookup = pd.Series(observed_totals, index=observed_keys + j)
        shifted['{}_shift_{}'.format(group_feature, j)] = shifted[group_feature].map(lookup)

    return pd.merge(data, shifted, on=[group_feature], how='left')

## 获取历史销量特征

In [15]:
def calculate_sum_mean(feature, month, frame=None):
    """Add the sum and mean of the first `month` lag columns of one feature family.

    Args:
        feature: Column-name template with one ``{}`` placeholder, e.g.
            ``'shift_label_{}'``. Columns ``feature.format(1)`` ..
            ``feature.format(month)`` must already exist.
        month: Number of lags to aggregate (the window size).
        frame: DataFrame to modify in place. Defaults to the notebook-global
            ``data``, which is what the original implementation always used.

    Returns:
        None. Two columns are written to ``frame``:
        ``feature.format('sum_<month>')`` and ``feature.format('mean_<month>')``.
        A NaN in any aggregated lag makes both results NaN for that row.
    """
    if frame is None:
        # Backward-compatible default: operate on the notebook-level frame.
        frame = data

    sum_column = feature.format('sum_{}'.format(month))
    mean_column = feature.format('mean_{}'.format(month))

    # Accumulate lag 1..month in order.
    frame[sum_column] = 0
    for lag in range(1, month + 1):
        frame[sum_column] += frame[feature.format(lag)]

    frame[mean_column] = frame[sum_column] / month


def get_history_label_feature(month):
    """Add rolling sum/mean features over the lag columns for several window sizes.

    For each window size in (2, 3, 4, 6, 12) this calls ``calculate_sum_mean``
    on every lag-feature family, e.g. window 3 on ``'shift_label_{}'`` yields
    ``shift_label_sum_3 = shift_label_1 + shift_label_2 + shift_label_3`` and
    ``shift_label_mean_3``. The work is delegated to ``calculate_sum_mean``,
    which writes the new columns.

    Args:
        month: Forecast step (1..4). The popularity family is only aggregated
            when ``month == 1``, matching where the popularity lags are built.
    """
    # Families are processed in this order for every window; the order fixes the
    # order in which the new columns are appended.
    templates = ['shift_label_{}']
    if month == 1:
        # Popularity lags only exist when predicting the first month.
        templates.append('shift_popularity_{}')
    templates.append('adcode_seq_shift_{}')
    templates.append('model_seq_shift_{}')

    for window in tqdm([2, 3, 4, 6, 12]):
        for template in templates:
            calculate_sum_mean(template, window)

# 定义lgb模型

In [16]:
# Hyper-parameters of the single LightGBM regressor that is re-fitted for every
# (bodyType, forecast month) combination.
lgb_params = dict(
    objective='mse',
    learning_rate=0.05,
    n_estimators=2000,
    # Tree shape
    num_leaves=2**5-1,
    max_depth=-1,
    min_child_samples=5,
    # Regularisation
    reg_alpha=0.25,
    reg_lambda=0.25,
    # Row / column sampling.
    # NOTE(review): LightGBM documents that row subsampling only takes effect when
    # subsample_freq is non-zero -- confirm whether bagging was intended here.
    subsample=0.9,
    colsample_bytree=0.7,
    seed=2020,
)
lgb_model = lgb.LGBMRegressor(**lgb_params)

# 分车型预测单月销量，再预测下月

In [17]:
# Recursive forecasting: for each bodyType, predict month 1, write the predictions back
# into `label`, rebuild the lag features, then predict the next month (4 months in total).
# NOTE: `data` must stay a notebook-level name inside the loop because
# get_history_label_feature() / calculate_sum_mean() write their columns to the global `data`.
# NOTE: this cell rebinds `data`, so re-running it without re-running the cells above
# starts from the last bodyType subset instead of the full frame.
Data = data.copy()
predictions = []
for k in range(4):
    data = Data[Data['bodyType'] == k]
    for i in range(1, 5):
        print('=================predict bodytype{0} month {1}=================='.format(k, i))

        # Rebuild every lag feature from the current labels (including earlier predictions).
        data = get_time_shift_feature(data, i)
        data = get_group_shift_feature(data, 'adcode_seq')
        data = get_group_shift_feature(data, 'model_seq')
        get_history_label_feature(i)

        # Numeric features: every engineered column, i.e. everything except the raw columns.
        dels = ['regMonth', 'regYear', 'adcode', 'bodyType', 'id', 'model', 'province', 'label', 'seq', 'model_adcode',
                'model_adcode_seq', 'adcode_seq', 'model_seq', 'popularity']
        number_feature = [column for column in data.columns if column not in dels]

        print(len(number_feature))

        # Categorical features
        category_feature = ['regMonth', 'regYear', 'adcode', 'bodyType', 'model', 'model_adcode_seq', 'model_adcode']
        features = number_feature + category_feature

        # Rows to predict: month index 24 + i. Copy so that adding the prediction column
        # below writes to an independent frame rather than to a filtered selection of `data`.
        predict_data = data[data['seq'] == 24 + i].copy()
        # Training rows: month index 13 up to 23 + i, so the window grows by one month per
        # step and includes months whose labels were predicted in earlier steps.
        train_idx = data['seq'].between(13, 23 + i)

        train_y = data.loc[train_idx, 'label']
        train_x = data.loc[train_idx, features]

        print("train LGB model\n")
        lgb_model.fit(train_x, train_y, categorical_feature=category_feature)
        predict_data['lgb_pred_label'] = lgb_model.predict(predict_data[features])
        print('month {} train ending\n'.format(i))

        predict_data = predict_data.sort_values(by=['id'])

        # Feed the predictions back as labels (matched by id) so the next step can train
        # on them and use them as lag sources.
        data['transform_label'] = data['id'].map(predict_data.set_index('id')['lgb_pred_label'])
        data['label'] = data['label'].fillna(data['transform_label'])
        del data['transform_label']

        # Collect this step's predictions; they are concatenated once after the loops.
        predictions.append(predict_data)

# All predictions of every bodyType and month in one frame.
datas = pd.concat(predictions, ignore_index=True)



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 55.70it/s]


88
train LGB model

month 1 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 74.85it/s]


66
train LGB model

month 2 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 73.73it/s]


66
train LGB model

month 3 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 74.87it/s]


66
train LGB model

month 4 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 55.72it/s]


88
train LGB model

month 1 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 78.34it/s]


66
train LGB model

month 2 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 70.65it/s]


66
train LGB model

month 3 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 79.58it/s]


66
train LGB model

month 4 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 92.82it/s]


88
train LGB model

month 1 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 118.86it/s]


66
train LGB model

month 2 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 122.16it/s]


66
train LGB model

month 3 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 116.59it/s]


66
train LGB model

month 4 train ending



100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 92.77it/s]


88
train LGB model

month 1 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 125.33it/s]


66
train LGB model

month 2 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 123.50it/s]


66
train LGB model

month 3 train ending



100%|███████████████████████████████████████████████████████████████████████████████████| 5/5 [00:00<00:00, 122.39it/s]


66
train LGB model

month 4 train ending



# Submission

In [18]:
datas = datas.sort_values(by=['id'])
# Invert the log1p transform that was applied to the training label.
datas['label'] = np.expm1(datas['lgb_pred_label'])
# Floor negative forecasts at zero, then round to whole units for the submission.
datas['forecastVolum'] = datas['label'].clip(lower=0).round().astype(int)
datas[['id', 'forecastVolum']].to_csv('../../sub/lgb_one.csv', index=False)

In [19]:
# NOTE: only the last expression of a cell is rendered, so this overall mean is computed
# but never shown in the output below.
datas['forecastVolum'].mean()
# Mean forecast per value of regMonth among the predicted rows.
datas.groupby(['regMonth'])['forecastVolum'].mean()

regMonth
1    486.823485
2    326.763636
3    491.113636
4    482.550000
Name: forecastVolum, dtype: float64