In [1]:
import time
import math
import warnings
import numpy as np
import pandas as pd
import lightgbm as lgb
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

In [2]:
# Run configuration flags.
# Original note: a smoothed log transform is applied to the sales volume
# (presumably switched on by `log = 1` -- TODO confirm against the training cells).
is_get_82_model = 0
lg = 2
log = 1

In [3]:
# Round-1 sales history; below it is only used to tell which models are new in round 2.
pre_train_sale = pd.read_csv('../../input/Round1/train_sales_data.csv')
# Round-2 training sales (the main modelling table).
input_data  = pd.read_csv('../../input/Round2/train_sales_data.csv')
# Round-2 evaluation rows whose volume has to be forecast.
final_data  = pd.read_csv('../../input/Round2/evaluation_public.csv')
# Round-2 search popularity per province / model / month.
search_data = pd.read_csv('../../input/Round2/train_search_data.csv')

In [4]:
# Flag the car models that are new in round 2, i.e. absent from the round-1 sales data.
# `isin` does one hashed membership test per row instead of a linear
# `list.count` scan of every known model for every row.
pre_model = list(set(pre_train_sale['model']))
input_data['new_model'] = (~input_data['model'].isin(pre_model)).astype(int)
final_data['new_model'] = (~final_data['model'].isin(pre_model)).astype(int)

In [5]:
# Preview the first rows of the round-1 sales table.
pre_train_sale.head()

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610


In [6]:
# Preview the round-2 training table, now carrying the `new_model` flag.
input_data.head()

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,new_model
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292,0
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466,0
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257,0
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408,0
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610,0


In [7]:
# Preview the evaluation table (`forecastVolum` is empty; it is the value to predict).
final_data.head()

Unnamed: 0,id,province,adcode,model,regYear,regMonth,forecastVolum,new_model
0,1,上海,310000,3c974920a76ac9c1,2018,1,,0
1,2,云南,530000,3c974920a76ac9c1,2018,1,,0
2,3,内蒙古,150000,3c974920a76ac9c1,2018,1,,0
3,4,北京,110000,3c974920a76ac9c1,2018,1,,0
4,5,四川,510000,3c974920a76ac9c1,2018,1,,0


In [8]:
# final_data has no bodyType column; look it up from the training set by model.
# (The evaluation rows are appended to the training data in the next cell, so
# bodyType has to be filled in first.)
# drop_duplicates keeps the first row per model, giving a model -> bodyType Series.
tmp = input_data.drop_duplicates('model').set_index('model')['bodyType']
final_data['bodyType'] = final_data['model'].map(tmp)

In [9]:
# Stack the evaluation rows underneath the training rows so that features can
# be built over one continuous table.
input_data = pd.concat([input_data, final_data], axis=0, ignore_index=True)
# Training rows have no `id`; give them 0. The empty forecast column is not needed.
input_data = (
    input_data
    .assign(id=input_data['id'].fillna(0).astype(int))
    .drop(columns='forecastVolum')
)
input_data.head()

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,new_model,id
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292.0,0,0
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466.0,0,0
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257.0,0,0
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408.0,0,0
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610.0,0,0


In [10]:
# Preview the search-popularity table.
search_data.head()

Unnamed: 0,province,adcode,model,regYear,regMonth,popularity
0,河南,410000,17bc272c93f19d56,2016,1,19036
1,河南,410000,17bc272c93f19d56,2016,2,17856
2,河南,410000,17bc272c93f19d56,2016,3,12517
3,河南,410000,17bc272c93f19d56,2016,4,9700
4,河南,410000,17bc272c93f19d56,2016,5,12780


In [11]:
# Attach search popularity to each province / model / month row. A left join
# keeps every sales row; rows without search data get NaN popularity.
input_data = input_data.merge(
    search_data,
    on=['province', 'adcode', 'model', 'regYear', 'regMonth'],
    how='left',
)
input_data.head()

Unnamed: 0,province,adcode,model,bodyType,regYear,regMonth,salesVolume,new_model,id,popularity
0,上海,310000,3c974920a76ac9c1,SUV,2016,1,292.0,0,0,1479.0
1,云南,530000,3c974920a76ac9c1,SUV,2016,1,466.0,0,0,1594.0
2,内蒙古,150000,3c974920a76ac9c1,SUV,2016,1,257.0,0,0,1479.0
3,北京,110000,3c974920a76ac9c1,SUV,2016,1,408.0,0,0,2370.0
4,四川,510000,3c974920a76ac9c1,SUV,2016,1,610.0,0,0,3562.0


In [12]:
def prepare(data):
    """Label-encode the categorical columns and derive the time index.

    The frame is modified in place and also returned.

    Steps:
      * ``model``, ``bodyType`` and ``province`` are replaced by integer codes
        ``model_id``, ``body_id`` and ``pro_id``, numbered in order of first
        appearance. Missing values stay missing.
      * ``adcode`` is dropped together with the three raw categorical columns.
      * ``regYear`` / ``regMonth`` / ``salesVolume`` are renamed to
        ``sales_year`` / ``month_id`` / ``label``.
      * ``time_id`` is the month counter starting at 1 for January 2016.
      * ``salesVolume`` is re-created as a copy of ``label``.

    Args:
        data: DataFrame with columns ``model``, ``bodyType``, ``province``,
            ``adcode``, ``regYear``, ``regMonth`` and ``salesVolume``.

    Returns:
        The same DataFrame object, transformed.
    """
    def _label_encode(column):
        # Encode the non-null categories only. The previous version zipped
        # unique() (which includes NaN) with range(nunique()) (which does
        # not), so a column containing NaN left its last category without a
        # code.
        categories = column.dropna().unique()
        return column.map(dict(zip(categories, range(len(categories)))))

    data['model_id'] = _label_encode(data['model'])
    data['body_id'] = _label_encode(data['bodyType'])
    data['pro_id'] = _label_encode(data['province'])
    data.drop(['model', 'bodyType', 'province', 'adcode'], axis=1, inplace=True)

    data.rename(columns={'regYear': 'sales_year', 'regMonth': 'month_id', 'salesVolume': 'label'}, inplace=True)
    # Months elapsed since December 2015, so January 2016 -> 1.
    data['time_id'] = (data['sales_year'] - 2016) * 12 + data['month_id']
    # Keep an untouched copy of the target under its original name.
    data['salesVolume'] = data['label']
    return data

In [13]:
# Encode categoricals and add time_id; prepare() also mutates the frame it is given.
input_data = prepare(input_data)
input_data.head()

Unnamed: 0,sales_year,month_id,label,new_model,id,popularity,model_id,body_id,pro_id,time_id,salesVolume
0,2016,1,292.0,0,0,1479.0,0,0,0,1,292.0
1,2016,1,466.0,0,0,1594.0,0,0,1,1,466.0
2,2016,1,257.0,0,0,1479.0,0,0,2,1,257.0
3,2016,1,408.0,0,0,2370.0,0,0,3,1,408.0
4,2016,1,610.0,0,0,3562.0,0,0,4,1,610.0


In [14]:
# List the distinct month indices present after prepare().
input_data['time_id'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28], dtype=int64)

In [None]:
def _attach_lag_columns(data, source_col, name_suffix, offset, max_time_id, n_lags):
    """Left-merge ``last_<k>_<name_suffix>`` columns for k = 1..n_lags onto ``data``.

    Column ``last_<k>_<name_suffix>`` at ``time_id == t`` holds ``source_col``
    of the same (pro_id, model_id, body_id) at ``time_id == t - k - offset``.
    """
    keys = ['time_id', 'pro_id', 'model_id', 'body_id']
    for last in range(1, n_lags + 1):
        # Only the keys and the source column are needed; copying the whole
        # (ever-widening) frame on every iteration was wasted work.
        lagged = data[[source_col] + keys].copy()
        # Adding to time_id is a shift: the value becomes visible `last + offset`
        # months later. Rows shifted past the last known month are dropped.
        lagged['time_id'] = lagged['time_id'] + last + offset
        lagged = lagged[lagged['time_id'] <= max_time_id]
        lagged = lagged.rename(columns={source_col: 'last_{0}_{1}'.format(last, name_suffix)})
        data = pd.merge(data, lagged, how='left', on=keys)
    return data


def _attach_group_agg(data, keys, value_col, aggfunc, out_col):
    """Aggregate ``value_col`` per ``keys`` and left-merge the result back as ``out_col``."""
    pivot = pd.pivot_table(data, index=keys, values=value_col, aggfunc=aggfunc)
    pivot = pivot.rename(columns={value_col: out_col}).reset_index()
    return pd.merge(data, pivot, on=keys, how='left')


def _ratio_or_zero(numerator, denominator):
    """Element-wise ``numerator / denominator``, forced to 0 where the denominator is 0."""
    return (numerator / denominator).where(denominator != 0, 0)


def get_stat_feature(df_, month):
    """Build lag, trend, calendar and share features for one forecast month.

    Args:
        df_: DataFrame with at least the columns ``label``, ``popularity``,
            ``time_id``, ``pro_id``, ``model_id``, ``body_id``, ``month_id``
            and ``sales_year``. It is not modified; a copy is used.
        month: time_id being forecast (the original note lists 25, 26, 27, 28).
            It only determines an extra shift added to every lag:
            0 for month 25 or 26, 1 for month 27, 2 for month 28.

    Returns:
        ``(data, feature_names)``: the copy of ``df_`` with all feature columns
        added, and the list of column names selected as model features.

    Naming used in the column names: ``jidu`` = quarter, ``huanbi`` =
    month-over-month ratio, ``chunjie`` = Spring Festival, ``yanhai`` = coastal.
    """
    data = df_.copy()
    last_time_id = 28  # shifted rows beyond this time_id are discarded

    # Raw value: 0 for month 25/26, 2 for month 27, 3 for month 28 ...
    start = int((month - 24) / 3) * 2
    start += int((month - 24) / 4)
    # ... final value: 0 for month 25/26, 1 for month 27, 2 for month 28.
    start = start - 1 if start >= 1 else start

    # --- Historical monthly sales and popularity (lags 1..16) ---------------
    data = _attach_lag_columns(data, 'label', 'sale', start, last_time_id, 16)
    data = _attach_lag_columns(data, 'popularity', 'popularity', start, last_time_id, 16)
    stat_feat = ['last_{0}_sale'.format(k) for k in range(1, 7)]
    stat_feat += ['last_{0}_popularity'.format(k)
                  for k in range(1, 17) if k <= 6 or 11 <= k <= 13]

    # --- Half-year / quarter sales statistics -------------------------------
    # Explicit column lists instead of .loc label slices, so the result does
    # not depend on the lag columns being adjacent in the frame.
    half_year = data[['last_{0}_sale'.format(k) for k in range(1, 7)]]
    recent_quarter = data[['last_{0}_sale'.format(k) for k in range(1, 4)]]
    prior_quarter = data[['last_{0}_sale'.format(k) for k in range(4, 7)]]
    data['1_6_sum'] = half_year.sum(axis=1)
    data['1_6_mea'] = half_year.mean(axis=1)
    data['1_6_max'] = half_year.max(axis=1)
    data['1_6_min'] = half_year.min(axis=1)
    data['jidu_1_3_sum'] = recent_quarter.sum(axis=1)
    data['jidu_4_6_sum'] = prior_quarter.sum(axis=1)
    data['jidu_1_3_mean'] = recent_quarter.mean(axis=1)
    data['jidu_4_6_mean'] = prior_quarter.mean(axis=1)
    stat_feat += ['1_6_sum', '1_6_mea', '1_6_max', '1_6_min',
                  'jidu_1_3_sum', 'jidu_4_6_sum', 'jidu_1_3_mean', 'jidu_4_6_mean']

    # --- Trend features: differences between lags ---------------------------
    for a, b in [(1, 2), (1, 3), (2, 3), (2, 4), (3, 4), (3, 5)]:
        data['{0}_{1}_diff'.format(a, b)] = (data['last_{0}_sale'.format(a)]
                                             - data['last_{0}_sale'.format(b)])
    data['jidu_1_2_diff'] = data['jidu_1_3_sum'] - data['jidu_4_6_sum']
    stat_feat += ['1_2_diff', '1_3_diff', '2_3_diff', '2_4_diff', '3_4_diff', '3_5_diff',
                  'jidu_1_2_diff']

    # --- Spring Festival months and coastal flag ----------------------------
    # pro_id codes treated as coastal; presumably tied to the encoding order
    # produced upstream -- verify if the province encoding ever changes.
    yanhaicity = [1, 2, 5, 7, 9, 13, 16, 17]
    data['is_yanhai'] = data['pro_id'].isin(yanhaicity).astype(int)
    data['is_chunjie'] = data['time_id'].isin([2, 13, 26]).astype(int)
    data['is_chunjie_before'] = data['time_id'].isin([1, 12, 25]).astype(int)
    data['is_chunjie_late'] = data['time_id'].isin([3, 14, 27]).astype(int)
    stat_feat += ['is_chunjie', 'is_chunjie_before', 'is_chunjie_late', 'is_yanhai']

    # --- Aggregates of the lag-1 minus lag-2 sales difference ---------------
    # Per model, per province, then per (province, model) as sum and mean.
    data = _attach_group_agg(data, ['model_id'], '1_2_diff', 'sum', 'model_1_2_diff_sum')
    data = _attach_group_agg(data, ['pro_id'], '1_2_diff', 'sum', 'pro_1_2_diff_sum')
    data = _attach_group_agg(data, ['pro_id', 'model_id'], '1_2_diff', 'sum',
                             'model_pro_1_2_diff_sum')
    data = _attach_group_agg(data, ['pro_id', 'model_id'], '1_2_diff', 'mean',
                             'model_pro_1_2_diff_mean')
    stat_feat += ['model_1_2_diff_sum', 'pro_1_2_diff_sum',
                  'model_pro_1_2_diff_sum', 'model_pro_1_2_diff_mean']

    # --- Calendar features --------------------------------------------------
    # Days per calendar month.
    # NOTE(review): February is 28 for every year, including leap year 2016.
    days_in_month = np.array([31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31])
    data['count_month'] = days_in_month[(data['month_id'] - 1).astype(int).to_numpy()]
    # Holiday table: one row per year from 2016, one entry per month. The third
    # row only covers months 1-4, so later 2018 months raise IndexError.
    jiaqibiao = [[11, 12, 8, 10, 10, 9, 10, 8, 9, 13, 8, 9],
                 [12, 9, 8, 11, 10, 8, 10, 8, 8, 14, 8, 10],
                 [9, 11, 9, 11]]
    data['count_jiaqi'] = [jiaqibiao[int(year - 2016)][int(mon - 1)]
                           for year, mon in zip(data['sales_year'], data['month_id'])]
    stat_feat.append('count_month')
    stat_feat.append('count_jiaqi')

    # --- Month-over-month ratios --------------------------------------------
    for a in range(1, 6):
        data['huanbi_{0}_{1}'.format(a, a + 1)] = (data['last_{0}_sale'.format(a)]
                                                   / data['last_{0}_sale'.format(a + 1)])
    # NOTE(review): 'huanbi_4_5' is computed but not selected as a feature;
    # kept as in the original -- confirm whether the omission is intended.
    stat_feat += ['huanbi_1_2', 'huanbi_2_3', 'huanbi_3_4', 'huanbi_5_6']

    # --- Ratio of consecutive month-over-month ratios -----------------------
    data['huanbi_1_2_2_3'] = data['huanbi_1_2'] / data['huanbi_2_3']
    data['huanbi_2_3_3_4'] = data['huanbi_2_3'] / data['huanbi_3_4']
    # NOTE(review): the next two use a difference while the two above use a
    # quotient; kept as in the original -- confirm which one is intended.
    data['huanbi_3_4_4_5'] = data['huanbi_3_4'] - data['huanbi_4_5']
    data['huanbi_4_5_5_6'] = data['huanbi_4_5'] - data['huanbi_5_6']
    stat_feat += ['huanbi_1_2_2_3', 'huanbi_2_3_3_4', 'huanbi_3_4_4_5', 'huanbi_4_5_5_6']

    # --- Share of the (month, province, body type) sales total, and its change
    for i in range(1, 7):
        lag_col = 'last_{0}_sale'.format(i)
        total_col = 'pro_body_last_{0}_sale_sum'.format(i)
        share_col = 'last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i)
        data = _attach_group_agg(data, ['time_id', 'pro_id', 'body_id'], lag_col, 'sum', total_col)
        data[share_col] = _ratio_or_zero(data[lag_col], data[total_col])
        stat_feat.append(share_col)
        if i >= 2:
            prev_share_col = 'last_{0}_sale_ratio_pro_body_last_{0}_sale_sum'.format(i - 1)
            change_col = 'last_{0}_{1}_sale_pro_body_diff'.format(i - 1, i)
            data[change_col] = data[prev_share_col] - data[share_col]
            stat_feat.append(change_col)

    # --- Share of the (month, province) sales total, and its change ---------
    for i in range(1, 7):
        lag_col = 'last_{0}_sale'.format(i)
        total_col = 'pro__last_{0}_sale_sum'.format(i)
        share_col = 'last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i)
        data = _attach_group_agg(data, ['time_id', 'pro_id'], lag_col, 'sum', total_col)
        data[share_col] = _ratio_or_zero(data[lag_col], data[total_col])
        stat_feat.append(share_col)
        if i >= 2:
            prev_share_col = 'last_{0}_sale_ratio_pro_last_{0}_sale_sum'.format(i - 1)
            change_col = 'model_last_{0}_{1}_sale_pro_diff'.format(i - 1, i)
            data[change_col] = data[prev_share_col] - data[share_col]
            stat_feat.append(change_col)

    # --- Relative month-over-month growth of popularity ---------------------
    for a in range(1, 6):
        newer = data['last_{0}_popularity'.format(a)]
        older = data['last_{0}_popularity'.format(a + 1)]
        data['huanbi_{0}_{1}popularity'.format(a, a + 1)] = (newer - older) / older
    stat_feat += ['huanbi_1_2popularity', 'huanbi_2_3popularity', 'huanbi_3_4popularity',
                  'huanbi_4_5popularity', 'huanbi_5_6popularity']

    # --- Share of the (month, model) popularity total -----------------------
    for i in range(1, 7):
        lag_col = 'last_{0}_popularity'.format(i)
        total_col = 'model__last_{0}_popularity_sum'.format(i)
        share_col = 'last_{0}_popularity_ratio_model_last_{0}_popularity_sum'.format(i)
        data = _attach_group_agg(data, ['time_id', 'model_id'], lag_col, 'sum', total_col)
        data[share_col] = _ratio_or_zero(data[lag_col], data[total_col])
        stat_feat.append(share_col)

    # --- Relative change of the share of (month, body type) popularity ------
    # Only the relative changes are selected as features, not the shares.
    for i in range(1, 7):
        lag_col = 'last_{0}_popularity'.format(i)
        total_col = 'body_last_{0}_popularity_sum'.format(i)
        share_col = 'last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i)
        data = _attach_group_agg(data, ['time_id', 'body_id'], lag_col, 'sum', total_col)
        data[share_col] = _ratio_or_zero(data[lag_col], data[total_col])
        if i >= 2:
            prev_share_col = 'last_{0}_popularity_ratio_body_last_{0}_popularity_sum'.format(i - 1)
            change_col = 'last_{0}_{1}_popularity_body_diff'.format(i - 1, i)
            data[change_col] = (data[prev_share_col] - data[share_col]) / data[share_col]
            stat_feat.append(change_col)

    # --- Year-over-year growth ----------------------------------------------
    data["increase16_4"] = (data["last_16_sale"] - data["last_4_sale"]) / data["last_16_sale"]
    # Mean / min of the lag-12 sales across all rows of a model in a month.
    data = _attach_group_agg(data, ["model_id", "time_id"], 'last_12_sale', 'mean', 'mean_province')
    data = _attach_group_agg(data, ["model_id", "time_id"], 'last_12_sale', 'min', 'min_province')
    # Per-model monthly means for lags 1..4 and the same lags one year earlier.
    for i in range(1, 5):
        data = _attach_group_agg(data, ["model_id", "time_id"], 'last_{0}_sale'.format(i),
                                 'mean', 'mean_province_{0}'.format(i))
        data = _attach_group_agg(data, ["model_id", "time_id"], 'last_{0}_sale'.format(i + 12),
                                 'mean', 'mean_province_{0}'.format(i + 12))
    for older, newer in [(14, 2), (13, 1), (16, 4), (15, 3)]:
        older_mean = data["mean_province_{0}".format(older)]
        newer_mean = data["mean_province_{0}".format(newer)]
        data["increase_mean_province_{0}_{1}".format(older, newer)] = (
            (older_mean - newer_mean) / older_mean)
    new_stat_feat = ["mean_province", "min_province", "increase16_4", "increase_mean_province_15_3",
                     "increase_mean_province_16_4", "increase_mean_province_14_2", "increase_mean_province_13_1"]

    return data, stat_feat + new_stat_feat