In [1]:
import sys
import os
import numpy as np
import pandas as pd
import datetime 
import json
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, confusion_matrix, classification_report
from sklearn.manifold import TSNE
from imblearn.over_sampling import *
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import pickle
import warnings
warnings.filterwarnings('ignore')


数据文件存放目录

../../data/

In [2]:
# 根据标签收集正负样本
feat_cols = ['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp','timestamp']
# True，重新生成文件
# 训练集
os.makedirs('./data',exist_ok=True)
if True:
    trn_pos_df, trn_neg_df = [], []
    for fn in tqdm(os.listdir('../../data/Train')):
        with open(f'../../data/Train/{fn}', 'rb') as f:
            pkl_data = pickle.load(f)

        # 抽取内部元素    
        data = pd.DataFrame(pkl_data[0])
        data.columns = feat_cols
        mileage = pkl_data[1]['mileage']
        label = pkl_data[1]['label']
        fid = fn[:-4]
        # fid = fn
        data['pkl_id'] = fid
        data['time_id'] = range(1, 257)
        data['mileage'] = mileage
        data['label'] = label
        if label == '10':
            trn_pos_df.append(data)   
        else:
            trn_neg_df.append(data)

    trn_pos_df = pd.concat(trn_pos_df, axis=0).sort_values(by=['pkl_id','time_id']).reset_index(drop=True)
    trn_neg_df = pd.concat(trn_neg_df, axis=0).sort_values(by=['pkl_id','time_id']).reset_index(drop=True)
    print(trn_pos_df.shape, trn_neg_df.shape)

    # 保存文件
    trn_pos_df.to_csv('./data/Train_pos_df.csv',index=False)
    trn_neg_df.to_csv('./data/Train_neg_df.csv',index=False)

    # 测试集
    tst_df = []
    for fn in tqdm(os.listdir('../../data/Test_A')):
        with open(f'../../data/Test_A/{fn}', 'rb') as f:
            pkl_data = pickle.load(f)

        # 抽取内部元素    
        data = pd.DataFrame(pkl_data[0])
        data.columns = feat_cols
        mileage = pkl_data[1]['mileage']
        fid = fn[:-4]
        # fid = fn
        data['pkl_id'] = fid
        data['time_id'] = range(1, 257)
        data['mileage'] = mileage
        tst_df.append(data)   

    tst_df = pd.concat(tst_df, axis=0).sort_values(by=['pkl_id','time_id']).reset_index(drop=True)
    print(tst_df.shape)

    # 保存文件
    tst_df.to_csv('./data/Test_df.csv',index=False)

In [3]:
# 读取文件
trn_pos_df = pd.read_csv('./data/Train_pos_df.csv')
trn_neg_df = pd.read_csv('./data/Train_neg_df.csv')
trn_df = pd.concat([trn_pos_df, trn_neg_df], axis=0).reset_index(drop=True)
tst_df = pd.read_csv('./data/Test_df.csv')
print(trn_pos_df.shape, trn_neg_df.shape, tst_df.shape)

(1191424, 12) (6076160, 12) (1595904, 11)


In [4]:
trn_df.head()

Unnamed: 0,volt,current,soc,max_single_volt,min_single_volt,max_temp,min_temp,timestamp,pkl_id,time_id,mileage,label
0,156.4,-5.1,34.3,1.74,1.725,204.0,186.0,25150.0,100,1,8365.4,10
1,156.4,-5.1,34.3,1.74,1.725,204.0,186.0,25151.0,100,2,8365.4,10
2,156.4,-5.1,34.3,1.74,1.725,204.0,186.0,25152.0,100,3,8365.4,10
3,156.4,-5.1,34.3,1.74,1.725,204.0,186.0,25153.0,100,4,8365.4,10
4,156.4,-5.1,34.3,1.74,1.725,204.0,186.0,25154.0,100,5,8365.4,10


In [5]:
tst_df.head()

Unnamed: 0,volt,current,soc,max_single_volt,min_single_volt,max_temp,min_temp,timestamp,pkl_id,time_id,mileage
0,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9751.0,0,1,3983.8
1,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9752.0,0,2,3983.8
2,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9753.0,0,3,3983.8
3,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9754.0,0,4,3983.8
4,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9755.0,0,5,3983.8


In [4]:
# 特征工程

# 按里程分类
def mileage2milecls(mileage):
    if mileage == 0:
        milecls = 0
    elif (mileage > 0) & (mileage <= 7500):
        milecls = 1
    elif (mileage > 7500) & (mileage <= 10000):
        milecls = 2
    elif (mileage > 10000) & (mileage <= 15000):
        milecls = 3
    elif (mileage > 15000) & (mileage <= 20000):
        milecls = 4
    elif (mileage > 20000) & (mileage <= 30000):
        milecls = 5
    else:
        milecls = 6
    return milecls

trn_df['milecls'] = trn_df['mileage'].apply(lambda x:mileage2milecls(x))
tst_df['milecls'] = tst_df['mileage'].apply(lambda x:mileage2milecls(x))

In [5]:
trn_df[['pkl_id','milecls']].drop_duplicates().groupby('milecls')['pkl_id'].count(),\
tst_df[['pkl_id','milecls']].drop_duplicates().groupby('milecls')['pkl_id'].count()

(milecls
 0     1272
 1    15978
 2     2623
 3     4752
 4      834
 5     2038
 6      892
 Name: pkl_id, dtype: int64,
 milecls
 0     261
 1    5142
 2     530
 3     301
 Name: pkl_id, dtype: int64)

In [6]:

# True，重新生成文件
# if False:
if True:
    feat_cols = ['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp']
    # 合并数据集
    trn_df['data_type'] = 'train'
    tst_df['data_type'] = 'test'
    all_df = pd.concat([trn_df, tst_df], axis=0)
    del trn_df, tst_df
    gc.collect()

    # 针对每个特征做降维，获取降维特征  
    tsne_feat_dfs = []
    for f in feat_cols:
        print(f'TSNE:{f}')
        mile_dfs = []
        # 针对每个里程类别的数据做降维
        for milecls in tqdm(range(7)):
            # 对每个pkl的特定里程的每个df，做降维
            flat_arr = []
            data_types, pkl_ids, mileclss = [], [], []
            for _, tmp_df in all_df[all_df['milecls']==milecls].groupby(['data_type','pkl_id']):
                flat_arr.append(tmp_df[f].values.reshape(1,-1))
                data_types.append(tmp_df['data_type'].values[0])
                pkl_ids.append(tmp_df['pkl_id'].values[0])
                mileclss.append(tmp_df['milecls'].values[0])
            flat_arr = np.stack(flat_arr, axis=0).reshape(-1,256)
            tsne = TSNE(n_components=2, random_state=0)
            flat_2darr = tsne.fit_transform(flat_arr)
            
            tsne_feat_df = pd.DataFrame({'data_type':data_types, 'pkl_id':pkl_ids, 'milecls':mileclss,
                                        f'{f}_tnse0':flat_2darr[:,0], f'{f}_tnse1':flat_2darr[:,1]})
            mile_dfs.append(tsne_feat_df)

        mile_dfs = pd.concat(mile_dfs, axis=0)
        all_df = pd.merge(all_df, mile_dfs, how='left', on=['data_type','pkl_id','milecls'])
        del mile_dfs, tsne_feat_df, flat_arr
    all_df.to_csv('./data/all_tnse_df.csv', index=False)

TSNE:volt


100%|██████████| 7/7 [03:40<00:00, 31.53s/it]


TSNE:current


100%|██████████| 7/7 [04:15<00:00, 36.49s/it]


TSNE:soc


100%|██████████| 7/7 [03:47<00:00, 32.43s/it]


TSNE:max_single_volt


100%|██████████| 7/7 [03:46<00:00, 32.30s/it]


TSNE:min_single_volt


100%|██████████| 7/7 [03:47<00:00, 32.43s/it]


TSNE:max_temp


100%|██████████| 7/7 [04:27<00:00, 38.23s/it]


TSNE:min_temp


100%|██████████| 7/7 [04:26<00:00, 38.13s/it]


In [7]:
all_df = pd.read_csv('./data/all_tnse_df.csv')
print(all_df.shape)

(8863488, 28)


In [9]:
all_df

Unnamed: 0,volt,current,soc,max_single_volt,min_single_volt,max_temp,min_temp,timestamp,pkl_id,time_id,...,soc_tnse0,soc_tnse1,max_single_volt_tnse0,max_single_volt_tnse1,min_single_volt_tnse0,min_single_volt_tnse1,max_temp_tnse0,max_temp_tnse1,min_temp_tnse0,min_temp_tnse1
0,156.4,-5.1,34.3,1.740,1.725,204.0,186.0,25150.0,100,1,...,-23.852812,-48.70353,-40.310250,-12.045576,-29.253952,-15.916116,7.565139,-4.861164,31.01951,-26.855532
1,156.4,-5.1,34.3,1.740,1.725,204.0,186.0,25151.0,100,2,...,-23.852812,-48.70353,-40.310250,-12.045576,-29.253952,-15.916116,7.565139,-4.861164,31.01951,-26.855532
2,156.4,-5.1,34.3,1.740,1.725,204.0,186.0,25152.0,100,3,...,-23.852812,-48.70353,-40.310250,-12.045576,-29.253952,-15.916116,7.565139,-4.861164,31.01951,-26.855532
3,156.4,-5.1,34.3,1.740,1.725,204.0,186.0,25153.0,100,4,...,-23.852812,-48.70353,-40.310250,-12.045576,-29.253952,-15.916116,7.565139,-4.861164,31.01951,-26.855532
4,156.4,-5.1,34.3,1.740,1.725,204.0,186.0,25154.0,100,5,...,-23.852812,-48.70353,-40.310250,-12.045576,-29.253952,-15.916116,7.565139,-4.861164,31.01951,-26.855532
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8863483,158.4,-4.0,45.7,1.762,1.749,126.0,108.0,6750.0,9999,252,...,34.859850,30.10862,71.695755,43.690560,66.410880,49.200394,-26.859500,52.178574,-34.96859,17.669834
8863484,158.4,-4.0,45.7,1.762,1.748,126.0,108.0,6751.0,9999,253,...,34.859850,30.10862,71.695755,43.690560,66.410880,49.200394,-26.859500,52.178574,-34.96859,17.669834
8863485,158.4,-4.0,45.7,1.762,1.749,126.0,108.0,6753.0,9999,254,...,34.859850,30.10862,71.695755,43.690560,66.410880,49.200394,-26.859500,52.178574,-34.96859,17.669834
8863486,158.4,-4.0,45.7,1.762,1.748,126.0,108.0,6754.0,9999,255,...,34.859850,30.10862,71.695755,43.690560,66.410880,49.200394,-26.859500,52.178574,-34.96859,17.669834


In [10]:
# 特征工程

# 获取时间差
tmp_dfs = []
for i, tmp_df in tqdm(all_df.groupby(['data_type','pkl_id'])):
    tmp_df['ts_interval'] = tmp_df['timestamp'].diff(1)
    tmp_df['ts_interval'] = tmp_df['ts_interval'].fillna(tmp_df['ts_interval'].mean())
    tmp_dfs.append(tmp_df)
all_df = pd.concat(tmp_dfs, axis=0)
# W = UI
all_df['patv'] = all_df['volt']*all_df['current']

100%|██████████| 34623/34623 [00:22<00:00, 1519.16it/s]


In [11]:
print(pd.isnull(all_df[all_df['data_type']=='train'].values).any())

False


In [12]:
all_df.shape

(8863488, 30)

In [13]:
all_df

Unnamed: 0,volt,current,soc,max_single_volt,min_single_volt,max_temp,min_temp,timestamp,pkl_id,time_id,...,max_single_volt_tnse0,max_single_volt_tnse1,min_single_volt_tnse0,min_single_volt_tnse1,max_temp_tnse0,max_temp_tnse1,min_temp_tnse0,min_temp_tnse1,ts_interval,patv
7267584,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9751.0,0,1,...,-12.750956,-42.421700,-32.091976,-28.604914,23.611483,-27.481125,20.249376,-11.155273,2.933333,-746.12
7267585,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9752.0,0,2,...,-12.750956,-42.421700,-32.091976,-28.604914,23.611483,-27.481125,20.249376,-11.155273,1.000000,-746.12
7267586,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9753.0,0,3,...,-12.750956,-42.421700,-32.091976,-28.604914,23.611483,-27.481125,20.249376,-11.155273,1.000000,-746.12
7267587,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9754.0,0,4,...,-12.750956,-42.421700,-32.091976,-28.604914,23.611483,-27.481125,20.249376,-11.155273,1.000000,-746.12
7267588,162.2,-4.6,54.6,1.806,1.794,168.0,156.0,9755.0,0,5,...,-12.750956,-42.421700,-32.091976,-28.604914,23.611483,-27.481125,20.249376,-11.155273,1.000000,-746.12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
864507,148.6,-5.0,27.9,1.652,1.647,162.0,138.0,19522.0,28388,252,...,58.194540,-4.772778,64.890960,4.036720,8.991027,41.649803,-0.994445,26.110640,11.000000,-743.00
864508,148.6,-5.0,27.9,1.652,1.646,162.0,138.0,19532.0,28388,253,...,58.194540,-4.772778,64.890960,4.036720,8.991027,41.649803,-0.994445,26.110640,10.000000,-743.00
864509,148.6,-5.0,27.9,1.652,1.647,162.0,138.0,19542.0,28388,254,...,58.194540,-4.772778,64.890960,4.036720,8.991027,41.649803,-0.994445,26.110640,10.000000,-743.00
864510,148.6,-5.1,27.9,1.652,1.647,162.0,138.0,19553.0,28388,255,...,58.194540,-4.772778,64.890960,4.036720,8.991027,41.649803,-0.994445,26.110640,11.000000,-757.86


In [14]:
all_df.columns

Index(['volt', 'current', 'soc', 'max_single_volt', 'min_single_volt',
       'max_temp', 'min_temp', 'timestamp', 'pkl_id', 'time_id', 'mileage',
       'label', 'milecls', 'data_type', 'volt_tnse0', 'volt_tnse1',
       'current_tnse0', 'current_tnse1', 'soc_tnse0', 'soc_tnse1',
       'max_single_volt_tnse0', 'max_single_volt_tnse1',
       'min_single_volt_tnse0', 'min_single_volt_tnse1', 'max_temp_tnse0',
       'max_temp_tnse1', 'min_temp_tnse0', 'min_temp_tnse1', 'ts_interval',
       'patv'],
      dtype='object')

In [15]:
all_df.to_csv('./data/all_df.csv', index=False)

In [34]:
all_df = pd.read_csv('./data/all_df.csv')
print(all_df.shape)

(8863488, 30)


In [35]:
# 因为内存过大，所以先拆分出数据集
trn_df = all_df[all_df['data_type']=='train']
tst_df = all_df[all_df['data_type']=='test']
del all_df
trn_df.to_csv('./data/trn_df.csv',index=False)
tst_df.to_csv('./data/tst_df.csv',index=False)

In [36]:
#  特征函数定义
# 斜率 一次函数拟合
def apply_slope(df,x_name,y_name):
    coef = np.polyfit(df[x_name],df[y_name],1)
    return coef[0]

def get_slope(df,feats,data_type='sample_type',pkl_id='pkl_id',timestamp='timestamp'):
    dfs = []
    # new_df = pd.DataFrame()
    for feat in feats:
        new_df = df.groupby([data_type,pkl_id])[timestamp,feat].apply(apply_slope,timestamp,feat).reset_index()
        new_df.columns = [f'{feat}_slope' if c not in [data_type,pkl_id] else c for c in list(new_df) ]
        dfs.append(new_df)
    new_df = dfs[0]
    for df in dfs[1:]:
        new_df = pd.merge(new_df,df, how='left', on=[data_type,pkl_id])
    return new_df

# 离群点数量
def apply_detect_outliers(sr):
    q1 = sr.quantile(0.25)
    q3 = sr.quantile(0.75)
    iqr = q3-q1 #Interquartile range
    fence_low  = q1-1.5*iqr
    fence_high = q3+1.5*iqr
    outliers = sr.loc[(sr < fence_low) | (sr > fence_high)]
    counts = len(outliers)
    return counts

def get_outliers(df,feats,data_type='sample_type',pkl_id='pkl_id'):
    dfs = []
    for feat in feats:
        new_df = df.groupby([data_type,pkl_id])[feat].apply(apply_detect_outliers).reset_index()
        new_df.columns = [f'{feat}_outliers' if c not in [data_type,pkl_id] else c for c in list(new_df) ]
        dfs.append(new_df)
    new_df = dfs[0]
    for df in dfs[1:]:
        new_df = pd.merge(new_df,df, how='left', on=[data_type,pkl_id])
    return new_df

In [37]:
trn_df = pd.read_csv('./data/trn_df.csv')

# 获取静态变量
static_cols = ['pkl_id', 'mileage', 'label', 'milecls', 'volt_tnse0',
       'volt_tnse1', 'current_tnse0', 'current_tnse1', 'soc_tnse0',
       'soc_tnse1', 'max_single_volt_tnse0', 'max_single_volt_tnse1',
       'min_single_volt_tnse0', 'min_single_volt_tnse1', 'max_temp_tnse0',
       'max_temp_tnse1', 'min_temp_tnse0', 'min_temp_tnse1']
print('static:')
dim_f_df = trn_df[static_cols].drop_duplicates().reset_index()

# 获取统计特征
feat_cols = ['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp','ts_interval','patv']

print('mean:')
mean_f_df = trn_df.groupby(['data_type','pkl_id'])[feat_cols].mean().reset_index()
mean_f_df.columns = [f'{c}_mean' if c not in ['data_type','pkl_id'] else c for c in list(mean_f_df) ]

print('std:')
std_f_df = trn_df.groupby(['data_type','pkl_id'])[feat_cols].std().reset_index()
std_f_df.columns = [f'{c}_std' if c not in ['data_type','pkl_id'] else c for c in list(std_f_df) ]

print('max:')
max_f_df = trn_df.groupby(['data_type','pkl_id'])[feat_cols].max().reset_index()
max_f_df.columns = [f'{c}_max' if c not in ['data_type','pkl_id'] else c for c in list(max_f_df) ]

print('min:')
min_f_df = trn_df.groupby(['data_type','pkl_id'])[feat_cols].min().reset_index()
min_f_df.columns = [f'{c}_min' if c not in ['data_type','pkl_id'] else c for c in list(min_f_df) ]

print('skew:')
skew_f_df = trn_df.groupby(['data_type','pkl_id'])[feat_cols].skew().reset_index()
skew_f_df.columns = [f'{c}_skew' if c not in ['data_type','pkl_id'] else c for c in list(skew_f_df) ]

print('slope:')
slope_f_df = get_slope(trn_df,feat_cols,data_type='data_type',pkl_id='pkl_id',timestamp='timestamp')

print('outliers:')
outliers_f_df = get_outliers(trn_df,feat_cols,data_type='data_type',pkl_id='pkl_id')

trn_df2 = pd.merge(dim_f_df, mean_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, std_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, max_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, min_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, skew_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, slope_f_df, how='left', on=['pkl_id'])
trn_df2 = pd.merge(trn_df2, outliers_f_df, how='left', on=['pkl_id'])
del trn_df
print(trn_df2.shape)
trn_df2.to_csv('./data/trn_df_final.csv', index=False)
del trn_df2

static:
mean:
std:
max:
min:
skew:
slope:
outliers:
(28389, 89)


In [38]:

tst_df = pd.read_csv('./data/tst_df.csv')

# 获取静态变量
static_cols = ['pkl_id', 'mileage', 'label', 'milecls', 'volt_tnse0',
       'volt_tnse1', 'current_tnse0', 'current_tnse1', 'soc_tnse0',
       'soc_tnse1', 'max_single_volt_tnse0', 'max_single_volt_tnse1',
       'min_single_volt_tnse0', 'min_single_volt_tnse1', 'max_temp_tnse0',
       'max_temp_tnse1', 'min_temp_tnse0', 'min_temp_tnse1']
print('static:')
dim_f_df = tst_df[static_cols].drop_duplicates().reset_index()

# 获取统计特征
feat_cols = ['volt','current','soc','max_single_volt','min_single_volt','max_temp','min_temp','ts_interval','patv']

print('mean:')
mean_f_df = tst_df.groupby(['data_type','pkl_id'])[feat_cols].mean().reset_index()
mean_f_df.columns = [f'{c}_mean' if c not in ['data_type','pkl_id'] else c for c in list(mean_f_df) ]

print('std:')
std_f_df = tst_df.groupby(['data_type','pkl_id'])[feat_cols].std().reset_index()
std_f_df.columns = [f'{c}_std' if c not in ['data_type','pkl_id'] else c for c in list(std_f_df) ]

print('max:')
max_f_df = tst_df.groupby(['data_type','pkl_id'])[feat_cols].max().reset_index()
max_f_df.columns = [f'{c}_max' if c not in ['data_type','pkl_id'] else c for c in list(max_f_df) ]

print('min:')
min_f_df = tst_df.groupby(['data_type','pkl_id'])[feat_cols].min().reset_index()
min_f_df.columns = [f'{c}_min' if c not in ['data_type','pkl_id'] else c for c in list(min_f_df) ]

print('skew:')
skew_f_df = tst_df.groupby(['data_type','pkl_id'])[feat_cols].skew().reset_index()
skew_f_df.columns = [f'{c}_skew' if c not in ['data_type','pkl_id'] else c for c in list(skew_f_df) ]

print('slope:')
slope_f_df = get_slope(tst_df,feat_cols,data_type='data_type',pkl_id='pkl_id',timestamp='timestamp')

print('outliers:')
outliers_f_df = get_outliers(tst_df,feat_cols,data_type='data_type',pkl_id='pkl_id')

tst_df2 = pd.merge(dim_f_df, mean_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, std_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, max_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, min_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, skew_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, slope_f_df, how='left', on=['pkl_id'])
tst_df2 = pd.merge(tst_df2, outliers_f_df, how='left', on=['pkl_id'])
del tst_df
print(tst_df2.shape)
tst_df2.to_csv('./data/tst_df_final.csv', index=False)
del tst_df2

static:
mean:
std:
max:
min:
skew:
slope:
outliers:
(6234, 89)


## 正式开始建模！

In [39]:
trn_df = pd.read_csv('./data/trn_df_final.csv')
tst_df = pd.read_csv('./data/tst_df_final.csv')
print(trn_df.shape, tst_df.shape)

(28389, 89) (6234, 89)


In [21]:
os.makedirs('./lgb',exist_ok=True)

In [None]:
trn_df.head()

In [None]:
tst_df.head()

In [22]:
trn_df.columns

Index(['index', 'pkl_id', 'mileage', 'label', 'milecls', 'volt_tnse0',
       'volt_tnse1', 'current_tnse0', 'current_tnse1', 'soc_tnse0',
       'soc_tnse1', 'max_single_volt_tnse0', 'max_single_volt_tnse1',
       'min_single_volt_tnse0', 'min_single_volt_tnse1', 'max_temp_tnse0',
       'max_temp_tnse1', 'min_temp_tnse0', 'min_temp_tnse1', 'data_type_x',
       'volt_mean', 'current_mean', 'soc_mean', 'max_single_volt_mean',
       'min_single_volt_mean', 'max_temp_mean', 'min_temp_mean',
       'ts_interval_mean', 'patv_mean', 'data_type_y', 'volt_std',
       'current_std', 'soc_std', 'max_single_volt_std', 'min_single_volt_std',
       'max_temp_std', 'min_temp_std', 'ts_interval_std', 'patv_std',
       'data_type_x.1', 'volt_max', 'current_max', 'soc_max',
       'max_single_volt_max', 'min_single_volt_max', 'max_temp_max',
       'min_temp_max', 'ts_interval_max', 'patv_max', 'data_type_y.1',
       'volt_min', 'current_min', 'soc_min', 'max_single_volt_min',
       'min_singl

In [7]:
def plot_confusion_matrix(guess, fact):
    '''绘制混淆矩阵'''
    classes = list(set(fact))
    classes.sort()
    confusion = confusion_matrix(guess, fact)
    plt.imshow(confusion, cmap=plt.cm.Blues)
    indices = range(len(confusion))
    plt.xticks(indices, classes)
    plt.yticks(indices, classes)
    plt.colorbar()
    plt.xlabel('Prediction')
    plt.ylabel('Ground Truth')
    for first_index in range(len(confusion)):
        for second_index in range(len(confusion[first_index])):
            plt.text(first_index, second_index, confusion[first_index][second_index])
    plt.show()

    
def analyse_prediction(preds, y, conf_thres=0.5):
    '''基于预测值和真实值，绘制混淆矩阵和给出分类评估结果
    输入:
        preds  (list): 预测概率标签
        y  (list): 真实类别标签
        conf_thres  (float): 对预测概率标签划分类别的概率阈值
    '''
    # plot_confusion_matrix([1 if v > conf_thres else 0 for v in preds], y) # 绘制混淆矩阵 
    # print(classification_report(y, [1 if v > conf_thres else 0 for v in preds])) # 给出分类评估结果
    print('Full AUC score %.6f' % roc_auc_score(y, preds))     # (真实label, 预测label)
    print('='*20)

In [None]:
# use_cols

In [None]:
# 参数定义
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss','auc'],
    'num_leaves':80,
    'max_depth':3, 
    'learning_rate': 0.1, 
    'feature_fraction': 0.4, 
    'num_threads':10,
    'bagging_fraction': 0.4,
    'bagging_freq': 5, 
    'lambda_l2':8.0,  
    'random_state':42,
    'min_gain_to_split':0.2,
    'verbose':-1
}
from sklearn.model_selection import StratifiedKFold
stratifiedKFolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

use_cols = [
            'soc_min', 'max_temp_max', 'max_temp_min', 'current_mean',
            'patv_mean', 'current_min', 'min_temp_max', 'ts_interval_std',
            'max_temp_mean', 'soc_mean', 'soc_std', 'mileage',
            'current_tnse1', 'patv_min', 'ts_interval_mean', 'min_temp_mean',
            'milecls', 'current_tnse0', 'volt_tnse0', 
            'ts_interval_outliers', 'volt_tnse1', 'min_temp_min', 'volt_std',
            'ts_interval_min',
            'min_single_volt_min','max_single_volt_max',
            ]

print(len(use_cols))
trn_df.loc[trn_df['label']==10,'label'] = 1
trn_x = trn_df[use_cols].values
trn_y = trn_df['label'].values
test_x = tst_df[use_cols].values
test_pred = []
fold = 0

for (trn_idx, val_idx) in stratifiedKFolds.split(trn_x, trn_y):
    xtrain, xval = trn_x[trn_idx], trn_x[val_idx]
    ytrain, yval = trn_y[trn_idx], trn_y[val_idx]   
   
    ros = KMeansSMOTE(sampling_strategy={0:26108*10},random_state = 1)
    xtrain, ytrain = ros.fit_resample(xtrain, ytrain)

    # 模型训练和验证
    cat_feats = ['milecls']
    lgb_train = lgb.Dataset(xtrain, ytrain, categorical_feature=cat_feats)
    lgb_eval =  lgb.Dataset(xval, yval, categorical_feature=cat_feats)
    lgb_model = lgb.train(params,
                    lgb_train,
                    num_boost_round=500,
                    valid_sets=[lgb_train, lgb_eval],
                    early_stopping_rounds = 10,
                    verbose_eval = 100,
                    feature_name = use_cols
                    )
    # 模型测试
    y_trn_pred = lgb_model.predict(xtrain, num_iteration=lgb_model.best_iteration)
    y_val_pred = lgb_model.predict(xval, num_iteration=lgb_model.best_iteration)
    y_test_pred = lgb_model.predict(test_x, num_iteration=lgb_model.best_iteration)
    test_pred.append(y_test_pred)

    print('=============Train=============')
    analyse_prediction(y_trn_pred, ytrain)
    print('=============Valid=============')
    analyse_prediction(y_val_pred, yval)

    # 保存模型
    lgb_model.save_model(f'./lgb/model_{fold}.txt', num_iteration=lgb_model.best_iteration)

    # 获取特征重要性
    importance = lgb_model.feature_importance(importance_type='gain')
    feature_name = lgb_model.feature_name()
    feature_importance = pd.DataFrame({'feature_name':feature_name,'importance':importance})
    importance = feature_importance.sort_values(by='importance', ascending=False)
    importance = importance.iloc[:50,:]
    importance.to_csv(f'./lgb/feature_importance_{fold}.csv', index=False)
    fold += 1

In [63]:
len(trn_idx), len(val_idx), len(test_pred)

(25551, 2838, 10)

In [68]:
sub_df = pd.DataFrame()
sub_df['file_name'] = tst_df['pkl_id'].values
sub_df['score'] = np.mean(test_pred, axis=0)
sub_df['file_name'] = sub_df['file_name'].apply(lambda x:str(x)+'.pkl')
sub_df.to_csv('./lgb/sub_lgb.csv',index=False)

In [None]:
sub_df