In [27]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.decomposition import PCA
import warnings
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', 100)
warnings.filterwarnings('ignore')

In [28]:
def group_feature(df, key, target, aggs):   
    """
    输出不同聚合字段-方法组合dataframe
    df 数据表
    key 用于分组的字段  x, y, v, d
    target 用于计算聚合的字段
    aggs  聚合的方法  ['max','min','mean','std','skew','sum']
    """
    agg_dict = {}
    for ag in aggs:
        agg_dict[f'{target}_{ag}'] = ag  # 例如 x_max 代表输出每个船（id）数据中里面的最大值
#     print(agg_dict)
    t = df.groupby(key)[target].agg(agg_dict).reset_index()
#     print(t)
    return t

def extract_feature(df, train):
    # x的最大值，最小值，平均值，标准差，峰度，加和
    t = group_feature(df, 'ship','x',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # x的数量
    t = group_feature(df, 'ship','x',['count'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # y的最大值，最小值，平均值，标准差，峰度，加和
    t = group_feature(df, 'ship','y',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # speed_time的最大值，最小值，平均值，标准差，峰度，加和
    t = group_feature(df, 'ship','speed_time',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # 速度的最大值，最小值，平均值，标准差，峰度，加和
    t = group_feature(df, 'ship','v',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # 方向的最大值，最小值，平均值，标准差，峰度，加和
    t = group_feature(df, 'ship','d',['max','min','mean','std','skew','sum'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # x y 的最大值和最小值的交叉相减
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']
    
    # slope 第一个值： y的最大值和最小值的差
    # slope 第二个值： x的最大值最小值的差，如果差值为0，则去极小值 0.001
    # slope代表x y 极差的比值，可以理解为斜度，坡度
    # np.where 是条件判断 True -> x;False -> y;
    train['slope'] = train['y_max_y_min'] / np.where(train['x_max_x_min']==0, 0.001, train['x_max_x_min'])
    # x y 的极差相乘，表示该船的最大活动面积
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']
    
    # 每条船数据中出现小时值次数最多的值
    mode_hour = df.groupby('ship')['hour'].agg(lambda x:x.value_counts().index[0]).to_dict()
    train['mode_hour'] = train['ship'].map(mode_hour)
    
    # 小时的最大值，最小值
    t = group_feature(df, 'ship','hour',['max','min'])
    train = pd.merge(train, t, on='ship', how='left')
    
    # 一次任务涉及到的小时数值
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    # 一次任务涉及的天数
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['hour_nunique'] = train['ship'].map(hour_nunique)
    train['date_nunique'] = train['ship'].map(date_nunique)
    
    # 一次任务的时间差
    t = df.groupby('ship')['time'].agg({'diff_time':lambda x:np.max(x)-np.min(x)}).reset_index()
    # 时间差的天数，秒数
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

def extract_dt(df):
    # 时间格式转换
#     df['time'] = pd.to_datetime(df['time'], format='%m%d %H:%M:%S')
    df['time'] = pd.to_datetime(df['time'])

    # df['month'] = df['time'].dt.month
    # df['day'] = df['time'].dt.day
    df['date'] = df['time'].dt.date  # 提取日期
    df['hour'] = df['time'].dt.hour  # 提取小时
    # df = df.drop_duplicates(['ship','month'])
    df['weekday'] = df['time'].dt.weekday  # 提取星期
    return df

def PAC():
    pass

In [29]:
# 读取数据，整理列信息
# train = pd.read_hdf('../input/train.h5')
train = pd.read_csv("/Users/nick/Documents/dataset/智慧海洋/train_v2.csv")
train = train.drop(["Unnamed: 0"], axis=1)
train = train.rename(columns={
    "速度": "v",
    "方向": "d",
    "渔船ID": "ship",
})
# train = df.drop_duplicates(['ship','type'])

In [30]:
train.head()

Unnamed: 0,ship,x,y,v,d,time,type,speed_time
0,6966,6265902.0,5279254.0,0.11,306,1900-11-06 23:58:16,围网,
1,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:48:21,围网,595.0
2,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:38:19,围网,602.0
3,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:28:36,围网,583.0
4,6966,6265902.0,5279254.0,0.32,130,1900-11-06 23:08:17,围网,1219.0


In [31]:
# 读取数据，整理列信息
# test = pd.read_hdf('../input/test.h5')
test = pd.read_csv("/Users/nick/Documents/dataset/智慧海洋/test_v2.csv")

test = test.drop(["Unnamed: 0"], axis=1)
test = test.rename(columns={
    "速度": "v",
    "方向": "d",
    "渔船ID": "ship",
})

In [32]:
test.head()

Unnamed: 0,ship,x,y,v,d,time,speed_time
0,8793,6102450.0,5112760.0,0.0,0,1900-11-06 23:56:34,
1,8793,6102450.0,5112760.0,0.0,0,1900-11-06 23:46:34,600.0
2,8793,6102450.0,5112760.0,0.0,0,1900-11-06 23:37:31,543.0
3,8793,6102450.0,5112760.0,0.16,0,1900-11-06 23:26:34,657.0
4,8793,6102450.0,5112760.0,0.0,0,1900-11-06 23:16:34,600.0


In [None]:
# train_df["record"] = "train"
# test_df["record"] = "test"
# df = pd.concat([train, test], axis=0)

In [33]:
# 处理时间数据
train = extract_dt(train)

In [34]:
test = extract_dt(test)

In [35]:
train.head()

Unnamed: 0,ship,x,y,v,d,time,type,speed_time,date,hour,weekday
0,6966,6265902.0,5279254.0,0.11,306,1900-11-06 23:58:16,围网,,1900-11-06,23,1
1,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:48:21,围网,595.0,1900-11-06,23,1
2,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:38:19,围网,602.0,1900-11-06,23,1
3,6966,6265902.0,5279254.0,0.0,0,1900-11-06 23:28:36,围网,583.0,1900-11-06,23,1
4,6966,6265902.0,5279254.0,0.32,130,1900-11-06 23:08:17,围网,1219.0,1900-11-06,23,1


In [36]:
# 删除重复行数据
print("train", train.shape)
print("test", test.shape)
train_label = train.drop_duplicates('ship')
test_label = test.drop_duplicates('ship')
print("train", train.shape)
print("test", test.shape)

train (2699638, 11)
test (782378, 10)
train (2699638, 11)
test (782378, 10)


In [37]:
# 查看分类占比
train_label['type'].value_counts(1)

拖网    0.623000
围网    0.231571
刺网    0.145429
Name: type, dtype: float64

In [38]:
# 标签使用数字代替
type_map = dict(zip(train_label['type'].unique(), np.arange(3)))
type_map_rev = {v:k for k,v in type_map.items()}
train_label['type'] = train_label['type'].map(type_map)
type_map_rev

{0: '围网', 1: '拖网', 2: '刺网'}

In [39]:
# 分类占比
train_label['type'].value_counts(1)

1    0.623000
0    0.231571
2    0.145429
Name: type, dtype: float64

In [40]:
# 构造新列
train_label = extract_feature(train, train_label)

In [41]:
test_label = extract_feature(test, test_label)

In [42]:
# 数据按照ship id排序
train_label = train_label.sort_values("ship")
test_label = test_label.sort_values("ship")

In [43]:
train_label.head(20)

Unnamed: 0,ship,x,y,v,d,time,type,speed_time,date,hour,weekday,x_max,x_min,x_mean,x_std,x_skew,x_sum,x_count,y_max,y_min,y_mean,y_std,y_skew,y_sum,speed_time_max,speed_time_min,speed_time_mean,speed_time_std,speed_time_skew,speed_time_sum,v_max,v_min,v_mean,v_std,v_skew,v_sum,d_max,d_min,d_mean,d_std,d_skew,d_sum,x_max_x_min,y_max_y_min,y_max_x_min,x_max_y_min,slope,area,mode_hour,hour_max,hour_min,hour_nunique,date_nunique,diff_time,diff_day,diff_second
1038,0,6152038.0,5124873.0,2.59,102,1900-11-10 11:58:19,1,,1900-11-10,11,5,6152038.0,6118352.0,6119351.0,5037.320747,5.255558,2533411000.0,414,5130781.0,5124873.0,5130494.0,850.264541,-4.762308,2124025000.0,2343.0,1.0,625.983051,168.453927,4.810326,258531.0,9.39,0.0,0.265966,1.321248,5.520205,110.11,129,0,4.613527,21.24777,4.483093,1910,33686.667453,5907.975523,-987570.4,1027165.0,0.17538,199020000.0,15,23,0,24,4,2 days 23:48:51,2,85731
1202,1,6076254.0,5061743.0,3.99,278,1900-11-10 11:40:21,1,,1900-11-10,11,5,6102450.0,6049472.0,6091460.0,16543.394419,-1.058454,2345212000.0,385,5112874.0,5042857.0,5094050.0,26764.042729,-0.802446,1961209000.0,2948.0,50.0,671.841146,297.11422,4.355572,257987.0,10.47,0.0,1.607922,2.412688,1.590284,619.05,336,0,56.153247,91.449382,1.418867,21619,52978.013345,70016.655842,-936597.9,1059593.0,1.321617,3709343000.0,19,23,0,24,4,2 days 23:39:47,2,85187
1667,2,6183090.0,5193685.0,0.32,145,1900-11-17 11:41:58,1,,1900-11-17,11,5,6183191.0,6182482.0,6183011.0,207.869601,-2.155218,1440641000.0,233,5193696.0,5193576.0,5193682.0,21.740609,-4.563165,1210128000.0,3613.0,3.0,1110.762931,577.820813,0.463753,257697.0,50.46,0.0,0.59515,3.415824,13.63159,138.67,360,0,123.356223,123.097127,0.657506,28742,708.835147,120.565,-988786.1,989615.5,0.170089,85460.71,17,23,0,24,4,2 days 23:34:57,2,84897
1481,3,5229849.0,4608510.0,0.32,0,1900-11-10 11:50:10,1,,1900-11-10,11,5,5287805.0,5228590.0,5239159.0,17503.714347,1.608637,1755118000.0,335,4608628.0,4577467.0,4601532.0,11590.605179,-1.19421,1541513000.0,3025.0,1.0,771.553892,470.031245,1.283994,257699.0,10.09,0.0,1.471343,2.528593,2.135446,492.9,352,0,121.134328,121.758165,0.469794,40580,59214.73874,31160.661097,-619962.1,710337.5,0.526232,1845170000.0,22,23,0,24,4,2 days 23:34:59,2,84899
734,4,7061772.0,6125021.0,0.22,119,1900-11-17 11:57:24,0,,1900-11-17,11,5,7070797.0,7049394.0,7062005.0,5979.578887,-0.596732,2831864000.0,401,6136033.0,6094996.0,6116389.0,12055.148984,-0.331618,2452672000.0,2453.0,537.0,647.58,186.449355,4.726044,259032.0,10.09,0.0,1.412219,2.496836,1.910336,566.3,359,0,139.067332,121.130025,0.372601,55766,21402.484584,41036.883038,-913361.4,975800.7,1.917389,878291300.0,23,23,0,24,4,2 days 23:57:12,2,86232
606,5,6388833.0,5340338.0,2.81,212,1900-11-06 23:52:56,1,,1900-11-06,23,1,6503900.0,6388833.0,6488945.0,22271.760798,-3.00683,2433354000.0,375,5603770.0,5340338.0,5576252.0,62867.041759,-2.37181,2091094000.0,3430.0,1.0,690.927807,348.735706,4.054919,258407.0,10.09,0.0,1.713813,3.105356,1.666827,642.68,360,0,130.461333,105.256336,0.277056,48923,115066.764852,263431.538333,-785063.6,1163562.0,2.28938,30312210000.0,10,23,0,24,3,2 days 23:46:47,2,85607
144,6,6057481.0,5019993.0,4.59,77,1900-11-03 11:58:40,1,,1900-11-03,11,5,6079949.0,6032505.0,6058099.0,11331.957978,-0.262505,2386891000.0,394,5047377.0,5004409.0,5028613.0,12271.199509,-0.373218,1981273000.0,7798.0,455.0,657.979644,404.168344,14.439236,258586.0,6.75,0.86,3.750787,0.784521,-0.200335,1477.81,358,0,142.972081,99.220841,0.240674,56331,47444.02313,42967.888322,-985128.6,1075541.0,0.905654,2038569000.0,11,23,0,24,4,2 days 23:49:46,2,85786
288,7,6588193.0,5813085.0,0.27,37,1900-11-23 23:54:17,0,,1900-11-23,23,4,6986976.0,6587996.0,6759621.0,136484.691403,-0.017425,2352348000.0,348,5814691.0,5785242.0,5805508.0,7062.945568,-0.896368,2020317000.0,1903.0,443.0,620.70317,147.456689,6.130863,215384.0,13.49,0.0,4.487529,4.217782,0.229882,1561.66,351,0,168.327586,127.59555,-0.344343,58578,398980.613694,29448.10389,-773305.2,1201734.0,0.073808,11749220000.0,23,23,0,24,3,2 days 11:49:44,2,42584
6420,8,6189711.0,5141803.0,2.21,204,1900-11-03 11:50:29,1,,1900-11-03,11,5,6215849.0,6181299.0,6201416.0,8892.414131,-0.587547,2269718000.0,366,5166370.0,5101720.0,5139750.0,15473.350143,-0.698113,1881149000.0,3003.0,15.0,707.769863,315.803583,2.632006,258336.0,8.2,0.0,3.341366,1.279559,-0.073893,1222.94,359,0,138.114754,99.758217,0.124689,50550,34549.985805,64649.307235,-1014930.0,1114129.0,1.871182,2233633000.0,5,23,0,24,4,2 days 23:45:36,2,85536
6288,9,6102853.0,5112531.0,0.0,0,1900-11-06 23:55:26,1,,1900-11-06,23,1,6102853.0,6102853.0,6102853.0,0.0,0.0,2422833000.0,397,5112531.0,5112531.0,5112531.0,0.0,0.0,2029675000.0,9118.0,417.0,652.994949,486.840805,14.510541,258586.0,0.16,0.0,0.012519,0.027956,3.008194,4.97,0,0,0.0,0.0,0.0,0,0.0,0.0,-990322.0,990322.0,0.0,0.0,23,23,0,24,3,2 days 23:49:46,2,85786


In [44]:
# 拆出暂不需要的特征
delete_list = ['ship','type','time','diff_time','date', 'hour_nunique', 'speed_time']
features = [x for x in train_label.columns if x not in delete_list]
target = 'type'

In [45]:
# 查看训练用到的列
print(len(features), '\n,'.join(features))

49 x
,y
,v
,d
,hour
,weekday
,x_max
,x_min
,x_mean
,x_std
,x_skew
,x_sum
,x_count
,y_max
,y_min
,y_mean
,y_std
,y_skew
,y_sum
,speed_time_max
,speed_time_min
,speed_time_mean
,speed_time_std
,speed_time_skew
,speed_time_sum
,v_max
,v_min
,v_mean
,v_std
,v_skew
,v_sum
,d_max
,d_min
,d_mean
,d_std
,d_skew
,d_sum
,x_max_x_min
,y_max_y_min
,y_max_x_min
,x_max_y_min
,slope
,area
,mode_hour
,hour_max
,hour_min
,date_nunique
,diff_day
,diff_second


In [46]:
# lgbm的参数
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
}

In [47]:
# 交叉验证 五折
fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 特征和标签
X = train_label[features].copy()
y = train_label[target]

# 模型列表
models = []
# 预测值
pred = np.zeros((len(test_label),3))  # onehot形式 (2000, 3)
oof = np.zeros((len(X), 3))  # onehot形式 (7000, 3)

# 循环交叉验证
for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])  # 筛选训练数据
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])        # 筛选验证数据

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)  # 训练模型
    models.append(model)  # 模型列表
    
    # 预测标签，每个结果是 (len ,3)
    val_pred = model.predict(X.iloc[val_idx])  
    oof[val_idx] = val_pred
    
    # 原始标签
    val_y = y.iloc[val_idx]
    
    # 输出最大值
    val_pred = np.argmax(val_pred, axis=1)
    
    # 计算f1值
    print(index, 'val f1(指定次数的随机验证集F1值: )', metrics.f1_score(val_y, val_pred, average='macro'))
    
    # 预测待提交测试集结果
    test_pred = model.predict(test_label[features])
    pred += test_pred/5

Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.0769279	valid_1's multi_logloss: 0.268747
[200]	training's multi_logloss: 0.0189083	valid_1's multi_logloss: 0.264287
Early stopping, best iteration is:
[159]	training's multi_logloss: 0.0329073	valid_1's multi_logloss: 0.261576
0 val f1(指定次数的随机验证集F1值: ) 0.8558544966181407
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.0730026	valid_1's multi_logloss: 0.301329
[200]	training's multi_logloss: 0.0175061	valid_1's multi_logloss: 0.305969
Early stopping, best iteration is:
[134]	training's multi_logloss: 0.0443033	valid_1's multi_logloss: 0.298972
1 val f1(指定次数的随机验证集F1值: ) 0.8400603601742401
Training until validation scores don't improve for 100 rounds.
[100]	training's multi_logloss: 0.0762111	valid_1's multi_logloss: 0.266105
[200]	training's multi_logloss: 0.0188	valid_1's multi_logloss: 0.266452
Early stopping, best iteration is:
[134]	train

In [48]:
# 综合五次的交叉验证的结果评估值
oof = np.argmax(oof, axis=1)
print('【准确率】oof f1: ', metrics.f1_score(oof, y, average='macro'))

# 0.8666565020816382
# 0.8556040441133175
# 0.8556040441133175

【准确率】oof f1:  0.8556040441133175


In [49]:
pred = np.argmax(pred, axis=1)
sub = test_label[['ship']]
sub['pred'] = pred

print(sub['pred'].value_counts(1))
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('result.csv', index=None, header=None)

1    0.637
0    0.234
2    0.129
Name: pred, dtype: float64


In [50]:
ret = []
for index, model in enumerate(models):
    df = pd.DataFrame()
    df['name'] = model.feature_name()
    df['score'] = model.feature_importance()
    df['fold'] = index
    ret.append(df)
    
df = pd.concat(ret)

In [51]:
df = df.groupby('name', as_index=False)['score'].mean()
df = df.sort_values(['score'], ascending=False)

In [52]:
df

Unnamed: 0,name,score
42,y_max_x_min,612.6
41,y_max,578.4
36,x_min,562.4
34,x_max_y_min,537.2
27,v_std,486.2
30,x,465.4
40,y,461.4
26,v_skew,412.4
15,slope,389.2
45,y_min,386.0
