In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
from tqdm import tqdm
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn import metrics
from sklearn.decomposition import PCA
import warnings
import matplotlib.pyplot as plt
import matplotlib as mpl
from sklearn.preprocessing import StandardScaler, MinMaxScaler, Imputer, LabelBinarizer, Normalizer, OneHotEncoder
from sklearn.manifold import TSNE
from tqdm import tqdm

# Show up to 100 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 100)
# NOTE(review): this silences every warning for the rest of the session,
# including pandas / scikit-learn deprecation notices.
warnings.filterwarnings('ignore')

In [2]:
def group_feature(df, key, target, aggs):
    """
    Aggregate one column per group and return the result as a flat DataFrame.

    Parameters
    ----------
    df : DataFrame holding the raw records.
    key : column name (or list of names) to group by, e.g. 'ship'.
    target : name of the column whose values are aggregated, e.g. 'x'.
    aggs : list of aggregation names understood by pandas,
        e.g. ['max', 'min', 'mean', 'std', 'skew', 'sum'].

    Returns
    -------
    DataFrame with the group key as ordinary column(s), followed by one
    column per aggregation named '<target>_<agg>' (for example 'x_max').
    """
    # Named aggregation (output_name=func). The dict form {output_name: func}
    # on a single grouped column is rejected by pandas >= 1.0 with
    # "nested renamer is not supported".
    agg_dict = {f'{target}_{ag}': ag for ag in aggs}
    return df.groupby(key)[target].agg(**agg_dict).reset_index()

def _nonzero(values, eps=0.001):
    """Return `values` with exact zeros replaced by `eps`, for use as a divisor."""
    return np.where(values == 0, eps, values)


def extract_feature(df, train):
    """
    Build per-ship features from the raw records and attach them to `train`.

    Parameters
    ----------
    df : record-level DataFrame. Columns read here: 'ship', 'x', 'y', 'xy',
        'v', 'd', 'hour', 'date' and 'time' ('time' must support
        subtraction yielding timedeltas).
    train : DataFrame keyed by 'ship'. Besides 'ship', its own 'x', 'y',
        'v' and 'd' columns are read for the pairwise ratio features.

    Returns
    -------
    A new DataFrame: `train` left-joined on 'ship' with every aggregated
    column, plus the derived columns computed below. The caller's `train`
    object is not modified.
    """
    stats = ['max', 'min', 'mean', 'std', 'skew', 'sum', 'median']

    # (column, aggregations) in the order the feature columns are appended.
    # Aggregations of 'speed_time', 'xy_loc_pca' and 'vd' were tried earlier
    # and are intentionally left out.
    per_ship_aggs = [
        ('x', stats),
        ('x', ['count']),   # number of records per ship
        ('y', stats),
        ('xy', stats),
        ('v', stats),       # speed
        ('d', stats),       # direction
    ]
    for column, aggs in per_ship_aggs:
        t = group_feature(df, 'ship', column, aggs)
        train = pd.merge(train, t, on='ship', how='left')

    # Ranges and cross differences of the x / y extremes.
    train['x_max_x_min'] = train['x_max'] - train['x_min']
    train['y_max_y_min'] = train['y_max'] - train['y_min']
    train['y_max_x_min'] = train['y_max'] - train['x_min']
    train['x_max_y_min'] = train['x_max'] - train['y_min']

    # Sum, difference, product and ratio of the x / y medians.
    train['x_median_a_y_median'] = train["x_median"] + train["y_median"]
    train['x_median_b_y_median'] = train["x_median"] - train["y_median"]
    train['x_median_c_y_median'] = train["x_median"] * train["y_median"]
    train['x_median_d_y_median'] = train["x_median"] / _nonzero(train['y_median'])

    # Pairwise ratios x/y, x/v, x/d, y/v, y/d, v/d of the columns that
    # `train` itself carries (not of the aggregated columns).
    bizhi = ["x", "y", "v", "d"]
    for i, numerator in enumerate(bizhi):
        for denominator in bizhi[i + 1:]:
            n = "{}_d_{}".format(numerator, denominator)
            train[n] = train[numerator] / _nonzero(train[denominator])

    # slope: y range divided by x range (a zero x range is replaced by 0.001).
    train['slope'] = train['y_max_y_min'] / _nonzero(train['x_max_x_min'])
    # area: product of the x and y ranges, i.e. the bounding-box area.
    train['area'] = train['x_max_x_min'] * train['y_max_y_min']

    # Most frequent hour value per ship.
    mode_hour = df.groupby('ship')['hour'].agg(lambda x: x.value_counts().index[0]).to_dict()
    train['mode_hour'] = train['ship'].map(mode_hour)

    # Largest and smallest hour value per ship.
    t = group_feature(df, 'ship', 'hour', ['max', 'min'])
    train = pd.merge(train, t, on='ship', how='left')

    # Number of distinct hour values and distinct dates per ship.
    hour_nunique = df.groupby('ship')['hour'].nunique().to_dict()
    date_nunique = df.groupby('ship')['date'].nunique().to_dict()
    train['hour_nunique'] = train['ship'].map(hour_nunique)
    train['date_nunique'] = train['ship'].map(date_nunique)

    # Time span per ship (latest minus earliest timestamp). Computed from the
    # grouped max/min directly; the former agg({'diff_time': lambda ...}) form
    # is rejected by pandas >= 1.0.
    times = df.groupby('ship')['time']
    t = (times.max() - times.min()).rename('diff_time').reset_index()
    # .dt.days is the whole-day part of the span; .dt.seconds is the remaining
    # seconds component (0-86399), not the total number of seconds.
    t['diff_day'] = t['diff_time'].dt.days
    t['diff_second'] = t['diff_time'].dt.seconds
    train = pd.merge(train, t, on='ship', how='left')
    return train

def extract_dt(df):
    """
    Parse the 'time' column and derive calendar columns from it.

    The frame is modified in place and also returned. 'time' is converted
    with pd.to_datetime, then 'date', 'hour' and 'weekday' are added from
    the corresponding `.dt` attributes.
    """
    df['time'] = pd.to_datetime(df['time'])
    for part in ('date', 'hour', 'weekday'):
        df[part] = getattr(df['time'].dt, part)
    return df

def PAC():
    """Unimplemented placeholder: takes no arguments, does nothing, returns None."""
    return None

In [3]:
# Load the training records, drop the saved index column, switch to short
# column names and order the rows by ship id.
# (Earlier variant: train = pd.read_hdf('../input/train.h5'))
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
train = (
    pd.read_csv("/Users/nick/Documents/dataset/智慧海洋/train_v2.csv")
    .drop(["Unnamed: 0"], axis=1)
    .rename(columns={"速度": "v", "方向": "d", "渔船ID": "ship"})
    .sort_values("ship")
)

In [4]:
# Load the test records, drop the saved index column, switch to short
# column names and order the rows by ship id.
# (Earlier variant: test = pd.read_hdf('../input/test.h5'))
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
test = (
    pd.read_csv("/Users/nick/Documents/dataset/智慧海洋/test_v2.csv")
    .drop(["Unnamed: 0"], axis=1)
    .rename(columns={"速度": "v", "方向": "d", "渔船ID": "ship"})
    .sort_values("ship")
)

In [5]:
# Keep only the first 10000 training rows and the first 5000 test rows, tag
# every row with its origin, then stack both sets into a single frame.
train = train.iloc[:10000].assign(record="train")
test = test.iloc[:5000].assign(record="test")

df_all = pd.concat([train, test], axis=0)
df_all.head()

Unnamed: 0,d,record,ship,speed_time,time,type,v,x,y
399657,0,train,0,600.0,1900-11-09 16:18:22,拖网,0.0,6118352.0,5130672.0
399676,0,train,0,600.0,1900-11-09 13:08:23,拖网,0.0,6118352.0,5130672.0
399677,0,train,0,600.0,1900-11-09 12:58:23,拖网,0.0,6118352.0,5130672.0
399678,0,train,0,600.0,1900-11-09 12:48:23,拖网,0.0,6118352.0,5130672.0
399679,0,train,0,601.0,1900-11-09 12:38:22,拖网,0.0,6118352.0,5130672.0


In [6]:
# Summary statistics of the combined frame.
df_all.describe()

Unnamed: 0,d,ship,speed_time,v,x,y
count,15000.0,15000.0,14962.0,15000.0,15000.0,15000.0
mean,119.754933,2343.8034,661.591632,1.743463,6342667.0,5325664.0
std,115.750804,3296.560713,313.853556,2.69708,376251.1,299142.2
min,0.0,0.0,1.0,0.0,5228590.0,4577467.0
25%,0.0,10.0,599.0,0.11,6182402.0,5163718.0
50%,91.0,19.0,600.0,0.32,6262057.0,5242462.0
75%,221.0,7003.0,603.0,3.02,6510065.0,5503023.0
max,360.0,7012.0,9118.0,80.73,7119130.0,6136033.0


# 对xy的高频坐标点进行one-hot变量的处理

In [7]:
# Frequency statistics of the coarse x / y values.
df_all_xy_count = df_all[["ship", "x", "y"]]
print("df_all_xy_count", df_all_xy_count.shape)

# Coarsen the coordinates (divide by 10, truncate to int) and keep a single
# row per distinct ship / x / y combination.
df_all_xy_count = df_all_xy_count.assign(
    x=(df_all_xy_count["x"] / 10).astype(int),
    y=(df_all_xy_count["y"] / 10).astype(int),
).drop_duplicates(["ship", "x", "y"])
print("df_all_xy_count", df_all_xy_count.shape)

# How many of those de-duplicated rows share each coarse value; only values
# occurring in at least 30 rows are kept.
x_counts = df_all_xy_count.groupby("x")["x"].count()
y_counts = df_all_xy_count.groupby("y")["y"].count()
x_count_dict = x_counts.loc[x_counts >= 30].to_dict()
y_count_dict = y_counts.loc[y_counts >= 30].to_dict()

print("x_count_dict: {}".format(len(x_count_dict)))
print("y_count_dict: {}".format(len(y_count_dict)))

df_all_xy_count (15000, 3)
df_all_xy_count (6623, 3)
x_count_dict: 0
y_count_dict: 0


In [8]:
# Number of records per ship id.
df_all.groupby("ship")["ship"].count()

ship
0       414
1       385
2       233
3       335
4       401
5       375
6       394
7       348
8       366
9       397
10      397
11      377
12      416
13      426
14      402
15      382
16      398
17      265
18      409
19      388
20      376
21      415
22      422
23      399
24      392
25      388
26      100
7000    373
7001    458
7002    410
7003    425
7004    398
7005    404
7006    316
7007    411
7008    430
7009    419
7010    412
7011    398
7012    146
Name: ship, dtype: int64

In [9]:
# Ship ids present in df_all, in the order of the group index.
ship_counts = df_all.groupby("ship")["ship"].count()
ship_index_list = ship_counts.index.tolist()
print(ship_index_list)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 7000, 7001, 7002, 7003, 7004, 7005, 7006, 7007, 7008, 7009, 7010, 7011, 7012]


In [10]:
# For every ship, collect the list of its coarse x values and the list of its
# coarse y values; positions in the three lists line up.
# One groupby pass replaces the former per-ship boolean filtering, which
# rescanned the whole frame twice for every ship.
ship_list = list(ship_index_list)
_coords_by_ship = df_all_xy_count.groupby("ship")
_x_by_ship = _coords_by_ship["x"].apply(list).to_dict()
_y_by_ship = _coords_by_ship["y"].apply(list).to_dict()
# A ship without any row gets an empty list, as the filter-based version did.
x_count_list = [_x_by_ship.get(ship, []) for ship in ship_list]
y_count_list = [_y_by_ship.get(ship, []) for ship in ship_list]

100%|██████████| 40/40 [00:00<00:00, 726.37it/s]


In [11]:
# One row per ship together with its coarse x / y value lists.
xy_count_list = pd.DataFrame({"ship": ship_list, "x_count_list": x_count_list, "y_count_list": y_count_list})

# Add one indicator column per high-frequency value ("<value>_x" / "<value>_y"),
# initialised to 0.
for suffix, frequent in (("x", x_count_dict), ("y", y_count_dict)):
    for value in tqdm(frequent.keys()):
        xy_count_list["{}_{}".format(value, suffix)] = 0

xy_count_list.head(1)

0it [00:00, ?it/s]
0it [00:00, ?it/s]


Unnamed: 0,ship,x_count_list,y_count_list
0,0,"[611835, 615203, 615122, 615042, 614961, 61488...","[513067, 512487, 512521, 512556, 512590, 51262..."


In [12]:
# Set the indicator columns to 1 for every high-frequency value that appears
# in a ship's own value list.
print(xy_count_list.shape)

# The enumerate() position is used as the row label, so this relies on
# xy_count_list still having its default 0..n-1 index.
for suffix, frequent in (("x", x_count_dict), ("y", y_count_dict)):
    frequent_values = set(frequent.keys())
    for row_label, visited in tqdm(enumerate(xy_count_list["{}_count_list".format(suffix)])):
        hit_columns = ["{}_{}".format(value, suffix) for value in frequent_values & set(visited)]
        xy_count_list.loc[row_label, hit_columns] = 1

0it [00:00, ?it/s]

(40, 3)


40it [00:00, 2179.00it/s]
40it [00:00, 2671.66it/s]


In [13]:
# Preview the first row after the indicator columns have been filled.
xy_count_list.head(1)

Unnamed: 0,ship,x_count_list,y_count_list
0,0,"[611835, 615203, 615122, 615042, 614961, 61488...","[513067, 512487, 512521, 512556, 512590, 51262..."


In [16]:
# Split the frame: ship ids on one side, everything except the id and the two
# list columns (i.e. the indicator columns) on the other.
_non_indicator = ["ship", "x_count_list", "y_count_list"]
xy_count_list_drop_df_ship = xy_count_list[["ship"]]
xy_count_list_drop_df = xy_count_list.drop(columns=_non_indicator)
xy_count_list_drop_df.head(2)

0
1


In [15]:
# 对较高频的坐标进行降维
print(xy_count_list_drop_df.shape)
pca = TSNE(n_components=3)
reduced_xy_count = pca.fit_transform(xy_count_list_drop_df)
print(reduced_xy_count.shape)

reduced_xy_count_df = pd.DataFrame(reduced_xy_count)
reduced_xy_count_df["ship"] = xy_count_list_drop_df_ship["ship"].values
reduced_xy_count_df.head(2)

(40, 0)


ValueError: Found array with 0 feature(s) (shape=(40, 0)) while a minimum of 1 is required.

# 对坐标划分网格

In [None]:
# Bounding box of all coordinates.
xmin, xmax = df_all["x"].min(), df_all["x"].max()
ymin, ymax = df_all["y"].min(), df_all["y"].max()

x_offset = xmax - xmin
y_offset = ymax - ymin

# Number of grid cells along each axis and the resulting cell size.
offset_count = 30
x_box = x_offset / offset_count
y_box = y_offset / offset_count

for axis, low, high, span, step in (
    ("x", xmin, xmax, x_offset, x_box),
    ("y", ymin, ymax, y_offset, y_box),
):
    print("{} min: {}".format(axis, low))
    print("{} max: {}".format(axis, high))
    print("{} offset: {}".format(axis, span))
    print("{} box: {}".format(axis, step))

# Cell boundaries: offset_count + 1 evenly spaced edges per axis.
x_list = [xmin + i * x_box for i in range(offset_count + 1)]
y_list = [ymin + i * y_box for i in range(offset_count + 1)]

pd.DataFrame({"x_area": x_list, "y_area": y_list}).head(2)

In [None]:
# Encode each coordinate as the index of the grid cell that contains it.
def map_loc_x(x, edges=None):
    """
    Return the index of the grid cell along x that contains `x`.

    Cell i covers the half-open interval (edges[i], edges[i + 1]]; a value
    at or below the first edge belongs to cell 0.

    Parameters
    ----------
    x : coordinate value.
    edges : ascending sequence of cell boundaries (at least two). Defaults
        to the notebook-level `x_list`.

    Returns
    -------
    int in [0, len(edges) - 2], or NaN when `x` is NaN.

    A value above the last edge (e.g. the maximum, when the last computed
    edge is rounded slightly below it) is assigned to the last cell instead
    of running past the end of the edge list.
    """
    if edges is None:
        edges = x_list
    if x != x:  # NaN: no cell can be assigned
        return np.nan
    # First edge >= x, minus one, is the cell whose upper boundary holds x.
    cell = int(np.searchsorted(edges, x, side="left")) - 1
    last_cell = max(len(edges) - 2, 0)
    return min(max(cell, 0), last_cell)

def map_loc_y(y, edges=None):
    """
    Return the index of the grid cell along y that contains `y`.

    Cell i covers the half-open interval (edges[i], edges[i + 1]]; a value
    at or below the first edge belongs to cell 0.

    Parameters
    ----------
    y : coordinate value.
    edges : ascending sequence of cell boundaries (at least two). Defaults
        to the notebook-level `y_list`.

    Returns
    -------
    int in [0, len(edges) - 2], or NaN when `y` is NaN.

    A value above the last edge (e.g. the maximum, when the last computed
    edge is rounded slightly below it) is assigned to the last cell instead
    of running past the end of the edge list.
    """
    if edges is None:
        edges = y_list
    if y != y:  # NaN: no cell can be assigned
        return np.nan
    # First edge >= y, minus one, is the cell whose upper boundary holds y.
    cell = int(np.searchsorted(edges, y, side="left")) - 1
    last_cell = max(len(edges) - 2, 0)
    return min(max(cell, 0), last_cell)
        
# Add the grid cell index of every record along each axis.
for axis, to_cell in (("x", map_loc_x), ("y", map_loc_y)):
    df_all[axis + "_loc"] = df_all[axis].apply(to_cell)

In [None]:
# Scatter plots for ship 0: grid cell indices first, raw coordinates second.
ship0 = df_all[df_all["ship"] == 0]
for x_col, y_col in (("x_loc", "y_loc"), ("x", "y")):
    ship0[[x_col, y_col]].plot.scatter(x=x_col, y=y_col, figsize=(6, 5))
    plt.show()

In [None]:
# Build a single "<x_loc>,<y_loc>" key that identifies the grid cell of each record.
x_cell_text = df_all['x_loc'].astype(str)
y_cell_text = df_all['y_loc'].astype(str)
df_all["x_y_loc_area"] = x_cell_text.str.cat(y_cell_text, sep=',')
df_all.head(1)

In [None]:
# One-hot encode the grid cell key of every record (one column per distinct cell).
# Note: only the cell key is selected here; the ship id is not part of this frame.
df_xyloc = df_all[["x_y_loc_area"]]

df_xyloc_dummies = pd.get_dummies(df_xyloc)
print(df_xyloc_dummies.shape)
df_xyloc_dummies.head(2)

In [None]:
# Project the one-hot grid cell matrix onto 3 principal components and store
# them as record-level columns xy_loc_pca_1 .. xy_loc_pca_3.
print(df_xyloc_dummies.shape)
pca = PCA(n_components=3)
reduced_xy_loc = pca.fit_transform(df_xyloc_dummies)
print(reduced_xy_loc.shape)

print(reduced_xy_loc)

for component in range(3):
    df_all["xy_loc_pca_{}".format(component + 1)] = reduced_xy_loc[:, component]
df_all.head(2)

# xyd 降维

In [None]:
# Collapse x, y and d of every record into a single t-SNE coordinate 'xy'.
df_all_xy = df_all[["x", "y", "d"]]
print(df_all_xy.shape)
# The variable is named `pca` for historical reasons; the estimator is t-SNE.
# random_state pins the embedding so the derived feature (and the scores built
# on it) are the same on every run.
pca = TSNE(n_components=1, random_state=42)
reduced_xy = pca.fit_transform(df_all_xy)
print(reduced_xy.shape)
# fit_transform returns shape (n_rows, 1); store the single component as a 1-D column.
df_all["xy"] = reduced_xy[:, 0]

In [None]:
# df_all_vd = df_all[["v", "d"]]
# print(df_all_vd.shape)
# pca = PCA(n_components=1)     # PCA keeping a single principal component
# reduced_vd = pca.fit_transform(df_all_vd)#对样本进行降维
# print(reduced_vd.shape)
# df_all["vd"] = reduced_vd

# 处理时间数据

In [None]:
# Parse the timestamps and add the date / hour / weekday columns.
df_all = extract_dt(df_all)
df_all.head(1)

In [None]:
# Keep a single row per ship (drop_duplicates keeps the first occurrence);
# this frame becomes the ship-level table that features are attached to.
print("df_all", df_all.shape)
df_label = df_all.drop_duplicates('ship')
print("df_label", df_label.shape)

In [None]:
# Share of each class among the ships (relative frequencies).
df_label['type'].value_counts(normalize=True)

In [None]:
# Replace the class names by integer codes, numbered in order of first appearance.
# Missing 'type' values (presumably the rows that came from the test set) are
# dropped before numbering; otherwise NaN could occupy one of the label slots.
# NOTE(review): the model parameters assume exactly 3 classes.
_classes = df_label['type'].dropna().unique()
type_map = {name: code for code, name in enumerate(_classes)}
type_map_rev = {v: k for k, v in type_map.items()}
# .assign builds a new frame instead of writing into the drop_duplicates() result.
df_label = df_label.assign(type=df_label['type'].map(type_map))
type_map_rev

In [None]:
# Share of each encoded class among the ships (relative frequencies).
df_label['type'].value_counts(normalize=True)

In [None]:
# Attach the per-ship aggregated and derived features to the ship-level table.
df_label = extract_feature(df_all, df_label)

In [None]:
# Give the embedding columns 0, 1, 2 readable names and join them to the
# ship-level table.
_embedding_names = {i: "xy_count_pca_{}".format(i + 1) for i in range(3)}
reduced_xy_count_df = reduced_xy_count_df.rename(columns=_embedding_names)
df_label = df_label.merge(reduced_xy_count_df, on='ship', how='left')
df_label.head(3)

In [None]:
# Columns that are not used as model inputs; every other column of df_label
# becomes a feature.
target = 'type'
delete_list = ['speed_time', "record", 'ship','type','time','diff_time','date', 'x_y_loc_area']
features = [column for column in df_label.columns if column not in delete_list]

In [None]:
# Number and names of the columns used for training.
print(len(features), "\n", '    '.join(features))

In [None]:
# Split the ship-level table back into training and test ships; the origin tag
# is dropped from both, and the test part also loses its 'type' column.
_origin = df_label["record"]
train_label = df_label[_origin == "train"].drop(["record"], axis=1)
test_label = df_label[_origin == "test"].drop(["record", "type"], axis=1)

train_label[features].head(3)

In [None]:
# LightGBM parameters.
params = {
    'n_estimators': 5000,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 3,
    'early_stopping_rounds': 100,
    'learning_rate': 0.01,
}

# Stratified 10-fold cross-validation. The fold count is named once so the
# averaging of the test predictions below cannot drift from it.
n_splits = 10
fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Feature matrix and labels.
X = train_label[features].copy()
y = train_label[target]

# One trained model per fold.
models = []
# Class probabilities: fold-averaged for the test ships, out-of-fold for the
# training ships (one column per class).
pred = np.zeros((len(test_label), params['num_class']))
oof = np.zeros((len(X), params['num_class']))

for index, (train_idx, val_idx) in enumerate(fold.split(X, y)):

    train_set = lgb.Dataset(X.iloc[train_idx], y.iloc[train_idx])  # training part of this fold
    val_set = lgb.Dataset(X.iloc[val_idx], y.iloc[val_idx])        # validation part of this fold

    model = lgb.train(params, train_set, valid_sets=[train_set, val_set], verbose_eval=100)
    models.append(model)

    # Class probabilities for the held-out rows, shape (len(val_idx), num_class).
    val_pred = model.predict(X.iloc[val_idx])
    oof[val_idx] = val_pred

    # True labels of the held-out rows.
    val_y = y.iloc[val_idx]

    # Most probable class per held-out row.
    val_pred = np.argmax(val_pred, axis=1)

    # Macro F1 on this fold's validation rows.
    print(index, 'val f1(指定次数的随机验证集F1值: )', metrics.f1_score(val_y, val_pred, average='macro'))

    # Accumulate the fold-averaged test probabilities. Dividing by the actual
    # number of folds (previously a hard-coded 5 with 10 folds) keeps `pred`
    # a proper average whose rows sum to 1.
    test_pred = model.predict(test_label[features])
    pred += test_pred / n_splits

In [None]:
# Overall out-of-fold macro F1 across all folds.
# The labels go into their own name so `oof` keeps the probabilities and this
# cell can be re-run; f1_score receives (y_true, y_pred) in the documented order.
oof_label = np.argmax(oof, axis=1)
print('【准确率】oof f1: ', metrics.f1_score(y, oof_label, average='macro'))

# Scores recorded for earlier feature sets:
# origin 0.8666565020816382
# speed time 0.8556040441133175
# speed time + xy 0.869776300826063
# xy 0.8695008449788421
# speed time + xyvd 0.8666262659963254
# speed time + xyd 0.8712710026618916
# speed time + xyv 0.8677183665397498
# speed time + xy + vd 0.8657659692232876
# speed time + xyd + vd 0.8683521658107259
# xy loc 0.8755378306270695

In [None]:
# Most probable class per test ship. Stored under its own name so `pred` keeps
# the probabilities and this cell can be re-run.
pred_label = np.argmax(pred, axis=1)
# .assign builds a fresh frame instead of writing into a slice of test_label.
sub = test_label[['ship']].assign(pred=pred_label)

print(sub['pred'].value_counts(1))
# Translate the integer codes back to the class names and write
# "ship,class" rows without index or header.
sub['pred'] = sub['pred'].map(type_map_rev)
sub.to_csv('result.csv', index=None, header=None)

In [None]:
# One frame per fold model with the feature names and their importance scores,
# stacked into a single long table.
ret = [
    pd.DataFrame({
        'name': model.feature_name(),
        'score': model.feature_importance(),
        'fold': index,
    })
    for index, model in enumerate(models)
]

df = pd.concat(ret)

In [None]:
# Mean importance of each feature across the folds, highest first.
df = (
    df.groupby('name', as_index=False)['score']
    .mean()
    .sort_values(['score'], ascending=False)
)

In [None]:
# Display the feature importance table.
df

In [None]:
# (rows, columns) of the feature importance table.
df.shape