In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from pyproj import Proj
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from gensim.models import Word2Vec
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import lightgbm as lgb
import os
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def geohash_encode(latitude, longitude, precision=12):
    """
    Encode a (latitude, longitude) position as a geohash string.

    The latitude range [-90, 90] and longitude range [-180, 180] are bisected
    alternately, starting with longitude. Each bisection yields one bit
    (1 when the coordinate is strictly greater than the midpoint), and every
    five bits are emitted as one base-32 character.

    :param latitude: latitude as a float.
    :param longitude: longitude as a float.
    :param precision: number of characters in the returned geohash.
    :return: geohash string of length ``precision``.
    """
    alphabet = '0123456789bcdefghjkmnpqrstuvwxyz'
    lat_lo, lat_hi = -90.0, 90.0
    lon_lo, lon_hi = -180.0, 180.0
    chars = []
    use_lon = True  # the bit stream starts with a longitude bit
    while len(chars) < precision:
        index = 0
        # Five bits, most significant first, make up one base-32 character.
        for _ in range(5):
            index <<= 1
            if use_lon:
                mid = (lon_lo + lon_hi) / 2
                if longitude > mid:
                    index |= 1
                    lon_lo = mid
                else:
                    lon_hi = mid
            else:
                mid = (lat_lo + lat_hi) / 2
                if latitude > mid:
                    index |= 1
                    lat_lo = mid
                else:
                    lat_hi = mid
            use_lon = not use_lon
        chars.append(alphabet[index])
    return ''.join(chars)


def hashfxn(astring):
    """
    Deterministic hash passed to gensim as ``hashfxn``.

    Returns the Unicode code point of the first character of ``astring``.

    NOTE(review): every string that shares a first character hashes to the
    same value, and an empty string raises IndexError. Changing this would
    change the trained vectors, so it is kept as is.
    """
    first_char = astring[0]
    return ord(first_char)


def tfidf(input_values, output_num, output_prefix, seed=1024):
    """
    TF-IDF encode documents and reduce them to ``output_num`` SVD components.

    :param input_values: iterable of documents (strings) fed to a default
        ``TfidfVectorizer``.
    :param output_num: number of ``TruncatedSVD`` components to keep.
    :param output_prefix: prefix used in the output column names.
    :param seed: ``random_state`` passed to ``TruncatedSVD``.
    :return: DataFrame with one row per document and columns
        ``'<output_prefix>_tfidf_<i>'`` for ``i`` in ``range(output_num)``.
    """
    doc_term = TfidfVectorizer().fit_transform(input_values)
    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_tfidf_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(doc_term), columns=column_names)


def count2vec(input_values, output_num, output_prefix, seed=1024):
    """
    Count-vectorize documents and reduce them to ``output_num`` SVD components.

    :param input_values: iterable of documents (strings) fed to a default
        ``CountVectorizer``.
    :param output_num: number of ``TruncatedSVD`` components to keep.
    :param output_prefix: prefix used in the output column names.
    :param seed: ``random_state`` passed to ``TruncatedSVD``.
    :return: DataFrame with one row per document and columns
        ``'<output_prefix>_countvec_<i>'`` for ``i`` in ``range(output_num)``.
    """
    doc_term = CountVectorizer().fit_transform(input_values)
    reducer = TruncatedSVD(n_components=output_num, n_iter=20, random_state=seed)
    column_names = ['{}_countvec_{}'.format(output_prefix, i) for i in range(output_num)]
    return pd.DataFrame(reducer.fit_transform(doc_term), columns=column_names)


def get_geohash_tfidf(df, group_id, group_target, num, precision=7):
    """
    Build TF-IDF and count-vector SVD features from per-group geohash sequences.

    Side effect: adds (or overwrites) column ``group_target`` on ``df`` in
    place with the geohash of each row's ``'lat'``/``'lon'`` values.

    :param df: DataFrame with ``'lat'``, ``'lon'`` and ``group_id`` columns.
    :param group_id: column to group rows by.
    :param group_target: name of the geohash column written to ``df``; also
        used as the prefix of the output feature columns.
    :param num: number of SVD components for each of the two encodings.
    :param precision: geohash length in characters (default 7, the value
        previously hard-coded).
    :return: DataFrame with ``group_id`` followed by ``num`` TF-IDF and
        ``num`` count-vector SVD columns, one row per group.
    """
    # tfidf_df = get_geohash_tfidf(df, 'ID', 'lat_lon', 30)
    # Iterate over the two coordinate columns directly; a row-wise
    # DataFrame.apply builds a Series per row and is far slower.
    df[group_target] = [
        geohash_encode(lat, lon, precision) for lat, lon in zip(df['lat'], df['lon'])
    ]
    tmp = df.groupby(group_id)[group_target].agg(list).reset_index()
    # One space-separated "document" of geohashes per group.
    tmp[group_target] = tmp[group_target].apply(' '.join)

    tfidf_tmp = tfidf(tmp[group_target], num, group_target)
    count_tmp = count2vec(tmp[group_target], num, group_target)
    return pd.concat([tmp[[group_id]], tfidf_tmp, count_tmp], axis=1)


def get_grad_tfidf(df, group_id, group_target, num):
    """
    Build TF-IDF SVD features from per-group lat/lon gradient sequences.

    For every group the numerical gradients of ``'lat'`` and ``'lon'`` are
    computed with ``np.gradient``, rounded to 4 decimals, converted to text
    with ``str`` and fed to ``tfidf``.

    :param df: DataFrame with ``'lat'``, ``'lon'`` and ``group_id`` columns.
    :param group_id: column to group rows by.
    :param group_target: name of the text column; also the feature prefix.
    :param num: number of SVD components.
    :return: DataFrame with ``group_id`` followed by ``num`` TF-IDF SVD
        columns, one row per group.
    """
    # grad_tfidf = get_grad_tfidf(df, 'ID', 'grad', 30)

    def _rounded_gradient(values):
        # np.gradient needs at least two samples; a single-point group has no
        # movement, so report zeros instead of raising ValueError.
        if len(values) < 2:
            return np.zeros(len(values))
        return np.round(np.gradient(values), 4)

    grouped = df.groupby(group_id)
    lat_grad = grouped['lat'].apply(_rounded_gradient)
    lon_grad = grouped['lon'].apply(_rounded_gradient)

    grad_df = lat_grad.reset_index()
    # lon_grad is indexed by group id while grad_df now has a 0..n-1 index.
    # Assigning the Series directly would align on that index and attach the
    # wrong group's gradients (or NaN), so align explicitly on the group id.
    grad_df['lon'] = lon_grad.reindex(grad_df[group_id]).values

    # NOTE(review): str() of a NumPy array longer than NumPy's print threshold
    # is summarized with '...', so long tracks contribute only their first and
    # last few values - confirm this is intended.
    grad_df[group_target] = [
        str(lat_arr) + ' ' + str(lon_arr)
        for lat_arr, lon_arr in zip(grad_df['lat'], grad_df['lon'])
    ]

    tfidf_tmp = tfidf(grad_df[group_target], num, group_target)
    return pd.concat([grad_df[[group_id]], tfidf_tmp], axis=1)


def get_sample_tfidf(df, group_id, group_target, num):
    """
    Build TF-IDF SVD features from a 10% sample of each group's ``'lat_lon'`` values.

    Requires ``df`` to already contain a ``'lat_lon'`` column of strings.
    Sampling uses ``frac=0.1`` with ``random_state=1`` inside every group.

    :param df: DataFrame with ``group_id`` and ``'lat_lon'`` columns.
    :param group_id: column to group rows by.
    :param group_target: name given to the sampled column; also the feature prefix.
    :param num: number of SVD components.
    :return: DataFrame with ``group_id`` followed by ``num`` TF-IDF SVD
        columns, one row per group that kept at least one sampled row.
    """
    # sample_tfidf = get_sample_tfidf(df, 'ID', 'sample', 30)
    sampled = (
        df.groupby(group_id)['lat_lon']
        .apply(lambda grp: grp.sample(frac=0.1, random_state=1))
        .reset_index()
    )
    # 'level_1' holds the original row index of each sampled record.
    sampled = sampled.drop(columns='level_1')
    sampled.columns = [group_id, group_target]

    docs = sampled.groupby(group_id)[group_target].agg(list).reset_index()
    docs[group_target] = docs[group_target].apply(' '.join)

    svd_feats = tfidf(docs[group_target], num, group_target)
    return pd.concat([docs[[group_id]], svd_feats], axis=1)


# Setting workers=1 makes the trained word vectors reproducible, at the cost of
# some speed; adjust this parameter if reproducibility is not required.
def w2v_feat(df, group_id, feat, length):
    """
    Train Word2Vec on per-group token sequences and average the vectors per group.

    :param df: DataFrame with ``group_id`` and ``feat`` columns; ``feat``
        holds the tokens (one per row).
    :param group_id: column to group rows by; each group is one "sentence".
    :param feat: token column.
    :param length: dimensionality of the word vectors.
    :return: DataFrame with ``group_id`` and columns ``'w2v_<m>_mean'`` for
        ``m`` in ``range(length)``: the per-group mean of each dimension.
    """
    # w2v_df = w2v_feat(df, 'ID', 'lat_lon', 30)
    print('start word2vec ...')
    data_frame = df.groupby(group_id)[feat].agg(list).reset_index()
    model = Word2Vec(data_frame[feat].values, size=length, window=5, min_count=1, sg=1, hs=1,
                     workers=1, iter=10, seed=1, hashfxn=hashfxn)
    # Look vectors up through model.wv: indexing the model object itself is
    # deprecated in gensim and returns the same vectors.
    word_vectors = model.wv
    # Replace each token list by a (tokens x length) frame of its vectors.
    data_frame[feat] = data_frame[feat].apply(
        lambda tokens: pd.DataFrame([word_vectors[tok] for tok in tokens]))
    for m in range(length):
        data_frame['w2v_{}_mean'.format(m)] = data_frame[feat].apply(lambda x: x[m].mean())
    del data_frame[feat]
    return data_frame


def d2v_feat(df, group_id, feat, length):
    """
    Train Doc2Vec with one tagged document per group and return the document vectors.

    :param df: DataFrame with ``group_id`` and ``feat`` columns.
    :param group_id: column to group rows by; its value is used as the document tag.
    :param feat: token column.
    :param length: dimensionality of the document vectors.
    :return: DataFrame with ``group_id`` and columns ``'<feat>_d2v_<i>'`` for
        ``i`` in ``range(length)``.
    """
    print('start doc2vec ...')
    data_frame = df.groupby(group_id)[feat].agg(list).reset_index()
    documents = []
    for doc_tag, tokens in zip(data_frame[group_id].values, data_frame[feat]):
        documents.append(TaggedDocument(tokens, [doc_tag]))
    # NOTE(review): `sg` is a Word2Vec option; confirm the installed gensim's
    # Doc2Vec accepts it.
    model = Doc2Vec(documents, vector_size=length, window=5, min_count=1, workers=1, seed=1,
                    hashfxn=hashfxn, epochs=10, sg=1, hs=1)
    # Each vector is serialized to a comma-separated string and parsed back to
    # numbers; the round-trip is kept because it determines the output values.
    vectors_as_text = data_frame[group_id].apply(
        lambda x: ','.join([str(i) for i in model[x]]))
    doc_df = vectors_as_text.str.split(',', expand=True).apply(pd.to_numeric)
    doc_df.columns = ['{}_d2v_{}'.format(feat, i) for i in range(length)]
    return pd.concat([data_frame[[group_id]], doc_df], axis=1)


def q10(x):
    """Return the 10th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.1)


def q20(x):
    """Return the 20th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.2)


def q30(x):
    """Return the 30th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.3)


def q40(x):
    """Return the 40th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.4)


def q60(x):
    """Return the 60th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.6)


def q70(x):
    """Return the 70th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.7)


def q80(x):
    """Return the 80th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.8)


def q90(x):
    """Return the 90th percentile of ``x`` via ``x.quantile``."""
    return x.quantile(q=0.9)

In [3]:
# Load the round-2 train and test CSVs from paths relative to the working directory.
# NOTE(review): in the heads displayed in the next two cells, train lat/lon look like
# degrees (e.g. 21.295, 115.563) while test lat/lon are in the millions
# (e.g. 6392512.0, 5475100.0) - presumably a different coordinate system; confirm and
# convert before the two frames are concatenated and fed to the same features.
train = pd.read_csv('../../input/round2_train.csv')
test = pd.read_csv('../../input/round2_test.csv')

In [4]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,ID,lat,lon,speed,direction,time,type
0,20000,21.295,115.563,2.32,50,0912 23:59:55,拖网
1,20000,21.289,115.557,3.29,30,0912 23:49:54,拖网
2,20000,21.282,115.551,3.08,40,0912 23:39:26,拖网
3,20000,21.274,115.547,3.51,20,0912 23:29:49,拖网
4,20000,21.267,115.543,3.08,10,0912 23:19:48,拖网


In [5]:
# Preview the first rows of the test frame.
test.head()

Unnamed: 0,ID,lat,lon,speed,direction,time,type
0,10000,6392512.0,5475100.0,0.27,0,1120 23:47:31,unknown
1,10000,6392512.0,5475100.0,0.27,0,1120 23:37:31,unknown
2,10000,6392512.0,5475100.0,0.05,0,1120 23:27:31,unknown
3,10000,6392512.0,5475100.0,0.05,0,1120 23:17:31,unknown
4,10000,6392512.0,5475100.0,0.05,0,1120 23:07:31,unknown


In [6]:
# Stack train and test rows into one frame with a fresh 0..n-1 index so the
# features are computed over both at once.
df = pd.concat([train, test], axis=0, ignore_index=True)
# Drop the separate frames and trigger garbage collection to release memory.
del train, test
gc.collect()
df.head()

Unnamed: 0,ID,lat,lon,speed,direction,time,type
0,20000,21.295,115.563,2.32,50,0912 23:59:55,拖网
1,20000,21.289,115.557,3.29,30,0912 23:49:54,拖网
2,20000,21.282,115.551,3.08,40,0912 23:39:26,拖网
3,20000,21.274,115.547,3.51,20,0912 23:29:49,拖网
4,20000,21.267,115.543,3.08,10,0912 23:19:48,拖网


In [7]:
# Order records by vessel ID and then by time. 'time' is still the raw
# 'MMDD HH:MM:SS' string here, so the ordering is lexicographic.
df = df.sort_values(by=['ID', 'time'])
df.head()

Unnamed: 0,ID,lat,lon,speed,direction,time,type
5291016,9000,6265080.0,5251556.0,0.32,242,1028 00:01:38,unknown
5291015,9000,6265080.0,5251556.0,0.0,104,1028 00:11:39,unknown
5291014,9000,6265080.0,5251556.0,0.22,0,1028 00:21:45,unknown
5291013,9000,6265080.0,5251556.0,0.11,0,1028 00:31:39,unknown
5291012,9000,6265080.0,5251556.0,0.11,301,1028 00:41:42,unknown


In [8]:
# Parse the raw 'MMDD HH:MM:SS' strings into timestamps. The data carries no
# year, so 2019 is prepended. Vectorized concatenation plus an explicit format
# replaces a per-row lambda and pandas' format inference.
df['time'] = pd.to_datetime('2019' + df['time'], format='%Y%m%d %H:%M:%S')

In [9]:
# Per-vessel first differences between consecutive records (NaN on the first
# record of every ID).
df['lat_diff'] = df.groupby('ID')['lat'].diff(1)
df['lon_diff'] = df.groupby('ID')['lon'].diff(1)
df['speed_diff'] = df.groupby('ID')['speed'].diff(1)
# Use total_seconds(): `.dt.seconds` is only the seconds component of the
# timedelta (days are dropped), so a gap of one day or more would wrap around
# and look like a short gap.
df['diff_minutes'] = df.groupby('ID')['time'].diff(1).dt.total_seconds() // 60

In [10]:
# Flag a record as "anchored" (1) when the position change, the speed and the
# time gap to the previous record are all below their thresholds, else 0.
# Rows with NaN differences (first record of each ID) compare False and get 0.
# Vectorized boolean mask: same result as the row-wise apply, without building
# a Series per row.
# NOTE(review): the comparisons are signed, so any negative lat/lon difference
# passes the `< 0.01` test - confirm whether absolute differences were intended.
df['anchor'] = (
    (df['lat_diff'] < 0.01)
    & (df['lon_diff'] < 0.01)
    & (df['speed'] < 0.1)
    & (df['diff_minutes'] < 10)
).astype('int64')

In [11]:
# Records where both the lat and the lon difference are non-zero. NaN
# differences (first record of each ID) also count as non-zero.
lat_lon_neq_zero = df.loc[df['lat_diff'].ne(0) & df['lon_diff'].ne(0)]
# Records whose speed differs from the previous record (NaN included).
speed_neg_zero = df.loc[df['speed_diff'].ne(0)]

In [12]:
# Encode the gear type as an integer label; test rows ('unknown') become -1.
df['type'] = df['type'].map({'围网': 0, '刺网': 1, '拖网': 2, 'unknown': -1})
# One row per ID: 'label' is the mean of the encoded type and 'cnt' the number
# of records. Aggregate by name and rename afterwards; passing a renaming dict
# to SeriesGroupBy.agg is deprecated and raises on newer pandas releases.
group_df = (
    df.groupby('ID')['type']
    .agg(['mean', 'count'])
    .rename(columns={'mean': 'label', 'count': 'cnt'})
    .reset_index()
)

In [13]:
# Anchor information: number of anchor-flagged records per ID.
anchor_df = df.groupby('ID')['anchor'].sum().reset_index()
anchor_df.columns = ['ID', 'anchor_cnt']

In [14]:
# Attach the anchor count to every ID and express it as a share of that ID's records.
group_df = group_df.merge(anchor_df, how='left', on='ID')
group_df = group_df.assign(anchor_ratio=group_df['anchor_cnt'] / group_df['cnt'])

## 统计特征

In [15]:
# Aggregations applied per ID, and the matching suffixes used to name the
# resulting columns. The two lists must stay in the same order.
stat_functions = [
    'min', 'max', 'mean', 'median', 'nunique',
    q10, q20, q30, q40, q60, q70, q80, q90,
]
stat_ways = ['min', 'max', 'mean', 'median', 'nunique'] + [
    'q_{}'.format(pct) for pct in (10, 20, 30, 40, 60, 70, 80, 90)
]

In [16]:
# Columns that receive the per-ID summary statistics.
stat_cols = ['lat', 'lon', 'speed', 'direction']

In [17]:
# Per-ID summary statistics over all records. The aggregated columns are
# flattened to '<column>_<stat>' in the same column-major order as the result.
group_tmp = df.groupby('ID')[stat_cols].agg(stat_functions)
group_tmp = group_tmp.reset_index()
group_tmp.columns = ['ID'] + [
    '_'.join((col, way)) for col in stat_cols for way in stat_ways
]

In [18]:
# The same summary statistics, restricted to records whose lat and lon
# differences are both non-zero; columns are prefixed 'pos_neq_zero_'.
lat_lon_neq_group = lat_lon_neq_zero.groupby('ID')[stat_cols].agg(stat_functions)
lat_lon_neq_group = lat_lon_neq_group.reset_index()
lat_lon_neq_group.columns = ['ID'] + [
    'pos_neq_zero_' + col + '_' + way for col in stat_cols for way in stat_ways
]

In [19]:
# The same summary statistics, restricted to records whose speed changed;
# columns are prefixed 'speed_neq_zero_'.
speed_neg_zero_group = speed_neg_zero.groupby('ID')[stat_cols].agg(stat_functions)
speed_neg_zero_group = speed_neg_zero_group.reset_index()
speed_neg_zero_group.columns = ['ID'] + [
    'speed_neq_zero_' + col + '_' + way for col in stat_cols for way in stat_ways
]

In [20]:
# Left-join the three statistics tables onto the per-ID frame. IDs without any
# qualifying record in a filtered table get NaN for that table's columns.
group_df = (
    group_df
    .merge(group_tmp, on='ID', how='left')
    .merge(lat_lon_neq_group, on='ID', how='left')
    .merge(speed_neg_zero_group, on='ID', how='left')
)

In [21]:
# Most frequently recorded positions per ID; the top 3 are kept.
# Count the records at every (ID, lat, lon). The count is named through
# reset_index(name=...) because passing a renaming dict to SeriesGroupBy.agg
# is deprecated and raises on newer pandas releases.
mode_df = df.groupby(['ID', 'lat', 'lon'])['time'].count().reset_index(name='mode_cnt')
# Rank positions within each ID by count, highest first; method='first' breaks
# ties by order of appearance so every rank is unique.
mode_df['rank'] = mode_df.groupby('ID')['mode_cnt'].rank(method='first', ascending=False)
for i in range(1, 4):
    # drop() returns a new frame, so the filtered slice is never mutated
    # (deleting a column from the slice triggers SettingWithCopyWarning).
    tmp_df = mode_df[mode_df['rank'] == i].drop(columns='rank')
    tmp_df.columns = ['ID', 'rank{}_mode_lat'.format(i), 'rank{}_mode_lon'.format(i), 'rank{}_mode_cnt'.format(i)]
    group_df = group_df.merge(tmp_df, on='ID', how='left')

In [22]:
# Geohash TF-IDF / count-vector features per ID (30 SVD components each).
# get_geohash_tfidf also writes the 'lat_lon' geohash column onto df in place;
# the sample-TF-IDF and word2vec cells below read that column, so this cell
# must run before them.
tfidf_df = get_geohash_tfidf(df, 'ID', 'lat_lon', 30)
group_df = group_df.merge(tfidf_df, on='ID', how='left')
print('geohash tfidf finished.')

geohash tfidf finished.


In [23]:
# TF-IDF features of the per-ID lat/lon gradient text (30 SVD components),
# left-joined onto the per-ID frame.
grad_tfidf = get_grad_tfidf(df, 'ID', 'grad', 30)
group_df = group_df.merge(grad_tfidf, on='ID', how='left')
print('gradient tfidf finished.')

gradient tfidf finished.


In [24]:
# TF-IDF features of a 10% per-ID sample of the 'lat_lon' geohashes
# (30 SVD components). Requires df['lat_lon'], which the geohash TF-IDF cell creates.
sample_tfidf = get_sample_tfidf(df, 'ID', 'sample', 30)
group_df = group_df.merge(sample_tfidf, on='ID', how='left')
print('sample tfidf finished.')

sample tfidf finished.


In [25]:
# Mean word2vec embedding (30 dimensions) of each ID's 'lat_lon' geohash
# sequence. Requires df['lat_lon'], which the geohash TF-IDF cell creates.
w2v_df = w2v_feat(df, 'ID', 'lat_lon', 30)
group_df = group_df.merge(w2v_df, on='ID', how='left')
print('word2vec finished.')

start word2vec ...
word2vec finished.
