In [15]:
import gc

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report, f1_score
import lightgbm as lgb
from collections import Counter
# import warnings

# warnings.filterwarnings("ignore")


In [16]:
def gen_feats(txt_path, mode='is_train', out_dir=None):
    """Parse one raw traffic file into a flat feature table and save it as CSV.

    Each input line is read as six ';'-separated fields:

    * field 0: ``"<link> <label> <current_slice_id> <future_slice_id>"``
    * field 1: five space-separated "recent" records
    * fields 2-5: four blocks of space-separated "history" records

    Every record is used as ``"<key>:<speed>,<eta>,<state>,<cnt>"``; only the
    text after the last ':' is parsed.

    Generated columns:

    * ``link``, ``label``, ``current_slice_id``, ``future_slice_id``, ``time_diff``
    * ``rec_8_*``, ``rec_6_*``, ``rec_4_*``, ``rec_2_*`` and ``curr_*`` with the
      suffixes ``speed``, ``eta``, ``cnt`` and ``state`` (kept as the raw text
      tokens from the file)
    * ``his_28_*``, ``his_21_*``, ``his_14_*``, ``his_7_*``: min/max/mean/std of
      speed, eta and cnt, plus the most frequent ``state``

    Args:
        txt_path: Path of the raw ';'-separated text file.
        mode: ``'is_train'`` parses the label (values above 3 are collapsed to 3,
            then shifted to start at 0). Any other value sets ``label`` to -1 and
            writes to ``is_test.csv``.
        out_dir: Prefix under which the ``feature/`` directory lives. It is
            concatenated as-is, so it should end with a path separator. When
            ``None`` the notebook-level global ``path`` is used, as before.

    Returns:
        The path of the CSV file that was written.
    """
    import os  # local import: the notebook's import cell does not provide it

    def split_records(cell):
        # "k1:a,b,c,d k2:a,b,c,d ..." -> [[a, b, c, d], [a, b, c, d], ...]
        return [rec.split(':')[-1].split(',') for rec in cell.split(' ')]

    df_ = pd.read_csv(txt_path, sep=';', header=None)

    # Split the header field once instead of once per derived column.
    head = df_[0].apply(lambda x: x.split(' '))
    df_['link'] = head.apply(lambda f: f[0])
    if mode == 'is_train':
        raw_label = head.apply(lambda f: int(f[1]))
        df_['label'] = raw_label.clip(upper=3) - 1
    else:
        df_['label'] = -1
    df_['current_slice_id'] = head.apply(lambda f: int(f[2]))
    df_['future_slice_id'] = head.apply(lambda f: int(f[3]))

    df_.drop([0], axis=1, inplace=True)
    df_['time_diff'] = df_['future_slice_id'] - df_['current_slice_id']

    recent = df_[1].apply(split_records)
    # 'curr_state' is created first so that it keeps its position right after
    # 'time_diff'; feature files written earlier have it there, and downstream
    # feature lists are built from the column order.
    df_['curr_state'] = recent.apply(lambda recs: recs[4][2])
    for pos in range(5):
        # The last record is the current slice; earlier ones are 8, 6, 4, 2 back.
        flg = 'curr' if pos == 4 else f'rec_{(4 - pos) * 2}'
        for field, idx in (('speed', 0), ('eta', 1), ('cnt', 3), ('state', 2)):
            df_[f'{flg}_{field}'] = recent.apply(lambda recs, p=pos, i=idx: recs[p][i])
    print('recent_gen complete')

    # dtypes are kept as they were so regenerated features match existing files.
    # NOTE(review): float16/int16 limit precision and range -- confirm they are
    # wide enough for the eta and cnt values in the data.
    stat_fields = (('speed', 0, 'float16'), ('eta', 1, 'float16'), ('cnt', 3, 'int16'))
    for col in range(2, 6):
        flg = f'his_{(6 - col) * 7}'
        history = df_[col].apply(split_records)
        for field, idx, dtype in stat_fields:
            values = history.apply(
                lambda recs, i=idx, d=dtype: np.array([r[i] for r in recs], dtype=d))
            df_[f'{flg}_{field}_min'] = values.apply(lambda a: a.min())
            df_[f'{flg}_{field}_max'] = values.apply(lambda a: a.max())
            df_[f'{flg}_{field}_mean'] = values.apply(lambda a: a.mean())
            df_[f'{flg}_{field}_std'] = values.apply(lambda a: a.std())
        # Most frequent state; ties resolve to the state seen first.
        df_[f'{flg}_state'] = history.apply(
            lambda recs: Counter(int(r[2]) for r in recs).most_common(1)[0][0])
    df_.drop([1, 2, 3, 4, 5], axis=1, inplace=True)
    print('history_gen complete')

    # `path` is only looked up when no explicit out_dir is given.
    base = path if out_dir is None else out_dir
    if mode == 'is_train':
        save_path = f"{base}feature/{mode}_{txt_path.split('/')[-1][:-4]}.csv"
    else:
        save_path = f"{base}feature/is_test.csv"
    # to_csv does not create missing directories.
    os.makedirs(os.path.dirname(save_path), exist_ok=True)
    df_.to_csv(save_path, index=False)
    return save_path


In [17]:
def f1_score_eval(preds, valid_df):
    """Custom LightGBM evaluation metric: class-weighted F1 for three classes.

    Args:
        preds: Predicted class scores. Either a flat array laid out class by
            class (all rows of class 0, then class 1, then class 2), or a 2-D
            array of shape ``(n_samples, n_classes)`` as passed by newer
            LightGBM releases.
        valid_df: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        A tuple ``('f1_score', score, True)`` where ``score`` is
        ``0.2 * F1(class 0) + 0.2 * F1(class 1) + 0.6 * F1(class 2)`` and the
        final ``True`` marks the metric as higher-is-better.
    """
    labels = valid_df.get_label()
    preds = np.asarray(preds)
    if preds.ndim == 2:
        # One row per sample already; reshaping would mix samples and classes.
        pred_class = np.argmax(preds, axis=1)
    else:
        pred_class = np.argmax(preds.reshape(3, -1), axis=0)
    # Fixing the label set keeps three scores even if a class is absent from
    # this validation fold; otherwise scores[2] could raise IndexError.
    scores = f1_score(y_true=labels, y_pred=pred_class, labels=[0, 1, 2], average=None)
    score = scores[0] * 0.2 + scores[1] * 0.2 + scores[2] * 0.6
    return 'f1_score', score, True


In [18]:
def lgb_train(train_: pd.DataFrame, test_: pd.DataFrame, use_train_feats: list, id_col: str, label: str,
              n_splits: int, split_rs: int, is_shuffle=True, use_cart=False, cate_cols=None, ground_truth_test=False) -> pd.DataFrame:
    """Train a multiclass LightGBM model with id-grouped K-fold CV and predict the test set.

    Folds are drawn over the unique values of ``id_col``, so every row sharing
    an id lands in the same fold. Each fold's model predicts its validation
    rows (out-of-fold) and the test set; test probabilities are averaged over
    the folds.

    Side effects on the inputs:
        * ``train_`` gains ``f'{label}_pred'`` holding the out-of-fold class.
        * ``test_`` gains ``f'{label}_pred'`` holding the predicted class.
        * Unless ``ground_truth_test`` is set, ``test_[label]`` is overwritten
          with the predicted class plus one.

    Args:
        train_: Training frame containing ``use_train_feats``, ``id_col`` and ``label``.
        test_: Test frame containing ``use_train_feats``, ``id_col``,
            ``current_slice_id`` and ``future_slice_id``.
        use_train_feats: Feature columns fed to the model.
        id_col: Column whose unique values are split into folds.
        label: Target column; scoring indexes classes 0, 1 and 2.
        n_splits: Number of folds.
        split_rs: Seed for the fold shuffle (ignored when ``is_shuffle`` is False).
        is_shuffle: Whether to shuffle ids before splitting.
        use_cart: When True, ``cate_cols`` is passed as ``categorical_feature``.
        cate_cols: Categorical feature names used when ``use_cart`` is True.
        ground_truth_test: When True, ``test_[label]`` holds real labels and a
            test report is printed instead of overwriting the column.

    Returns:
        ``test_[[id_col, 'current_slice_id', 'future_slice_id', label]]``.
    """
    if not cate_cols:
        cate_cols = []
    print('data shape:\ntrain--{}\ntest--{}'.format(train_.shape, test_.shape))
    print('Use {} features ...'.format(len(use_train_feats)))
    print('Use lightgbm to train ...')
    n_class = train_[label].nunique()
    train_[f'{label}_pred'] = 0
    if ground_truth_test:
        test_[f'{label}_pred'] = 0
    test_pred = np.zeros((test_.shape[0], n_class))
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = use_train_feats

    # KFold rejects a random_state when shuffling is off, so only pass the seed
    # when it is actually used.
    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=split_rs if is_shuffle else None)
    train_user_id = train_[id_col].unique()

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',  # built-in metrics off; early stopping follows f1_score_eval
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': -1,
        'verbose': -1
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print('the {} training start ...'.format(n_fold))
        # Row masks are computed once per fold and reused for features, labels
        # and the out-of-fold write-back.
        train_mask = train_[id_col].isin(train_user_id[train_idx])
        valid_mask = train_[id_col].isin(train_user_id[valid_idx])
        train_x, train_y = train_.loc[train_mask, use_train_feats], train_.loc[train_mask, label]
        valid_x, valid_y = train_.loc[valid_mask, use_train_feats], train_.loc[valid_mask, label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')

        if use_cart:
            dtrain = lgb.Dataset(train_x, label=train_y, categorical_feature=cate_cols)
            dvalid = lgb.Dataset(valid_x, label=valid_y, categorical_feature=cate_cols)
        else:
            dtrain = lgb.Dataset(train_x, label=train_y)
            dvalid = lgb.Dataset(valid_x, label=valid_y)

        # NOTE(review): early_stopping_rounds / verbose_eval are lgb.train keyword
        # arguments in LightGBM 3.x; later major versions use callbacks instead --
        # confirm against the installed version before upgrading.
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=5000,
            valid_sets=[dvalid],
            valid_names=['valid'],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=f1_score_eval
        )
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance(importance_type='gain')
        train_.loc[valid_mask, f'{label}_pred'] = np.argmax(
            clf.predict(valid_x, num_iteration=clf.best_iteration), axis=1)
        test_pred += clf.predict(test_[use_train_feats], num_iteration=clf.best_iteration) / folds.n_splits

    report = f1_score(train_[label], train_[f'{label}_pred'], average=None)
    print(classification_report(train_[label], train_[f'{label}_pred'], digits=4))
    print('Train score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    test_[f'{label}_pred'] = np.argmax(test_pred, axis=1)

    if ground_truth_test:
        report = f1_score(test_[label], test_[f'{label}_pred'], average=None)
        print(classification_report(test_[label], test_[f'{label}_pred'], digits=4))
        print('Test score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    else:
        # Shift the 0-based predicted class by one for the returned label column.
        test_[label] = np.argmax(test_pred, axis=1) + 1
    fold_cols = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[fold_cols].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    # Scope the display option to this print instead of changing it globally.
    with pd.option_context('display.max_rows', None):
        print(fold_importance_df[['Feature', 'avg_imp']].head(20))
    return test_[[id_col, 'current_slice_id', 'future_slice_id', label]]


In [22]:
if __name__ == "__main__":
    # Prefix for the feature/ and adversarial_validation/ directories.
    path = './'

    # Raw training file -> number of rows to read (via `nrows`) from that
    # file's adversarial-validation CSV when the training set is assembled.
    train_path = {
        './traffic/20190701.txt': 61976,
        './traffic/20190702.txt': 63637,
        './traffic/20190703.txt': 63751,
        './traffic/20190704.txt': 71428,
        './traffic/20190705.txt': 74420,
        './traffic/20190706.txt': 202425,
        './traffic/20190707.txt': 66959,
        './traffic/20190708.txt': 64188,
    }

    test_path = './traffic/20190709.txt'
    # test_path = './test.txt'

    # True when the test file is a labelled training-format file.
    ground_truth_test_ = False
    

In [23]:
    # To (re)build the training features, call gen_feats once per file. train_path
    # is a dict keyed by file path, so it cannot be passed to gen_feats directly:
    # for train_file in train_path:
    #     gen_feats(train_file, mode='is_train')
    # With ground_truth_test_ set, the test file is processed in training mode so
    # its labels are parsed and it is saved under the is_train_<name>.csv pattern.
    gen_feats(test_path, mode='is_test' if not ground_truth_test_ else 'is_train')
    

recent_gen complete
history_gen complete


'./feature/is_test.csv'

In [24]:
    # Static per-link attributes (tab-separated, no header row); the 'level'
    # column is discarded right after loading.
    attr = pd.read_csv(
        'attr.txt',
        sep='\t',
        header=None,
        names=[
            'link', 'length', 'direction', 'path_class', 'speed_class',
            'LaneNum', 'speed_limit', 'level', 'width',
        ],
    ).drop(columns=['level'])


In [25]:
    # Assemble the training set: for every training file, keep only the feature
    # rows whose positional 'id' appears in the first n_row rows of that file's
    # adversarial-validation CSV (right join on 'id').
    # Frames are collected in a list and concatenated once; growing `train`
    # inside the loop copies all accumulated rows on every iteration.
    train_parts = []
    train_tmp = train_row = None
    for trainp, n_row in train_path.items():
        file_tag = trainp.split('/')[-1][:-4]
        # Use the same `path` prefix that gen_feats writes to.
        train_tmp = pd.read_csv(f"{path}feature/is_train_{file_tag}.csv")
        train_tmp['id'] = range(0, train_tmp.shape[0])
        train_row = pd.read_csv(f"{path}adversarial_validation/adversarial_validation_{file_tag}.csv", nrows=n_row)
        train_tmp = train_tmp.merge(train_row, on='id', how='right')
        train_parts.append(train_tmp.drop(['id', 'preds'], axis=1))
    train = pd.concat(train_parts, axis=0, ignore_index=True)
    del train_tmp, train_parts

    test = pd.read_csv(f"{path}feature/is_test.csv" if not ground_truth_test_ else f"{path}feature/is_train_{test_path.split('/')[-1][:-4]}.csv")
    # Attach the static link attributes to both sets.
    train = train.merge(attr, on='link', how='left')
    test = test.merge(attr, on='link', how='left')
    print(f'test size: {test.shape[0]}')
    del attr, train_row
    gc.collect()


test size: 504891


0

In [None]:
    # Model features: every training column except the identifier, the target
    # (and its prediction column) and the features listed below.
    use_cols = [
        col for col in train.columns
        if col not in {
            'link', 'label', 'label_pred',
            'curr_cnt', 'rec_2_cnt', 'rec_4_cnt', 'rec_6_cnt', 'rec_8_cnt',
            'rec_2_eta', 'rec_4_eta', 'rec_6_eta',
            'width', 'speed_class',
        }
    ]

    sub = lgb_train(
        train_=train,
        test_=test,
        use_train_feats=use_cols,
        id_col='link',
        label='label',
        n_splits=5,
        split_rs=2020,
        ground_truth_test=ground_truth_test_,
    )


In [27]:
    # Predictions are written only when the test file is not a labelled
    # training file; the file name carries the number of training files used.
    if not ground_truth_test_:
        sub.to_csv(
            path_or_buf=f'test_result_{len(train_path)}.csv',
            index=False,
            encoding='utf8',
        )
