In [1]:
import gc

import numpy as np
import pandas as pd
import os.path

import lightgbm as lgb
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import KFold


In [2]:
def show_files(path_):
    """
    List the entries found directly inside a directory.

    :param path_: str
        Directory to list.
    :return: list
        One path per entry reported by ``os.listdir``, each joined onto ``path_``.
        Files and sub-directories are both included; the listing is not recursive.
    """
    return [os.path.join(path_, entry) for entry in os.listdir(path_)]


def del_files(path_):
    """
    Recursively delete everything below a directory while keeping the directories themselves.

    Symbolic links are unlinked instead of being followed, so a link that points
    at a directory outside ``path_`` can never cause files outside the tree to be
    deleted, and a dangling link is removed instead of raising.

    :param path_: str
        Directory whose content should be removed. It must exist.
    """
    for file in os.listdir(path_):
        cur_path = os.path.join(path_, file)
        # isdir() follows links, so the islink() check is what keeps the
        # recursion inside the tree rooted at path_.
        if os.path.isdir(cur_path) and not os.path.islink(cur_path):
            del_files(cur_path)
        else:
            # Regular files, links (valid or dangling) and any other non-directory entry.
            os.remove(cur_path)


In [3]:
def f1_score_eval(preds, valid_df):
    """
    Custom evaluation function (passed as ``feval`` to ``lgb.train``): weighted per-class F1.

    :param preds: array
        Flat prediction array; it is viewed as 3 rows and the arg-max over those
        rows is taken as the predicted class of each sample
        (presumably one row per class, class-major layout - verify against the
        installed LightGBM version).
    :param valid_df:
        Object exposing ``get_label()`` that returns the true labels.
    :return: tuple
        ``('f1_score', value, True)`` where ``value`` blends the first three
        per-class F1 scores with weights 0.2 / 0.2 / 0.6 and ``True`` marks the
        metric as higher-is-better.
    """
    y_true = valid_df.get_label()
    per_class_rows = preds.reshape(3, -1)
    y_pred = np.argmax(per_class_rows, axis=0)
    per_class_f1 = f1_score(y_true=y_true, y_pred=y_pred, average=None)
    weighted = per_class_f1[0] * 0.2 + per_class_f1[1] * 0.2 + per_class_f1[2] * 0.6
    return 'f1_score', weighted, True


In [4]:
def split_time_diff(data_: pd.DataFrame, split_col_name: str, n_diff_split: int = 1, mode: str = 'is_train',
                    out_dir: str = None):
    """
    Split dataframe into different parts according to 'time_diff',
    and write to different feather(mode 'is_train')/csv(any other mode, e.g. 'is_test') format files.
    Note that pyarrow needed to be installed first (feather output).

    Rows are selected with a boolean mask on the split column, so the input frame
    is neither copied nor modified and a non-unique index cannot make rows vanish.

    :param data_: DataFrame
        DataFrame needs to be split.
    :param split_col_name: str
        Column name by which the data is split.
    :param n_diff_split: int, default 1
        The number of time_diff values in each part after splitting (the last part may hold fewer).
    :param mode: str, default 'is_train'
        The prefix of split data files. 'is_train' writes feather files with a fresh
        0..n-1 index and appends to a file that already exists; any other value
        writes csv files that keep the original index and overwrite existing files.
    :param out_dir: str, default None
        Existing directory that receives the files. When None, the module-level
        variable ``path`` is read and '<path>diff_feature/' is used.
    :return: list
        A list object contains strings of 'time_diff' values joined by '_' of each feather/csv files.
    :raises ValueError:
        If n_diff_split is smaller than 1.
    """
    if n_diff_split < 1:
        raise ValueError(f'n_diff_split must be a positive integer, got {n_diff_split}')
    is_train = mode == 'is_train'
    if out_dir is None:
        # `path` is the notebook-level base directory.
        out_dir = f'{path}diff_feature/'
    extension = '.ft' if is_train else '.csv'

    split_values = data_[split_col_name]
    time_diff_array = np.sort(split_values.unique())
    fname_list = list()
    for start in range(0, len(time_diff_array), n_diff_split):
        chunk_diffs = time_diff_array[start:start + n_diff_split]
        parts = list()
        for diff in chunk_diffs:
            part = data_.loc[split_values == diff]
            print(f'time_diff {diff} selected: {part.shape[0]} frames')
            parts.append(part)
        # Train parts get a fresh index (feather needs a default index);
        # other modes keep the original index so rows can be re-ordered later.
        df = pd.concat(parts, axis=0, ignore_index=is_train)

        suffix = '_'.join(str(diff) for diff in chunk_diffs)
        fname = os.path.join(out_dir, f'{mode}_diff_{suffix}{extension}')
        if is_train:
            if os.path.isfile(fname):
                # Several train inputs may contribute to the same time_diff file.
                df = pd.concat([pd.read_feather(fname), df], axis=0, ignore_index=True)
            df.to_feather(fname)
        else:
            df.to_csv(fname)
        fname_list.append(suffix)
    return fname_list


In [5]:
def lgb_train(train_: pd.DataFrame, test_: pd.DataFrame, use_train_feats: list, id_col: str, label: str,
              n_splits: int, split_rs: int, is_shuffle=True, use_cart=False, cate_cols=None, ground_truth_test=False) -> pd.DataFrame:
    """
    Train one multiclass LightGBM model per fold and predict the test set with the fold average.

    Folds are built over the unique values of ``id_col`` so that all rows sharing
    an id land in the same fold. Out-of-fold predictions are written to
    ``train_['<label>_pred']`` and the averaged test probabilities are arg-maxed
    into ``test_['<label>_pred']``. Both input frames are modified in place.

    :param train_: DataFrame
        Training rows; must contain ``use_train_feats``, ``id_col`` and ``label``.
    :param test_: DataFrame
        Rows to predict; must contain ``use_train_feats``, ``id_col``,
        'current_slice_id' and 'future_slice_id' (and ``label`` when ``ground_truth_test``).
    :param use_train_feats: list
        Feature column names fed to the model.
    :param id_col: str
        Column whose unique values are split into folds.
    :param label: str
        Target column name. The printed scores weight the first three per-class
        F1 values with 0.2 / 0.2 / 0.6.
    :param n_splits: int
        Number of folds.
    :param split_rs: int
        Random state of the fold split; only used when ``is_shuffle`` is True.
    :param is_shuffle: bool, default True
        Whether ids are shuffled before splitting.
    :param use_cart: bool, default False
        When True, ``cate_cols`` is passed to LightGBM as ``categorical_feature``.
    :param cate_cols: list, default None
        Categorical feature names (treated as an empty list when falsy).
    :param ground_truth_test: bool, default False
        When True, ``test_[label]`` is treated as ground truth and a test report is
        printed; otherwise ``test_[label]`` is overwritten with the predicted class + 1.
    :return: DataFrame
        ``test_[[id_col, 'current_slice_id', 'future_slice_id', label]]``.
    """
    if not cate_cols:
        cate_cols = []
    print('data shape:\ntrain--{}\ntest--{}'.format(train_.shape, test_.shape))
    print('Use {} features ...'.format(len(use_train_feats)))
    print('Use lightgbm to train ...')
    n_class = train_[label].nunique()
    pred_col = f'{label}_pred'
    train_[pred_col] = 0
    test_pred = np.zeros((test_.shape[0], n_class))
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = use_train_feats

    # scikit-learn rejects a random_state when shuffle is off, so only pass it when it is used.
    folds = KFold(n_splits=n_splits, shuffle=is_shuffle, random_state=split_rs if is_shuffle else None)
    train_user_id = train_[id_col].unique()
    # The test feature matrix does not change between folds.
    test_x = test_[use_train_feats]
    dataset_kwargs = {'categorical_feature': cate_cols} if use_cart else {}

    params = {
        'learning_rate': 0.05,
        'boosting_type': 'gbdt',
        'objective': 'multiclass',
        'metric': 'None',  # only the custom feval drives early stopping
        'num_leaves': 31,
        'num_class': n_class,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'seed': 1,
        'bagging_seed': 1,
        'feature_fraction_seed': 7,
        'min_data_in_leaf': 20,
        'nthread': 4,
        'verbose': -1
    }

    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(train_user_id), start=1):
        print('the {} training start ...'.format(n_fold))
        # Row masks are computed once per fold and reused for features, labels and OOF write-back.
        train_mask = train_[id_col].isin(train_user_id[train_idx])
        valid_mask = train_[id_col].isin(train_user_id[valid_idx])
        train_x, train_y = train_.loc[train_mask, use_train_feats], train_.loc[train_mask, label]
        valid_x, valid_y = train_.loc[valid_mask, use_train_feats], train_.loc[valid_mask, label]
        print(f'for train user:{len(train_idx)}\nfor valid user:{len(valid_idx)}')

        dtrain = lgb.Dataset(train_x, label=train_y, **dataset_kwargs)
        dvalid = lgb.Dataset(valid_x, label=valid_y, **dataset_kwargs)

        # NOTE(review): early_stopping_rounds / verbose_eval are keyword arguments of
        # LightGBM < 4.0; newer releases expect the equivalent callbacks instead.
        clf = lgb.train(
            params=params,
            train_set=dtrain,
            num_boost_round=5000,
            valid_sets=[dvalid],
            valid_names=['valid'],
            early_stopping_rounds=100,
            verbose_eval=100,
            feval=f1_score_eval
        )
        fold_importance_df[f'fold_{n_fold}_imp'] = clf.feature_importance(importance_type='gain')
        train_.loc[valid_mask, pred_col] = np.argmax(
            clf.predict(valid_x, num_iteration=clf.best_iteration), axis=1)
        test_pred += clf.predict(test_x, num_iteration=clf.best_iteration) / folds.n_splits

    report = f1_score(train_[label], train_[pred_col], average=None)
    print(classification_report(train_[label], train_[pred_col], digits=4))
    print('Train score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    test_[pred_col] = np.argmax(test_pred, axis=1)

    if ground_truth_test:
        report = f1_score(test_[label], test_[pred_col], average=None)
        print(classification_report(test_[label], test_[pred_col], digits=4))
        print('Test score: ', report[0] * 0.2 + report[1] * 0.2 + report[2] * 0.6)
    else:
        # The exported label is the arg-max class index shifted by one.
        test_[label] = np.argmax(test_pred, axis=1) + 1
    fold_cols = [f'fold_{f}_imp' for f in range(1, n_splits + 1)]
    fold_importance_df['avg_imp'] = fold_importance_df[fold_cols].mean(axis=1)
    fold_importance_df.sort_values(by='avg_imp', ascending=False, inplace=True)
    # NOTE: this changes the pandas display option globally, as the original did.
    pd.set_option('display.max_rows', None)
    print(fold_importance_df[['Feature', 'avg_imp']].head(20))
    return test_[[id_col, 'current_slice_id', 'future_slice_id', label]]


In [6]:
# Entry point: the remaining notebook cells are written as the indented body of this guard.
if __name__ == "__main__":
    # Base directory (including the trailing separator) that every data path below is built from.
    path = './'


In [None]:
    # Generate brand new diff features, run with caution!!!
    # Every file below '<path>diff_feature' is deleted before the split files are rebuilt.
    del_files(path + 'diff_feature')
    all_files = show_files(path + 'feature')
    for file_path in all_files:
        data = pd.read_csv(file_path)
        # basename() handles whichever separator os.path.join produced (a plain
        # split('/') mis-classifies every file when paths contain back-slashes).
        file_mode = 'is_train' if os.path.basename(file_path).startswith('is_train') else 'is_test'
        # `rt` keeps the suffix list of the file processed last.
        rt = split_time_diff(data, 'time_diff', mode=file_mode)
        del data
        gc.collect()


In [7]:
    # Fallback for when the regeneration cell above was skipped:
    # the file-name suffixes '1' .. '30' used to locate the per-diff files.
    rt = [str(diff) for diff in range(1, 31)]
    

In [8]:
    # Per-link attribute table: tab separated, no header row; 'level' is discarded right away.
    attr = pd.read_csv(
        f'{path}attr.txt',
        sep='\t',
        header=None,
        names=['link', 'length', 'direction', 'path_class', 'speed_class', 'LaneNum', 'speed_limit',
               'level', 'width'],
    ).drop(columns=['level'])


In [12]:
    # The per-diff results are written below; make sure their directory exists.
    os.makedirs(f'{path}result', exist_ok=True)
    # DataFrame.join(on=...) matches the caller's column against the *index* of the
    # other frame, so the attributes must be indexed by link id. Joining against the
    # default positional index would attach the attributes of unrelated rows.
    attr_by_link = attr.set_index('link')
    for diff_suffix in rt:
        train = pd.read_feather(f"{path}diff_feature/is_train_diff_{diff_suffix}.ft")
        test = pd.read_csv(f"{path}diff_feature/is_test_diff_{diff_suffix}.csv", index_col=0)
        train = train.merge(attr, on='link', how='left')
        # join (unlike merge) keeps the index of `test`, which is what the final
        # sort_index() on the concatenated results relies on.
        test = test.join(attr_by_link, on='link', rsuffix='_')

        use_cols = [i for i in train.columns if i not in ['link', 'label',
                                                          'label_pred', 'curr_cnt', 'rec_6_cnt', 'rec_2_cnt',
                                                          'rec_8_cnt', 'rec_4_cnt', 'rec_4_eta', 'rec_2_eta', 'width',
                                                          'rec_6_eta', 'speed_class']]

        print(f'--------------Training lgb for {diff_suffix} time diff----------------------')
        # 5 folds over the link ids, split seed 2020.
        sub = lgb_train(train, test, use_cols, 'link', 'label', 5, 2020)

        sub.to_csv(f'{path}result/result_diff_{diff_suffix}.csv', index=True, encoding='utf8')


--------------Training lgb for 1 time diff----------------------
data shape:
train--(506086, 84)
test--(16965, 84)
Use 72 features ...
Use lightgbm to train ...
the 1 training start ...
for train user:11178
for valid user:2795
Training until validation scores don't improve for 100 rounds
[100]	valid's f1_score: 0.770506
[200]	valid's f1_score: 0.76985
Early stopping, best iteration is:
[100]	valid's f1_score: 0.770506
the 2 training start ...
for train user:11178
for valid user:2795
Training until validation scores don't improve for 100 rounds
[100]	valid's f1_score: 0.774255
Early stopping, best iteration is:
[80]	valid's f1_score: 0.774973
the 3 training start ...
for train user:11178
for valid user:2795
Training until validation scores don't improve for 100 rounds
[100]	valid's f1_score: 0.767985
Early stopping, best iteration is:
[47]	valid's f1_score: 0.768623
the 4 training start ...
for train user:11179
for valid user:2794
Training until validation scores don't improve for 100 r

In [13]:
    # The attribute table is not needed once every model has been trained.
    del attr
    gc.collect()
    # Stack the per-diff result files, keeping the index stored in their first column.
    result_paths = [f'{path}result/result_diff_{suffix}.csv' for suffix in rt]
    sub = pd.concat([pd.read_csv(csv_path, index_col=0) for csv_path in result_paths])


In [14]:
    # Order the stacked results by the index carried through the per-diff csv files.
    sub.sort_index(inplace=True)
    sub[['link', 'current_slice_id', 'future_slice_id', 'label']].to_csv(f'{path}result/result_diff.csv', index=False, encoding='utf8')
    # The preview has to be the last expression of the cell, otherwise it is never rendered.
    sub.head(20)
