In [3]:
# default_exp data.processing.eda

# Enable IPython's autoreload extension in mode 2, so imported modules are
# re-imported before each cell runs and edits to library code are picked up
# without restarting the kernel.
%reload_ext autoreload
%autoreload 2

# describe

In [2]:
# export
def describe(df):
    """
    Build a per-column data-quality report for ``df``.

    The returned frame has one row per column of ``df`` and these columns:
        Data Type       dtype of the column
        Unique Values   number of distinct non-null values
        count Missing   number of null values
        % Missing       share of null values in percent (fraction rounded to 3 decimals)
        Mode            row 0 of ``df.mode()``; NaN when no non-null value exists
        Count Mode      number of rows equal to the mode
        % Mode          share of rows equal to the mode, in percent
        mean, std, min, 25%, 50%, 75%, max
                        distribution statistics taken from ``Series.describe()``;
                        NaN where ``describe()`` does not report that statistic
                        for the column's dtype

    Side effects: widens the pandas display options for the whole process and
    prints the number of records and columns. ``df`` itself is never modified.

    :param df: pandas DataFrame to profile
    :return: pandas DataFrame indexed by the column labels of ``df``
    """
    import pandas as pd
    # Display options are process-wide; they are kept so the wide report prints in full.
    pd.set_option('display.max_rows', 200)
    pd.set_option('display.max_columns', 100)  # show every column instead of eliding with "..."
    pd.set_option('expand_frame_repr', False)  # do not wrap wide frames onto several lines

    n_rows = len(df)
    missing = df.isnull().sum()

    # data types
    dqr_data_types = df.dtypes.to_frame('Data Type')
    # count missing
    dqr_count_missing = missing.to_frame('count Missing')
    # percent missing
    dqr_percent_missing = (100 * (missing / n_rows).round(3)).to_frame('% Missing')

    # unique values (non-null), computed for all columns at once
    dqr_unique_values = df.nunique().astype(int).to_frame('Unique Values')

    # mode: df.mode() has no rows when no column holds a non-null value,
    # in which case every mode is reported as NaN instead of raising KeyError
    modes = df.mode()
    if modes.empty:
        first_mode = pd.Series(float('nan'), index=df.columns)
    else:
        first_mode = modes.loc[0]
    dqr_mode = first_mode.to_frame('Mode')

    # count mode: rows equal to the mode (a NaN mode never matches, giving 0)
    count_mode = pd.Series(
        [int((df[c] == first_mode.loc[c]).sum()) for c in df],
        index=df.columns, dtype=int, name='Count Mode')
    dqr_count_mode = count_mode.to_frame()

    # % mode
    dqr_percent_mode = (100 * (count_mode / n_rows)).to_frame('% Mode')

    # distribution stats: align every column's describe() output on the numeric
    # statistic labels. This is built from scratch, so no scratch column is ever
    # written into the caller's frame.
    stat_rows = ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']
    dqr_stats = pd.DataFrame({c: df[c].describe() for c in df}, index=stat_rows)
    dqr_stats = dqr_stats.transpose().drop('count', axis=1)

    print("num of records: {}, num of columns: {}".format(len(df), len(df.columns)))

    report = dqr_data_types
    for part in (dqr_unique_values, dqr_count_missing, dqr_percent_missing,
                 dqr_mode, dqr_count_mode, dqr_percent_mode, dqr_stats):
        report = report.join(part)
    return report

# aggregate_features

In [2]:
#export
def aggregate_features(df_, feat, agg_func_dict, prefix=''):
    """
    Group ``df_`` by ``feat`` and compute the aggregations in ``agg_func_dict``,
    returning them with flat, single-level column names.

    When ``agg_func_dict`` maps a column to a list of functions, pandas returns
    two-level ``(column, function)`` labels; these are joined with ``_``
    (e.g. ``费用_sum``). When it maps a column to a single function the labels
    are already flat and are kept whole. ``prefix`` is prepended to every
    resulting name.

    :param df_: pandas DataFrame to aggregate; it is not modified
    :param feat: grouping key(s) - a column label or a list of column labels
    :param agg_func_dict: mapping passed unchanged to ``GroupBy.agg``
    :param prefix: string prepended to every output column name
    :return: aggregated DataFrame indexed by the grouping key(s)
    """
    from loguru import logger

    # A bare string must not be joined character by character, and labels that
    # are not strings must not break the log line.
    feat_label = feat if isinstance(feat, str) else ",".join(map(str, feat))
    logger.info(f'gen 特征 for {feat_label}...')

    # groupby/agg builds a new frame, so the input does not need to be copied.
    agg_df = df_.groupby(feat).agg(agg_func_dict)

    flat_names = []
    for col in agg_df.columns:
        # Tuples come from multi-function aggregation; anything else is a flat label.
        joined = '_'.join(map(str, col)) if isinstance(col, tuple) else str(col)
        flat_names.append(prefix + joined.strip())
    agg_df.columns = flat_names

    logger.info(f'gen 特征 for {feat_label}...end')

    return agg_df

In [4]:
import pandas as pd

In [5]:
# Toy billing table for the aggregation examples: six line items with a
# receipt number (收据号), item name (项目名称), fee (费用) and a claim id.
df = pd.DataFrame({'收据号': [1, 1, 2, 3, 3, 3],
                   '项目名称': ['挂号费', '药费', '挂号费', '挂号费', '治疗费', '床位费'],
                   '费用': [1, 2, 3, 4, 5, 6],
                   'claim': [4, 4, 4, 5, 5, 5]})

## 一阶聚合

In [7]:
# Aggregations to compute per group: six statistics of the fee column and
# two counts of the receipt-number column.
agg_func = {
    '费用': ['sum','mean','median','max','min','std'],
    '收据号': ['count','nunique'],
}
# Single-key aggregation: one output row per item name (项目名称).
aggregate_features(df, ['项目名称'], agg_func)

2020-06-23 15:52:35.781 | INFO     | __main__:aggregate_features:8 - gen 特征 for 项目名称...
2020-06-23 15:52:35.804 | INFO     | __main__:aggregate_features:14 - gen 特征 for 项目名称...end


Unnamed: 0_level_0,费用_sum,费用_mean,费用_median,费用_max,费用_min,费用_std,收据号_count,收据号_nunique
项目名称,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
床位费,6,6.0,6,6,6,,1,1
挂号费,8,2.666667,3,4,1,1.527525,3,3
治疗费,5,5.0,5,5,5,,1,1
药费,2,2.0,2,2,2,,1,1


## 多阶聚合

In [9]:
# Same aggregation spec as the single-key example above.
agg_func = {
    '费用': ['sum','mean','median','max','min','std'],
    '收据号': ['count','nunique'],
}
# Two-key aggregation: one output row per (item name, claim) pair, with
# 'claim_' prepended to every generated column name.
aggregate_features(df, ['项目名称', 'claim'], agg_func, 'claim_')

2020-06-23 15:53:43.494 | INFO     | __main__:aggregate_features:8 - gen 特征 for 项目名称,claim...
2020-06-23 15:53:43.515 | INFO     | __main__:aggregate_features:14 - gen 特征 for 项目名称,claim...end


Unnamed: 0_level_0,Unnamed: 1_level_0,claim_费用_sum,claim_费用_mean,claim_费用_median,claim_费用_max,claim_费用_min,claim_费用_std,claim_收据号_count,claim_收据号_nunique
项目名称,claim,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
床位费,5,6,6,6,6,6,,1,1
挂号费,4,4,2,2,3,1,1.414214,2,2
挂号费,5,4,4,4,4,4,,1,1
治疗费,5,5,5,5,5,5,,1,1
药费,4,2,2,2,2,2,,1,1


In [14]:
# Second grouping key; its name is also embedded in the widened column labels below.
col2 = 'claim'

In [15]:
# Aggregate per (项目名称, col2) pair, reusing agg_func from the earlier cell.
dfg2 = aggregate_features(df, ['项目名称', col2], agg_func)

2020-06-23 16:53:30.105 | INFO     | __main__:aggregate_features:8 - gen 特征 for 项目名称,claim...
2020-06-23 16:53:30.121 | INFO     | __main__:aggregate_features:14 - gen 特征 for 项目名称,claim...end


In [16]:
# Pivot the inner row-index level (the col2 key) into the columns, leaving one
# row per 项目名称.
# NOTE(review): this cell rebinds dfg2, so re-running it on its own output will
# not reproduce the result; re-run the cell that creates dfg2 first.
dfg2 = dfg2.unstack()

# Flatten each two-part column label into '<statistic>_<col2><key value>',
# e.g. '费用_sum_claim4'.
dfg2.columns = [f'{i[0]}_{col2}{i[1]}' for i in dfg2.columns]

# Replace every NaN with 0. This covers key combinations absent from the data
# and also any statistic that was NaN before the pivot (such as std of a
# single-row group).
dfg2 = dfg2.fillna(0)

In [17]:
# Preview the first two rows of the widened feature table.
dfg2.head(2)

Unnamed: 0_level_0,费用_sum_claim4,费用_sum_claim5,费用_mean_claim4,费用_mean_claim5,费用_median_claim4,费用_median_claim5,费用_max_claim4,费用_max_claim5,费用_min_claim4,费用_min_claim5,费用_std_claim4,费用_std_claim5,收据号_count_claim4,收据号_count_claim5,收据号_nunique_claim4,收据号_nunique_claim5
项目名称,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
床位费,0.0,6.0,0.0,6.0,0.0,6.0,0.0,6.0,0.0,6.0,0.0,0.0,0.0,1.0,0.0,1.0
挂号费,4.0,4.0,2.0,4.0,2.0,4.0,3.0,4.0,1.0,4.0,1.414214,0.0,2.0,1.0,2.0,1.0


# nb_export

In [4]:
# Run nbdev's notebook-to-module export for the project's notebooks.
# Only the one name that is used is imported, so the notebook namespace is not
# flooded by a star import.
from nbdev.export import notebook2script
notebook2script()

Converted 00_core.ipynb.
Converted 00_template.ipynb.
Converted active_learning.ipynb.
Converted algo_dl_keras.ipynb.
Converted algo_ml_eda.ipynb.
Converted algo_ml_tree_catboost.ipynb.
Converted algo_rs_associated_rules.ipynb.
Converted algo_rs_match_deepmatch.ipynb.
Converted algo_rs_matrix.ipynb.
Converted algo_rs_search_vector_faiss.ipynb.
Converted algo_seq_embeding.ipynb.
Converted algo_seq_tfidf.ipynb.
Converted datastructure_dict_list.ipynb.
Converted datastructure_time.ipynb.
Converted engineering_concurrency.ipynb.
Converted engineering_nbdev.ipynb.
Converted engineering_panel.ipynb.
Converted engineering_snorkel.ipynb.
Converted index.ipynb.
Converted utils_functools.ipynb.
Converted utils_json.ipynb.
Converted utils_pickle.ipynb.


In [7]:
# Shell out to the nbdev CLI to regenerate the documentation from the notebooks.
!nbdev_build_docs

No notebooks were modified
converting /Users/luoyonggui/PycharmProjects/nbdevlib/index.ipynb to README.md
