In [40]:
import pandas as pd
from tensorflow import keras
import os
import tensorflow as tf
import numpy as np

In [41]:
# Display the installed TensorFlow version string as the cell result.
tf.__version__

'2.1.0'

In [42]:
import keras

In [43]:
def data_processing(train_path='./dataset/census-income.data.gz',
                    test_path='./dataset/census-income.test'):
    """Load the census-income dataset and build train/validation/test splits.

    Both CSV files are read without a header row, the categorical feature
    columns are one-hot encoded, and two binary tasks are derived from the
    raw label columns:

    * ``income``  -- 1 when ``income_50k`` equals ``' 50000+.'``
    * ``marital`` -- 1 when ``marital_stat`` equals ``' Never married'``

    Each task label is converted to a two-column one-hot array with
    ``keras.utils.to_categorical``. The second file is split 1:1 into a
    validation set and a test set using a fixed random seed.

    @param train_path: path of the training CSV (default keeps the original
        hard-coded location)
    @param test_path: path of the CSV that is split into validation and test
        sets (default keeps the original hard-coded location)

    @return: ``(train_data, train_label, validation_data, validation_label,
        test_data, test_label)``. Every ``*_data`` item is a DataFrame of
        encoded features; every ``*_label`` item is the list
        ``[income, marital]`` of one-hot arrays.
    """
    # Column names of the raw files (the files themselves have no header).
    column_names = ['age', 'class_worker', 'det_ind_code', 'det_occ_code', 'education', 'wage_per_hour', 'hs_college',
                    'marital_stat', 'major_ind_code', 'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member',
                    'unemp_reason', 'full_or_part_emp', 'capital_gains', 'capital_losses', 'stock_dividends',
                    'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat', 'det_hh_summ',
                    'instance_weight', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                    'num_emp', 'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                    'own_or_self', 'vet_question', 'vet_benefits', 'weeks_worked', 'year', 'income_50k']
    # Columns that get one-hot encoded.
    categorical_columns = ['class_worker', 'det_ind_code', 'det_occ_code', 'education', 'hs_college', 'major_ind_code',
                           'major_occ_code', 'race', 'hisp_origin', 'sex', 'union_member', 'unemp_reason',
                           'full_or_part_emp', 'tax_filer_stat', 'region_prev_res', 'state_prev_res', 'det_hh_fam_stat',
                           'det_hh_summ', 'mig_chg_msa', 'mig_chg_reg', 'mig_move_reg', 'mig_same', 'mig_prev_sunbelt',
                           'fam_under_18', 'country_father', 'country_mother', 'country_self', 'citizenship',
                           'vet_question']
    # First group of tasks according to the paper.
    label_columns = ['income_50k', 'marital_stat']

    # Read the training data.
    train_df = pd.read_csv(
        train_path,
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names
    )
    # Read the data that will become the validation and test sets.
    other_df = pd.read_csv(
        test_path,
        delimiter=',',
        header=None,
        index_col=None,
        names=column_names
    )

    train_raw_labels = train_df[label_columns]
    other_raw_labels = other_df[label_columns]
    transformed_train = pd.get_dummies(train_df.drop(label_columns, axis=1), columns=categorical_columns)
    transformed_other = pd.get_dummies(other_df.drop(label_columns, axis=1), columns=categorical_columns)

    # Give the second frame exactly the training columns, in the training
    # order. A category that never occurs in the second file (e.g.
    # 'det_hh_fam_stat_ Grandchild <18 ever marr not in subfamily') is added
    # as an all-zero column at its proper position; appending it at the end
    # would shift every following feature once the frame becomes an array.
    # Columns that exist only in the second file are dropped by this step.
    transformed_other = transformed_other.reindex(columns=transformed_train.columns, fill_value=0)

    # Build the binary task labels and convert them to one-hot vectors.
    train_income = keras.utils.to_categorical((train_raw_labels.income_50k == ' 50000+.').astype(int), num_classes=2)
    train_marital = keras.utils.to_categorical((train_raw_labels.marital_stat == ' Never married').astype(int), num_classes=2)
    other_income = keras.utils.to_categorical((other_raw_labels.income_50k == ' 50000+.').astype(int), num_classes=2)
    other_marital = keras.utils.to_categorical((other_raw_labels.marital_stat == ' Never married').astype(int), num_classes=2)

    # Labels are returned in alphabetical task order: income, then marital.
    train_label = [train_income, train_marital]
    other_labels = [other_income, other_marital]

    # Split the second file 1:1 into validation and test rows. The frame has
    # the default 0..n-1 index (index_col=None), so index labels double as
    # row positions for both the frame and the label arrays.
    validation_indices = np.asarray(
        transformed_other.sample(frac=0.5, replace=False, random_state=1).index
    )
    # setdiff1d returns the remaining positions sorted, so the test rows have
    # a well-defined order instead of relying on set iteration order.
    test_indices = np.setdiff1d(np.arange(len(transformed_other)), validation_indices)

    validation_data = transformed_other.iloc[validation_indices]
    validation_label = [task[validation_indices] for task in other_labels]
    test_data = transformed_other.iloc[test_indices]
    test_label = [task[test_indices] for task in other_labels]
    train_data = transformed_train

    return train_data, train_label, validation_data, validation_label, test_data, test_label

In [44]:
def generate_data(data, label):
    """Append every label array to the feature matrix as extra columns.

    ``label`` is a sequence of arrays (one per task), so the columns of each
    array are appended to the right of ``data`` in sequence order.

    @param data: feature matrix (anything ``np.c_`` accepts, e.g. a DataFrame
        or a 2-D array)
    @param label: sequence of label arrays with the same number of rows as
        ``data``

    @return: array holding the features followed by all label columns; when
        ``label`` is empty, ``data`` is returned unchanged
    """
    if len(label) == 0:
        return data
    # Concatenate once instead of re-copying the growing matrix per label.
    return np.c_[(data, *label)]

In [45]:
# Load the dataset once; note the return order is train, validation, test.
train_data, train_label, val_data, val_label, test_data, test_label = data_processing()

In [46]:
# Merge each split's features with its label arrays into one matrix.
train_data_np, test_data_np, val_data_np = (
    generate_data(features, labels)
    for features, labels in (
        (train_data, train_label),
        (test_data, test_label),
        (val_data, val_label),
    )
)

In [47]:
# Directory that receives the generated CSV part files.
output_dir = './dataset/generate_csv'
# makedirs with exist_ok=True avoids the check-then-create race and also
# creates missing parent directories.
os.makedirs(output_dir, exist_ok=True)

In [48]:
def _format_cell(value):
    """Render one cell as CSV text without NumPy scalar wrappers."""
    # NumPy 2 changed scalar repr to e.g. 'np.float64(1.0)', which would be
    # written verbatim into the file; str() still yields the plain number.
    if isinstance(value, (np.number, np.bool_)):
        return str(value)
    # Other NumPy scalars (e.g. np.str_) are unwrapped to Python objects.
    if isinstance(value, np.generic):
        return repr(value.item())
    return repr(value)


def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Write ``data`` to ``n_parts`` CSV files of nearly equal row counts.

    Files are named ``<name_prefix>_<NN>.csv`` (two-digit, zero-based part
    number) inside ``output_dir``. Rows keep their original order and each
    cell is written as its textual representation, joined by commas.

    @param output_dir: directory the files are written to (must exist)
    @param data: row-indexable 2-D data, e.g. a NumPy array
    @param name_prefix: file name prefix
    @param header: optional header line written at the top of every file
    @param n_parts: number of files to split the rows into
    @return filenames: list of the generated file paths
    """
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []

    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            # One buffered writelines call per file instead of two writes
            # per row.
            f.writelines(
                ','.join(_format_cell(col) for col in data[row_index]) + '\n'
                for row_index in row_indices
            )
    return filenames

In [49]:
# Write the training matrix as ten header-less CSV parts.
save_to_csv(
    output_dir,
    train_data_np,
    'train_data',
    header=None,
    n_parts=10,
)

['./dataset/generate_csv\\train_data_00.csv',
 './dataset/generate_csv\\train_data_01.csv',
 './dataset/generate_csv\\train_data_02.csv',
 './dataset/generate_csv\\train_data_03.csv',
 './dataset/generate_csv\\train_data_04.csv',
 './dataset/generate_csv\\train_data_05.csv',
 './dataset/generate_csv\\train_data_06.csv',
 './dataset/generate_csv\\train_data_07.csv',
 './dataset/generate_csv\\train_data_08.csv',
 './dataset/generate_csv\\train_data_09.csv']

In [50]:
# Write the test matrix, then the validation matrix, as ten header-less CSV
# parts each; the validation call stays last so its file list is the result.
save_to_csv(
    output_dir,
    test_data_np,
    'test_data',
    header=None,
    n_parts=10,
)
save_to_csv(
    output_dir,
    val_data_np,
    'val_data',
    header=None,
    n_parts=10,
)

['./dataset/generate_csv\\val_data_00.csv',
 './dataset/generate_csv\\val_data_01.csv',
 './dataset/generate_csv\\val_data_02.csv',
 './dataset/generate_csv\\val_data_03.csv',
 './dataset/generate_csv\\val_data_04.csv',
 './dataset/generate_csv\\val_data_05.csv',
 './dataset/generate_csv\\val_data_06.csv',
 './dataset/generate_csv\\val_data_07.csv',
 './dataset/generate_csv\\val_data_08.csv',
 './dataset/generate_csv\\val_data_09.csv']