# Imports

In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt

In [2]:
from config import *

In [3]:
# Make sure every data/output directory named in the config exists before any
# cell tries to read from or write to it; existing directories are left as-is.
for data_dir in (EXTERNAL_DATA_DIR, BASE_DATA_DIR, INTERIM_DATA_DIR, OUTPUT_DIR):
    os.makedirs(data_dir, exist_ok=True)

# Concatenate data splits and drop non-numerical features

In [4]:
# Anuran_Calls: write the base CSV with `Family` as the class label.
dataset_name = 'Anuran_Calls'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    raw = pd.read_csv(os.path.join(dataset_path, 'Frogs_MFCCs.csv'))
    # `Family` is copied into a trailing `label` column; the taxonomy columns
    # and the record id are then removed.
    df = raw.assign(label=raw['Family']).drop(
        columns=['Family', 'Genus', 'Species', 'RecordID'])
    df.to_csv(save_path, index=False)
    print(df.shape)

In [5]:
# avila: stack the two headerless split files and name the columns.
dataset_name = 'avila'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # Both files are read the same way (no header row), then concatenated
    # row-wise in the order tr, ts.
    df_tr, df_ts = (
        pd.read_csv(os.path.join(dataset_path, file_name), header=None)
        for file_name in ('avila-tr.txt', 'avila-ts.txt')
    )
    df = pd.concat([df_tr, df_ts])
    cols = [
        'intercolumnar distance', 'upper margin', 'lower margin',
        'exploitation', 'row number', 'modular ratio',
        'interlinear spacing', 'weight', 'peak number', 'MR/IS',
        'label',
    ]
    df.columns = cols
    df.to_csv(save_path, index=False)
    print(df.shape)

In [6]:
# Cardiotocography: read the 'Data' sheet and use `NSP` as the class label.
dataset_name = 'Cardiotocography'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    xls_path = os.path.join(dataset_path, 'CTG.xls')

    # Column positions whose cell in the very first sheet row is non-empty;
    # only those columns are kept below.
    first_row = pd.read_excel(xls_path, sheet_name='Data', header=None)[:1]
    cols = first_row.dropna(axis=1).columns.tolist()

    # The second sheet row (header=1) supplies the column names.
    df = pd.read_excel(xls_path, sheet_name='Data', header=1)
    df = df.iloc[:, cols].dropna()
    # `NSP` becomes the trailing `label`; `CLASS` and `NSP` are removed.
    df = df.assign(label=df['NSP']).drop(columns=['CLASS', 'NSP'])
    df.to_csv(save_path, index=False)
    print(df.shape)

In [7]:
# credit card: use 'default payment next month' as the class label.
dataset_name = 'credit card'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    xls_path = os.path.join(dataset_path, 'default of credit card clients.xls')
    raw = pd.read_excel(xls_path, sheet_name='Data', header=1)

    # Columns removed from the feature set, plus the original target column
    # once it has been copied into `label`.
    dropped = [
        'ID', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE',
        'PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6',
        'default payment next month',
    ]
    df = raw.assign(label=raw['default payment next month']).drop(columns=dropped)
    df.to_csv(save_path, index=False)
    print(df.shape)

In [8]:
# EEG: read the data rows of the .arff file as CSV and name the columns.
dataset_name = 'EEG'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # The first 19 lines are skipped (presumably the ARFF header section --
    # verify against the file if it is ever replaced).
    df = pd.read_csv(os.path.join(dataset_path, 'EEG Eye State.arff'),
                     header=None, skiprows=19)
    # Every column but the last is a feature; the last one is the label.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [9]:
# HTRU2: headerless CSV; last column is treated as the label.
dataset_name = 'HTRU2'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df = pd.read_csv(os.path.join(dataset_path, 'HTRU_2.csv'), header=None)
    # Generic names feat0..featN-1 for the features, `label` for the last column.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [10]:
# image: stack both segmentation files and turn the row index into the label.
dataset_name = 'image'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # Both files use the third line (header=2) as the column header.
    df_data, df_test = (
        pd.read_csv(os.path.join(dataset_path, file_name), header=2)
        for file_name in ('segmentation.data', 'segmentation.test')
    )
    stacked = pd.concat([df_data, df_test])
    # The index produced by read_csv is copied into a trailing `label` column
    # (presumably it holds the class name -- confirm against the raw files),
    # then replaced by a plain 0..n-1 index.
    df = stacked.assign(label=stacked.index).reset_index(drop=True)
    df.to_csv(save_path, index=False)
    print(df.shape)

In [11]:
# landsat: stack the space-separated train/test files and name the columns.
dataset_name = 'landsat'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # Same parsing options for both files: no header row, single-space separator.
    df_tr, df_ts = (
        pd.read_csv(os.path.join(dataset_path, file_name), header=None, sep=' ')
        for file_name in ('sat.trn', 'sat.tst')
    )
    df = pd.concat([df_tr, df_ts])
    # Every column but the last is a feature; the last one is the label.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [12]:
# letter-recognition: the label is the FIRST raw column; move it to the end.
dataset_name = 'letter-recognition'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df = pd.read_csv(os.path.join(dataset_path, 'letter-recognition.data'),
                     header=None)
    cols = [
        'label',
        'x-box', 'y-box', 'width', 'high', 'onpix', 'x-bar', 'y-bar', 'x2bar',
        'y2bar', 'xybar', 'x2ybr', 'xy2br', 'x-ege', 'xegvy', 'y-ege', 'yegvx',
    ]
    df.columns = cols
    # Reorder so the features come first and `label` is the last column,
    # matching the layout of the other base CSVs.
    feature_cols = cols[1:]
    df = df.loc[:, feature_cols + ['label']]
    df.to_csv(save_path, index=False)
    print(df.shape)

In [13]:
# magic: headerless CSV; assign the feature names and the trailing label.
dataset_name = 'magic'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df = pd.read_csv(os.path.join(dataset_path, 'magic04.data'), header=None)
    df.columns = [
        'fLength', 'fWidth', 'fSize', 'fConc', 'fConc1',
        'fAsym', 'fM3Long', 'fM3Trans', 'fAlpha', 'fDist',
        'label',
    ]
    df.to_csv(save_path, index=False)
    print(df.shape)

In [14]:
# occupancy: stack the three files, drop the timestamp, rename the last column.
dataset_name = 'occupancy'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df1, df2, df3 = (
        pd.read_csv(os.path.join(dataset_path, file_name))
        for file_name in ('datatest.txt', 'datatest2.txt', 'datatraining.txt')
    )
    # `date` is removed from the stacked frame.
    df = pd.concat([df1, df2, df3]).drop(columns=['date'])
    # Keep the feature names as they are and rename only the last column.
    feature_cols = df.columns[:-1].tolist()
    df.columns = feature_cols + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [15]:
# page-blocks: fixed-width text file; assign feature names and the label.
dataset_name = 'page-blocks'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # read_fwf parses the file as fixed-width columns (no header row).
    df = pd.read_fwf(os.path.join(dataset_path, 'page-blocks.data'), header=None)
    df.columns = [
        'height', 'lenght', 'area', 'eccen', 'p_black',
        'p_and', 'mean_tr', 'blackpix', 'blackand', 'wb_trans',
        'label',
    ]
    df.to_csv(save_path, index=False)
    print(df.shape)

In [16]:
# pendigits: stack the .tes and .tra files (in that order) and name the columns.
dataset_name = 'pendigits'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df1, df2 = (
        pd.read_csv(os.path.join(dataset_path, file_name), header=None)
        for file_name in ('pendigits.tes', 'pendigits.tra')
    )
    df = pd.concat([df1, df2])
    # Every column but the last is a feature; the last one is the label.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [17]:
# shuttle: stack the space-separated train/test files; 9 features + label.
dataset_name = 'shuttle'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # Same parsing options for both files: no header row, single-space separator.
    df_trn, df_tst = (
        pd.read_csv(os.path.join(dataset_path, file_name), header=None, sep=' ')
        for file_name in ('shuttle.trn', 'shuttle.tst')
    )
    df = pd.concat([df_trn, df_tst])
    # The column count is fixed here (feat0..feat8 plus `label`) rather than
    # derived from the frame as in the other cells.
    cols = ['feat%d' % i for i in range(9)]
    cols = cols + ['label']
    df.columns = cols
    df.to_csv(save_path, index=False)
    print(df.shape)

In [18]:
# Wall-following: headerless CSV; last column is treated as the label.
dataset_name = 'Wall-following'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    df = pd.read_csv(os.path.join(dataset_path, 'sensor_readings_24.data'),
                     header=None)
    # Generic names feat0..featN-1 for the features, `label` for the last column.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [19]:
# wifi_localization: fixed-width text file; last column is treated as the label.
dataset_name = 'wifi_localization'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # read_fwf parses the file as fixed-width columns (no header row).
    df = pd.read_fwf(os.path.join(dataset_path, 'wifi_localization.txt'),
                     header=None)
    # Generic names feat0..featN-1 for the features, `label` for the last column.
    n_features = len(df.columns) - 1
    df.columns = ['feat%d' % i for i in range(n_features)] + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

In [20]:
# wine: stack the red and white files and rename the last column to `label`.
dataset_name = 'wine'
save_path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')

# Convert only when the base CSV has not been written yet.
if not os.path.exists(save_path):
    dataset_path = os.path.join(EXTERNAL_DATA_DIR, dataset_name)
    # Both files are ';'-separated and carry their own header row.
    df_red, df_wit = (
        pd.read_csv(os.path.join(dataset_path, file_name), sep=';')
        for file_name in ('winequality-red.csv', 'winequality-white.csv')
    )
    df = pd.concat([df_red, df_wit])
    # Keep the feature names as they are and rename only the last column.
    feature_cols = df.columns[:-1].tolist()
    df.columns = feature_cols + ['label']
    df.to_csv(save_path, index=False)
    print(df.shape)

# Normalization by RobustScaler

In [21]:
def norm(df):
    """Robust-scale the feature columns of ``df`` and keep the label as-is.

    The last column of ``df`` is treated as the label; every other column is
    a feature. Only the features are passed to ``RobustScaler``. The scaler
    works column by column, so the scaled features are the same as when the
    label column was fitted alongside them, but the label no longer has to be
    numeric and is not scaled needlessly.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose last column is the label.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index, the scaled features under their
        original column names, and the untouched label values in a trailing
        column that is always named ``'label'``.
    """
    features = df.iloc[:, :-1]
    label = df.iloc[:, -1].values

    if features.shape[1] == 0:
        # Nothing to scale (label-only frame): the scaler cannot be fitted on
        # zero features, so just rebuild the label column.
        scaled = features.values
    else:
        scaled = RobustScaler().fit_transform(features)

    out = pd.DataFrame(scaled, columns=features.columns)
    out['label'] = label
    return out

In [22]:
def plt_feature_range(df):
    """Show two horizontal box plots of ``df`` side by side.

    The left panel hides outlier points (``showfliers=False``); the right
    panel uses the default box plot. Both panels share the y axis. The figure
    is displayed with ``plt.show()`` and nothing is returned.
    """
    fig, ax = plt.subplots(1, 2, figsize=(12, 4), sharey=True)

    # (axis, panel title, extra keyword arguments for sns.boxplot)
    panels = (
        (ax[0], 'without outliers', {'showfliers': False}),
        (ax[1], 'with outliers', {}),
    )
    for axis, _, extra_kwargs in panels:
        sns.boxplot(data=df, linewidth=1, orient='h', ax=axis, **extra_kwargs)
    for axis, title, _ in panels:
        axis.set_title(title)

    plt.suptitle('range of feature values')
    plt.tight_layout()
    plt.show()

In [23]:
def make_normed_dataset(dataset_name, show_range=True):
    """Create ``<dataset_name>_norm.csv`` in ``INTERIM_DATA_DIR``.

    Reads ``<dataset_name>.csv`` from ``BASE_DATA_DIR``, replaces the
    ``label`` column with its integer category codes, scales the features
    with ``norm`` and writes the result. Does nothing when the output file
    already exists.

    Parameters
    ----------
    dataset_name : str
        Base name of the dataset (file name without the ``.csv`` suffix).
    show_range : bool, default True
        When true, plot the feature ranges before and after normalisation.
    """
    # The output path is derived from the argument. The original code read a
    # global called `dataset` here, so the function only worked when the
    # caller's loop variable happened to have that exact name.
    save_path = os.path.join(INTERIM_DATA_DIR, dataset_name + '_norm.csv')
    if os.path.exists(save_path):
        return

    path = os.path.join(BASE_DATA_DIR, dataset_name + '.csv')
    df = pd.read_csv(path)

    # Print the original class names, then store the labels as category codes.
    label_as_category = df['label'].astype('category')
    label_names = label_as_category.cat.categories
    print(label_names)
    df['label'] = label_as_category.cat.codes

    if show_range:
        plt_feature_range(df.iloc[:, :-1])
    df = norm(df)
    df.to_csv(save_path, index=False)
    if show_range:
        plt_feature_range(df.iloc[:, :-1])

In [24]:
# Base names of all datasets handled by the cells below; each one has a
# matching conversion cell earlier in the notebook.
datasets = [
    'Anuran_Calls',
    'avila',
    'Cardiotocography',
    'credit card',
    'EEG',
    'HTRU2',
    'image',
    'landsat',
    'letter-recognition',
    'magic',
    'occupancy',
    'page-blocks',
    'pendigits',
    'shuttle',
    'Wall-following',
    'wifi_localization',
    'wine',
]

In [25]:
# Normalise every dataset in turn, printing its name first.
# make_normed_dataset returns immediately when the `_norm.csv` file already
# exists, so re-running this cell only prints the names.
# NOTE: the loop variable is a notebook-level global that stays bound after
# the loop; keep it named `dataset`, since other code in the notebook may
# read that global.
for dataset in datasets:
    print(dataset)
    make_normed_dataset(dataset)

Anuran_Calls
avila
Cardiotocography
credit card
EEG
HTRU2
image
landsat
letter-recognition
magic
occupancy
page-blocks
pendigits
shuttle
Wall-following
wifi_localization
wine


# Binarization of class labels

In [26]:
# Threshold used below: classes whose cumulative share of instances (taken
# from the largest class downwards) exceeds 1 - min_minor_class_ratio are
# merged into class 1; all others become class 0.
min_minor_class_ratio = 0.2

for dataset in datasets:
    save_path = os.path.join(INTERIM_DATA_DIR, dataset + '_bin.csv')
    # Only binarise datasets whose `_bin.csv` has not been written yet.
    if not os.path.exists(save_path):
        print('========================================================================')
        print(dataset)
        df = pd.read_csv(os.path.join(INTERIM_DATA_DIR, dataset + '_norm.csv'))

        # Instances per class, largest class first.
        class_counts = df['label'].value_counts().sort_index().sort_values(ascending=False)
        display(pd.DataFrame(class_counts).T)

        # Cumulative share of instances, in the same order.
        cum_share = class_counts.cumsum()
        cum_share = cum_share / cum_share.max()

        # 1 for classes past the threshold, 0 otherwise; the largest class is
        # forced to 0 so that class 0 is never empty.
        minor_flag = (cum_share > (1 - min_minor_class_ratio)).astype(int)
        minor_flag.iloc[0] = 0

        # Replace every original class code by its 0/1 flag in one pass.
        df['label'] = df['label'].map(minor_flag)
        df['label'] = df['label'].astype(bool).astype(int)
        display(pd.DataFrame(df['label'].value_counts().sort_index()).T)
        df.to_csv(save_path, index=False)

# Make irrelevant features

In [27]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import cross_validate
from sklearn.feature_selection import SelectKBest

In [28]:
def cross_validation(df, classifier=RandomForestClassifier):
    """Cross-validate ``classifier`` on ``df`` and collect feature importances.

    The last column of ``df`` is the target and all other columns are the
    features. Five stratified shuffle splits with a 20% test share are used
    (``random_state=0``); the classifier is built with ``n_estimators=20``
    and ``random_state=42``.

    Returns
    -------
    importances : pandas.Series
        Mean ``feature_importances_`` over the five fitted estimators,
        indexed by feature column name.
    scores : pandas.DataFrame
        One row per split, one column per ``test_*`` entry returned by
        ``cross_validate``.
    """
    values = df.values
    X, y = values[:, :-1], values[:, -1]

    splitter = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0)
    model = classifier(n_estimators=20, random_state=42)
    results = cross_validate(
        model, X, y,
        cv=splitter,
        scoring=['accuracy', 'balanced_accuracy', 'f1_macro', 'f1_micro'],
        return_estimator=True,
    )

    # Keep only the per-split test metrics (keys containing 'test').
    scores = pd.DataFrame({key: val for key, val in results.items() if 'test' in key})

    # One row of importances per fitted estimator, averaged column-wise.
    per_fold = [fitted.feature_importances_ for fitted in results['estimator']]
    importances = pd.DataFrame(per_fold, columns=df.columns[:-1]).mean()
    return importances, scores

In [29]:
def make_irrelevant_features(df, irr_features, seed=0):
    """Return a copy of ``df`` in which the chosen columns are row-shuffled.

    All selected columns are permuted with the same row permutation, which
    breaks their relation to the remaining columns (including the label)
    while keeping each column's value distribution. ``df`` itself is not
    modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame.
    irr_features : sequence of int
        Positional indices of the columns to shuffle.
    seed : int, default 0
        Seed for the permutation. The original code always seeded with 0 and
        silently ignored this argument; the default keeps that behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the selected columns shuffled.
    """
    idx = np.arange(len(df))
    # Seeds NumPy's global generator, as the original code did, so that any
    # later code relying on that state behaves as before.
    np.random.seed(seed)
    np.random.shuffle(idx)

    df_ = df.copy()
    if len(irr_features) == 0:
        # Nothing to shuffle; skip the empty positional assignment.
        return df_
    df_.iloc[:, irr_features] = df.iloc[idx, irr_features].values
    return df_

In [30]:
def cv_with_shuffle(df, importances, ratio=0.3, classifier=RandomForestClassifier):
    """Shuffle the least important features of ``df`` and cross-validate again.

    The features are ranked by ``importances`` in ascending order and the
    first ``int(n_features * ratio)`` of them are row-shuffled with
    ``make_irrelevant_features``. The resulting frame is then evaluated with
    ``cross_validation``.

    Returns
    -------
    tuple
        ``(shuffled frame, importances, scores)``, the last two coming from
        the cross-validation on the shuffled frame.
    """
    # Positional feature indices ordered from least to most important.
    ranked_idx = importances.reset_index(drop=True).sort_values().index.values
    n_to_shuffle = int(len(ranked_idx) * ratio)

    shuffled_df = make_irrelevant_features(df, ranked_idx[:n_to_shuffle])
    new_importances, new_scores = cross_validation(shuffled_df, classifier)
    return shuffled_df, new_importances, new_scores

In [31]:
# For every binarised dataset: score it, shuffle its least important
# features, score it again, show both results and save the shuffled frame.
for dataset in datasets:
    save_path = os.path.join(INTERIM_DATA_DIR, dataset + '_shuffled.csv')
    # Datasets whose `_shuffled.csv` already exists are skipped.
    if not os.path.exists(save_path):
        print(dataset)
        path = os.path.join(INTERIM_DATA_DIR, dataset + '_bin.csv')
        df = pd.read_csv(path)

        importances, scores = cross_validation(df)
        df_shuffled, importances_shuffled, scores_shuffled = cv_with_shuffle(df, importances)

        # Row 0: original data, row 1: data with shuffled features.
        score_table = pd.DataFrame([scores.mean(), scores_shuffled.mean()])
        importance_table = pd.DataFrame([importances, importances_shuffled])
        display(score_table.round(3))
        display(importance_table.round(3))

        df_shuffled.to_csv(save_path, index=False)

# Summary of datasets

In [32]:
# Build one summary row per dataset from its `_shuffled.csv` file, write the
# table as LaTeX and display it.
summary_columns = ['Dataset', '# of instances', '# of features', 'Class ratio']

dfs = []
for dataset in datasets:
    df = pd.read_csv(os.path.join(INTERIM_DATA_DIR, dataset+'_shuffled.csv'))
    n_rows, n_cols = df.shape
    # Percentage of instances per label value; looked up by label 0 and 1.
    class_pct = (df['label'].value_counts() / len(df['label'])) * 100
    class_ratio = '%.1f%% : %.1f%%' % (class_pct[0], class_pct[1])
    # The label column is not counted as a feature.
    dfs.append([dataset, n_rows, n_cols - 1, class_ratio])

df = pd.DataFrame(dfs, columns=summary_columns)
df.to_latex(os.path.join(OUTPUT_DIR, 'dataset.txt'), index=False)
df

Unnamed: 0,Dataset,# of instances,# of features,Class ratio
0,Anuran_Calls,7195,22,61.4% : 38.6%
1,avila,20867,10,78.3% : 21.7%
2,Cardiotocography,2126,21,77.8% : 22.2%
3,credit card,30000,13,77.9% : 22.1%
4,EEG,14980,14,55.1% : 44.9%
5,HTRU2,17898,8,90.8% : 9.2%
6,image,2310,19,71.4% : 28.6%
7,landsat,6435,36,79.3% : 20.7%
8,letter-recognition,20000,16,77.8% : 22.2%
9,magic,19020,10,64.8% : 35.2%
