In [1]:
import numpy as np
import pandas as pd
import scipy.io.arff as arff
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from tqdm import tqdm
import glob
import os
import arff as arff1

In [2]:
# Root directory that is searched (recursively) for ARFF datasets.
dir_path = '../../datasets_drift/'
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the processing order reproducible between runs and machines.
arff_files = sorted(glob.glob(os.path.join(dir_path, '**/*.arff'), recursive=True))

In [3]:
def write_arff(file_path, data, attributes, relation):
    """Write ``data`` to ``file_path`` as a dense ARFF text file.

    Parameters
    ----------
    file_path : str
        Destination path. The file is opened in ``"w"`` mode, so an existing
        file is overwritten.
    data : iterable of iterables
        One inner iterable per row; every value is written with ``str()``.
    attributes : iterable of (name, type) pairs
        ``type`` is either a ready-made ARFF type string (e.g. ``'NUMERIC'``)
        or a list/tuple of nominal values, which is rendered as
        ``{v1,v2,...}``.
    relation : str
        Name written in the ``@relation`` header (wrapped in single quotes).

    Notes
    -----
    Attribute names and values are written verbatim; nothing is quoted or
    escaped.
    """
    # The context manager closes the file even if a write or str() call raises.
    with open(file_path, "w") as f:
        f.write('@relation \'{0}\' \n\n'.format(relation))

        for attr_name, attr_type in attributes:
            # A list/tuple denotes a nominal attribute: render its value set.
            if isinstance(attr_type, (list, tuple)):
                attr_type = '{{{0}}}'.format(','.join(str(v) for v in attr_type))
            f.write('@attribute {0} {1} \n'.format(attr_name, attr_type))

        f.write('\n')
        f.write('@data\n')
        f.write('\n')

        for line in data:
            # NOTE(review): every row ends with a trailing comma. It is kept so
            # the output format does not change - confirm downstream readers
            # expect it.
            f.write(','.join([str(x) for x in line])+',\n')

In [4]:
def _decode_if_bytes(series):
    """Return ``series`` with its values decoded from UTF-8 bytes to text.

    Only the first element is inspected; a series whose first value is not
    ``bytes`` (or an empty series) is returned unchanged.
    """
    if len(series) > 0 and isinstance(series.iloc[0], bytes):
        return series.str.decode('utf-8')
    return series


# For every ARFF file: min-max scale the numeric features, one-hot encode the
# nominal features, and write the result back with the class column last.
for file_path in tqdm(arff_files):
    # The relation name is the file name without its extension.
    topic = os.path.splitext(os.path.basename(file_path))[0]
    records, meta = arff.loadarff(file_path)
    dataset = pd.DataFrame(records)

    # Set the target aside so it is neither scaled nor one-hot encoded.
    class_column = _decode_if_bytes(dataset.pop('class'))

    numeric_columns = [column for column in dataset.columns if meta[column][0] == 'numeric']
    nominal_columns = [column for column in dataset.columns if meta[column][0] == 'nominal']

    # Rescale numeric features column by column. Skipped when there are none,
    # because the scaler rejects an input that has zero feature columns.
    if numeric_columns:
        dataset[numeric_columns] = MinMaxScaler().fit_transform(dataset[numeric_columns])

    # Replace every nominal feature by indicator columns named '<column>_<i>'.
    encoded_frames = []
    for column in nominal_columns:
        categories = _decode_if_bytes(dataset[column])
        indicators = OneHotEncoder().fit_transform(categories.to_frame()).toarray()
        encoded_frames.append(pd.DataFrame(
            indicators,
            columns=[column + '_' + str(i) for i in range(indicators.shape[1])],
            index=dataset.index,
        ))
    # A single concat keeps the original layout (remaining columns first, then
    # the indicator columns in nominal-column order) without inserting columns
    # one at a time.
    dataset = pd.concat([dataset.drop(columns=nominal_columns)] + encoded_frames, axis=1)

    # Every remaining feature is declared NUMERIC; the class is nominal over
    # the labels observed in this file, in order of first appearance.
    attributes = [(name, 'NUMERIC') for name in dataset.columns]
    attributes.append(('class', class_column.unique().astype(str).tolist()))

    dataset['class'] = class_column
    rows = dataset.values.tolist()

    # NOTE: the output path is the input path, so the source file is
    # overwritten with its preprocessed version.
    write_arff(file_path, rows, relation=topic, attributes=attributes)

100%|██████████| 14/14 [06:02<00:00, 25.87s/it]
