In [1]:
import pandas as pd
import os
import numpy as np
from copy import copy
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split




In [2]:
def loadData(data_dir, filename):
    """
    Loads csv files where column 0 represents labels.
    """
    df = pd.read_csv(os.path.join(data_dir, filename), header=None)
    df2 = np.array(df)
    labels = df2[:,0]
    df2 = df2[:,1:]
    return df2, labels

In [3]:
def makeBinary(data, labels, classLabel):
    """
    Makes the dataset binary by changing the given class label to 1. 
    """
    zippedData = list(zip(data, labels))
    zippedData = [[dataPoint, 1] if label == 8 else [dataPoint, 0] for dataPoint, label in zippedData]
    data, labels = list(zip(*zippedData))
    data, labels = np.array(data), np.array(labels)
    assert data.shape[0] == labels.shape[0]
    print(data.shape, labels.shape)
    return data, labels

In [4]:
def makeBinaryBalanced(data, labels, posClassLabel, negClassLabel):
    """
    Makes the dataset binary and balanced by changing the posClassLabel class to 0, and the negClassLabel class to 1.
    """
    zippedData = list(zip(data, labels))
    zippedData = [[dataPoint, 1] if label == posClassLabel else [dataPoint, 0] for dataPoint, label in zippedData 
                  if label in [posClassLabel, negClassLabel]]
    
    data, labels = list(zip(*zippedData))
    data, labels = np.array(data), np.array(labels)
    assert data.shape[0] == labels.shape[0]
    print(data.shape, labels.shape)
    return data, labels

In [5]:
def split_padded(matrix,labels, n):
    a = np.arange(matrix.shape[1])
    padding = (-len(a))%n
    index_arrays = np.split(np.concatenate((a,np.zeros(padding))).astype(int),n)
    index_arrays[-1] = np.trim_zeros(index_arrays[-1] , "b")
    matrices = [np.hstack((labels[:,np.newaxis], matrix[:, index_arrays[i]])) for i in range(len(index_arrays))]
    return matrices

In [6]:
def verticalPartition(data, labels):
    assert round(sum(probVector),3) == 1
    df = copy(data)
    numFeatures = data.shape[1]
    splitDfs = []
    for i in range(len(probVector)):
        numFeats = probVector[i]*numFeatures
        tempDf = df[:,0:min(int(numFeats), df.shape[1]-1)]
        tempDf = np.hstack((labels[:,np.newaxis], tempDf))
        splitDfs.append(tempDf)
        df = df[:, int(numFeats)+1:]
    return splitDfs

In [7]:
def verticalPartition(data, labels, probVector):
    """
    Vertically partitions the dataset by dividing the features according to probVector. 
    probVector must add to 1, as we don't consider overlapping partitions as of now.
    """
    assert round(sum(probVector),3) == 1
    df = copy(data)
    numFeatures = data.shape[1]
    splitDfs = []
    for i in range(len(probVector)):
        numFeats = probVector[i]*numFeatures
        tempDf = df[:,0:min(int(numFeats), df.shape[1]-1)]
        tempDf = np.hstack((labels[:,np.newaxis], tempDf))
        splitDfs.append(tempDf)
        df = df[:, int(numFeats)+1:]
    return splitDfs

In [8]:
def saveSplitFiles(data_dir, baseFilename, splitDfs):
    """
    Saves files into respective CSV files.
    """
    for i in range(len(splitDfs)):
        temp_df = splitDfs[i]
        temp_filename = baseFilename.split(".")[0] + "_" + str(i) + ".csv"        
        temp_df = pd.DataFrame(data=temp_df, index=None)
        temp_df.to_csv(os.path.join(data_dir, temp_filename), index=False, header=False)
        print("File saved in {}".format(os.path.join(data_dir, temp_filename)))


In [107]:
data[1:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [176]:
#MNIST BALANCED
# Process training set
data_dir = "../dl4j-examples/dl4j-examples/data/mnistbalanced"
trainFilename = "mnistbalanced_train.csv"
testFilename = "mnistbalanced_test.csv"
data, labels = loadData(data_dir, trainFilename)
data, labels = makeBinaryBalanced(data, labels, 3, 8)
data = data/255.
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(labels))

df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, trainFilename.split(".")[0]+ "_binary.csv"), index=False, header=False)




# Process test set
data, labels = loadData(data_dir, testFilename)
data, labels = makeBinaryBalanced(data, labels, 3, 8)
data = data/255.
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(labels))
np.min(data), np.max(data)


# Save all

df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, testFilename.split(".")[0] + "_binary.csv"), index=False, header=False)


(11982, 784) (11982,)
[(11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 80), (11982, 74)]
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_0.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_1.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_2.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_3.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_4.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_5.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_6.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_7.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnistbalanced\mnistbalanced_train_8.csv
File saved in ../dl4j-examples/dl4j-e

In [135]:


#MNIST
# Process training set

data_dir = "../dl4j-examples/dl4j-examples/data/mnist"
trainFilename = "mnist_train.csv"
testFilename = "mnist_test.csv"
numSplits = 10

data, labels = loadData(data_dir, trainFilename)
data, labels = makeBinary(data, labels, 8)
data = data/255.
#splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(labels))

df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, trainFilename.split(".")[0] + "_binary.csv"), index=False, header=False)




# Process test set
data, labels = loadData(data_dir, testFilename)
data, labels = makeBinary(data, labels, 8)
data = data/255.
#splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(labels))
np.min(data), np.max(data)


# Save all

df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, testFilename.split(".")[0] + "_binary.csv"), index=False, header=False)


(60000, 784) (60000,)
[(60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 80), (60000, 74)]
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_0.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_1.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_2.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_3.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_4.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_5.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_6.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_7.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_8.csv
File saved in ../dl4j-examples/dl4j-examples/data/mnist\mnist_train_9.csv
Counter({0: 54149, 1: 5851})
(60000, 785)
(10000, 784) (10000,)
[(10000, 80), (10000, 80), (10000, 80), (10

In [15]:
# Now we deal with CIFAR-10
data_dir = "../data/cifar-10/"
cifar10TrainFileName = "cifar10_train.csv"
cifar10TestFileName = "cifar10_test.csv"

In [16]:
def unpickle(file):
    import pickle
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

In [17]:
def loadCifar10(data_dir, dataset="train"):
    if dataset == "train":
        files = [os.path.join(data_dir, "data_batch_" + str(i)) for i in range(1,6)]
        data_dics = [unpickle(file) for file in files]
        data = np.array([d[b'data'] for d in data_dics]).reshape(-1,3072)
        labels = np.squeeze(np.array([d[b'labels'] for d in data_dics]).reshape(-1,1))
    else:
        file = os.path.join(data_dir, "test_batch")
        data_dic = unpickle(file)
        data = data_dic[b'data']
        labels = np.squeeze(np.array(data_dic[b'labels']).reshape(-1,1))
    return data, labels

In [18]:
cifarData, cifarLabels = loadCifar10(data_dir, "train")

In [26]:
#MNIST
# Process training set

data_dir = "../data/cifar-10/"
trainFilename = "cifar-10_train.csv"
testFilename = "cifar-10_test.csv"
numSplits = 10

cifarData, cifarLabels = loadCifar10(data_dir, "train")
cifarData, cifarLabels = makeBinary(cifarData, cifarLabels, 8)
cifarData = cifarData/255.
#splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
splitDfs = split_padded(cifarData, cifarLabels, numSplits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(cifarLabels))

df_to_save = pd.DataFrame(np.hstack((cifarLabels.reshape(-1,1), cifarData)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, trainFilename.split(".")[0] + "_binary.csv"), index=False, header=False)


cifarTestData, cifarTestLabels = loadCifar10(data_dir, "test")
cifarTestData, cifarTestLabels = makeBinary(cifarTestData, cifarTestLabels, 8)
cifarTestData = cifarTestData/255.
#splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
splitDfs = split_padded(cifarTestData, cifarTestLabels, numSplits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(cifarTestLabels))




(50000, 3072) (50000,)
[(50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 309), (50000, 301)]
File saved in ../data/cifar-10/cifar-10_train_0.csv
File saved in ../data/cifar-10/cifar-10_train_1.csv
File saved in ../data/cifar-10/cifar-10_train_2.csv
File saved in ../data/cifar-10/cifar-10_train_3.csv
File saved in ../data/cifar-10/cifar-10_train_4.csv
File saved in ../data/cifar-10/cifar-10_train_5.csv
File saved in ../data/cifar-10/cifar-10_train_6.csv
File saved in ../data/cifar-10/cifar-10_train_7.csv
File saved in ../data/cifar-10/cifar-10_train_8.csv
File saved in ../data/cifar-10/cifar-10_train_9.csv
Counter({0: 45000, 1: 5000})
(50000, 3073)
(10000, 3072) (10000,)
[(10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 301)]
File saved in ../data/cifar-10/cifar-10_test_0.csv
File saved in ../data/cifar-10/cifar-10_test_1.csv
File sa

(10000, 3072) (10000,)
[(10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 309), (10000, 301)]
File saved in ../data/cifar-10/cifar-10_test_0.csv
File saved in ../data/cifar-10/cifar-10_test_1.csv
File saved in ../data/cifar-10/cifar-10_test_2.csv
File saved in ../data/cifar-10/cifar-10_test_3.csv
File saved in ../data/cifar-10/cifar-10_test_4.csv
File saved in ../data/cifar-10/cifar-10_test_5.csv
File saved in ../data/cifar-10/cifar-10_test_6.csv
File saved in ../data/cifar-10/cifar-10_test_7.csv
File saved in ../data/cifar-10/cifar-10_test_8.csv
File saved in ../data/cifar-10/cifar-10_test_9.csv
Counter({0: 9000, 1: 1000})


NameError: name 'cifarTestlabels' is not defined

In [25]:
df_to_save = pd.DataFrame(np.hstack((cifarTestLabels.reshape(-1,1), cifarTestData)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, testFilename.split(".")[0] + "_binary.csv"), index=False, header=False)



(10000, 3073)


In [7]:

data_dir = "../dl4j-examples/dl4j-examples/data/synthetic"
trainFilename = "synthetic_train.csv"
testFilename = "synthetic_test.csv"
numSplits = 10
# Load and process synthetic data
# Process training set
data, labels = loadData(data_dir, trainFilename)
print(data.shape, labels.shape)
splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(labels))



# Process test set
data, labels = loadData(data_dir, testFilename)
splitDfs = verticalPartition(data, labels, [1/numSplits]*numSplits)
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(labels))

(1401, 200) (1401,)
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_0.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_1.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_2.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_3.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_4.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_5.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_6.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_7.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_8.csv
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_train_9.csv
Counter({0.0: 705, 1.0: 696})
File saved in ../dl4j-examples/dl4j-examples/data/synthetic\synthetic_test_0.csv
File saved in ../dl4j-examples/dl4j-examples/data

In [140]:
# Sonar data

data_dir = "../dl4j-examples/dl4j-examples/data/sonar"
trainFilename = "sonar_train.csv"
testFilename = "sonar_test.csv"

numSplits = 10
# Load and process synthetic data
# Process training set
data, labels = loadData(data_dir, trainFilename)
print(data.shape, labels.shape)
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(labels))
df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, trainFilename.split(".")[0] + "_binary.csv"), index=False, header=False)



# Process test set
data, labels = loadData(data_dir, testFilename)
splitDfs = split_padded(data, labels, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(labels))

df_to_save = pd.DataFrame(np.hstack((labels.reshape(-1,1), data)))
print(df_to_save.shape)
df_to_save.to_csv(os.path.join(data_dir, testFilename.split(".")[0] + "_binary.csv"), index=False, header=False)


(166, 60) (166,)
[(166, 7), (166, 7), (166, 7), (166, 7), (166, 7), (166, 7), (166, 7), (166, 7), (166, 7), (166, 7)]
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_0.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_1.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_2.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_3.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_4.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_5.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_6.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_7.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_8.csv
File saved in ../dl4j-examples/dl4j-examples/data/sonar\sonar_train_9.csv
Counter({1.0: 85, 0.0: 81})
(166, 61)
[(42, 7), (42, 7), (42, 7), (42, 7), (42, 7), (42, 7), (42, 7), (42, 7), (42, 7), (42, 7)]
File saved in

In [10]:
import datapackage

data_url = 'https://datahub.io/machine-learning/madelon/datapackage.json'

# to load Data Package into storage
package = datapackage.Package(data_url)

# to load only tabular data
resources = package.resources
for resource in resources:
    if resource.tabular:
        madelon_data = pd.read_csv(resource.descriptor['path'])



In [12]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

data_dir = "../data/madelon/"
X_madelon, y_madelon = madelon_data.iloc[:, :-1], madelon_data.iloc[:, -1]
X_madelon = np.array(X_madelon)
y_madelon = np.array(y_madelon) 
X_madelon = MinMaxScaler().fit_transform(X_madelon) # scale
y_madelon[y_madelon == 1] = 0
y_madelon[y_madelon == 2] = 1
X_madelon_train, X_madelon_test, y_madelon_train, y_madelon_test = train_test_split(X_madelon, y_madelon, test_size=0.2, random_state=42)

all_train = np.hstack((y_madelon_train.reshape(-1,1), X_madelon_train))
all_test = np.hstack((y_madelon_test.reshape(-1,1), X_madelon_test))
pd.DataFrame(all_train).to_csv(os.path.join(data_dir, "madelon_train_binary.csv"), index=None, header=None)
pd.DataFrame(all_test).to_csv(os.path.join(data_dir, "madelon_test_binary.csv"), index=None, header=None)
all_train[:5,:]



array([[0.        , 0.53658537, 0.48927039, ..., 0.46      , 0.69395018,
        0.35263158],
       [1.        , 0.48780488, 0.54077253, ..., 0.33      , 0.3772242 ,
        0.4       ],
       [0.        , 0.70731707, 0.43347639, ..., 0.39      , 0.70106762,
        0.31578947],
       [1.        , 0.48780488, 0.66094421, ..., 0.45      , 0.5088968 ,
        0.59473684],
       [1.        , 0.48780488, 0.57939914, ..., 0.45      , 0.52669039,
        0.43684211]])

In [14]:
numSplits = 2
data_dir = "../data/madelon"
trainFilename = "madelon_train.csv"
testFilename = "madelon_test.csv"
# Load and process synthetic data
# Process training set
splitDfs = split_padded(X_madelon_train, y_madelon_train, numSplits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(y_madelon_train))


# Process test set
splitDfs = split_padded(X_madelon_test, y_madelon_test, numSplits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(y_madelon_test))

[(2080, 251), (2080, 251)]
File saved in ../data/madelon\madelon_train_0.csv
File saved in ../data/madelon\madelon_train_1.csv
Counter({0: 1043, 1: 1037})
[(520, 251), (520, 251)]
File saved in ../data/madelon\madelon_test_0.csv
File saved in ../data/madelon\madelon_test_1.csv
Counter({1: 263, 0: 257})


In [143]:
data_dir = "../dl4j-examples/dl4j-examples/data/gisette/"
dataFilename = "gisette_scale.csv"
gisette_data = pd.read_csv(os.path.join(data_dir, dataFilename))
X_gisette, y_gisette = gisette_data.iloc[:, :-1], gisette_data.iloc[:, -1]
X_gisette = np.array(X_gisette)
y_gisette = np.array(y_gisette) 
y_gisette[y_gisette == -1] = 0
y_gisette[y_gisette == 1] = 1

X_gisette_train, X_gisette_test, y_gisette_train, y_gisette_test = train_test_split(X_gisette, y_gisette, test_size=0.2, random_state=42)
all_train = np.hstack((y_gisette_train.reshape(-1,1), X_gisette_train))
all_test = np.hstack((y_gisette_test.reshape(-1,1), X_gisette_test))
pd.DataFrame(all_train).to_csv(os.path.join(data_dir, "gisette_train_binary.csv"), index=None, header=None)
pd.DataFrame(all_test).to_csv(os.path.join(data_dir, "gisette_test_binary.csv"), index=None, header=None)

In [145]:
numSplits = 10
data_dir = "../dl4j-examples/dl4j-examples/data/gisette"
trainFilename = "gisette_train.csv"
testFilename = "gisette_test.csv"
# Load and process synthetic data
# Process training set
splitDfs = split_padded(X_gisette_train, y_gisette_train, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(y_gisette_train))


# Process test set
splitDfs = split_padded(X_gisette_test, y_gisette_test, 10)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(y_gisette_test))

[(4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501), (4800, 501)]
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_0.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_1.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_2.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_3.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_4.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_5.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_6.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_7.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_8.csv
File saved in ../dl4j-examples/dl4j-examples/data/gisette\gisette_train_9.csv
Counter({1: 2407, 0: 2393})
[(1200, 501), (1200, 501), (1200, 501), (1200, 501), (1200, 5

In [12]:
import pandas as pd
arcene_df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.data', delimiter=" ", header=None)
arcene_labels = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/arcene/ARCENE/arcene_train.labels', delimiter=" ", header=None)
arcene_df.shape, arcene_labels.shape

((100, 10001), (100, 1))

In [18]:
data_dir = "../data/arcene/"
num_splits = 10
trainFilename = "arcene_train.csv"
testFilename = "arcene_test.csv"
X_arcene, y_arcene = arcene_df.iloc[:, :-1], np.squeeze(arcene_labels)
X_arcene = np.array(X_arcene)
y_arcene = np.array(y_arcene) 

X_arcene = MinMaxScaler().fit_transform(X_arcene)
y_arcene[y_arcene == -1] = 0
y_arcene[y_arcene == 1] = 1

X_arcene_train, X_arcene_test, y_arcene_train, y_arcene_test = train_test_split(X_arcene, y_arcene, test_size=0.2, random_state=42)
all_train = np.hstack((y_arcene_train.reshape(-1,1), X_arcene_train))
all_test = np.hstack((y_arcene_test.reshape(-1,1), X_arcene_test))

# Process training set
splitDfs = split_padded(X_arcene_train, y_arcene_train, num_splits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, trainFilename, splitDfs)
print(Counter(y_arcene_train))


# Process test set
splitDfs = split_padded(X_arcene_test, y_arcene_test, num_splits)
print([a.shape for a in splitDfs])
saveSplitFiles(data_dir, testFilename, splitDfs)
print(Counter(y_arcene_test))

pd.DataFrame(all_train).to_csv(os.path.join(data_dir, "arcene_train_binary.csv"), index=None, header=None)
pd.DataFrame(all_test).to_csv(os.path.join(data_dir, "arcene_test_binary.csv"), index=None, header=None)



[(80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001), (80, 1001)]
File saved in ../data/arcene/arcene_train_0.csv
File saved in ../data/arcene/arcene_train_1.csv
File saved in ../data/arcene/arcene_train_2.csv
File saved in ../data/arcene/arcene_train_3.csv
File saved in ../data/arcene/arcene_train_4.csv
File saved in ../data/arcene/arcene_train_5.csv
File saved in ../data/arcene/arcene_train_6.csv
File saved in ../data/arcene/arcene_train_7.csv
File saved in ../data/arcene/arcene_train_8.csv
File saved in ../data/arcene/arcene_train_9.csv
Counter({0: 43, 1: 37})
[(20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001), (20, 1001)]
File saved in ../data/arcene/arcene_test_0.csv
File saved in ../data/arcene/arcene_test_1.csv
File saved in ../data/arcene/arcene_test_2.csv
File saved in ../data/arcene/arcene_test_3.csv
File saved in ../data/arcene/arcene_test_4.csv
File saved in ../da

In [17]:
import arff, numpy as np
dataset = arff.load(open('../data/dexter/dexter.arff'))
data = np.array(dataset['data'])

UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 858: character maps to <undefined>