In [None]:
# Mount Google Drive (Colab only) so the dataset files referenced under
# '/content/drive/My Drive/...' by the loaders below can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import load_iris, load_digits, load_wine


In [None]:
# X= np.loadtxt('/content/drive/My Drive/Thesis/Datasets/madelon/madelon_train_data.txt', dtype= np.float32)
# Y= np.loadtxt('/content/drive/My Drive/Thesis/Datasets/madelon/madelon_train_label.txt', dtype= np.float32)

# df = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Lung/lung-cancer.data', sep=",",header=None)
# # df = pd.read_excel ('/content/drive/My Drive/Thesis/Datasets/BreastTissue/BreastTissue.xls', sheet_name='Data')
# df = df.replace(['?'], np.nan)
# X_df = df.iloc[:,:-1]
# Y_df = df.iloc[:,-1]
# print(df.shape)
# print(df.isna().sum().sum())
# df.head()


In [None]:
# Lookup table: dataset name -> (train size, test size).
# dataset_to_dict() and dataset_array_to_dict() read element 0 into the
# 'trainSize' entry and element 1 into the 'testSize' entry of the dict they
# return, so every dataset name passed to them must have a key here.

dataset_train_test_size={
  'iris' : (100,50),
  'digits' : (500, 1297),
  'wine' : (100,78),
  'arrhythmia' : (250, 202),
  'australian' : (400, 290),
  'breastTissue' : (80, 26),
  'dermatology' : (200, 166),
  'german' : (400, 600),
  'glass' : (120, 94),
  'libras' : (200, 160),
  'lung' : (30, 2),
  'madelon' : (400, 1600),
  'multipleFeature' : (500, 1500),
  'parkinsons' : (110, 85),
  'pima' : (400, 368),
  'sonar' : (165, 43),
  'yeast' : (500, 984),
  'waveform' : (400, 4600),
  'heart' : (170, 100),
  'segmentation' : (210, 2100),
  'ionosphere' : (235, 116),
  'ecoli' : (200, 136),
  # Entries below belong to the gene datasets; all of them use a test size
  # of 1. NOTE(review): presumably a leave-one-out protocol -- confirm with
  # the code that consumes 'trainSize'/'testSize'.
  'cns' : (59, 1),
  'colon' : (61, 1),
  'leukemia' : (71, 1),
  'leukemia3c' : (71, 1),
  'leukemia4c' : (71, 1),
  'lung2' : (202, 1),
  'lymphoma' : (65, 1),
  'mll' : (71, 1),
  'ovarian' : (252, 1),
  'srbct' : (82, 1),
  'merged_GDS3341' : (40,1),
  'merged_GDS3610' : (27,1),
  'merged_GDS3837' : (119,1),
  'merged_GDS3858' : (33,1),
  'merged_GDS4167' : (51,1),
  'merged_GDS4168' : (51,1),
  'merged_GDS4431' : (145,1),
  'merged_GDS4824' : (20,1),
  'merged_GDS5306' : (37,1),
  'merged_GSE106291' : (234,1),
  }

In [None]:
def check_numeric(X):
    """Report whether ``X`` contains no string elements.

    ``X`` is converted to a NumPy array and flattened; each element is then
    tested with ``isinstance(element, str)``. The per-element result list is
    printed (debug aid) before the overall verdict is returned.

    Args:
        X: any array-like accepted by ``np.array``.

    Returns:
        bool: True when no flattened element is a ``str``, False otherwise.
    """
    flat_values = np.array(X).reshape(-1)
    non_string_flags = []
    for value in flat_values:
        non_string_flags.append(not isinstance(value, str))
    print(non_string_flags)
    return all(non_string_flags)

In [None]:
# X and Y are dataframe
def dataset_to_dict(name, X_df, Y_df, categoricalX= 'off', featureNames = np.array([])):
    """Package a DataFrame-based dataset into the common dataset dictionary.

    Missing values in ``X_df`` are replaced by the mean of their column.
    Only numeric columns are averaged, so a frame that also holds
    non-numeric columns no longer raises inside ``DataFrame.mean`` on recent
    pandas versions; such columns are passed through unchanged.

    Args:
        name: dataset name; must be a key of ``dataset_train_test_size``.
        X_df: pandas DataFrame with the feature columns.
        Y_df: pandas Series or DataFrame with the target values.
        categoricalX: flag stored as-is under 'categoricalX' (default 'off').
        featureNames: array of feature names stored as-is under
            'featureNames' (default: empty array).

    Returns:
        dict with keys 'name', 'attributes' (NumPy array of the imputed
        features), 'target' (``np.squeeze`` of the target array),
        'categoricalX', 'trainSize', 'testSize' and 'featureNames'.

    Raises:
        KeyError: if ``name`` is not present in ``dataset_train_test_size``.
    """
    # numeric_only=True keeps the result identical for all-numeric frames and
    # avoids the TypeError that mean() raises on string columns in pandas >= 2.
    column_means = X_df.mean(numeric_only=True)
    train_size, test_size = dataset_train_test_size[name]
    return {
        'name': name,
        'attributes' : X_df.fillna(column_means).to_numpy(),
        # squeeze turns a single-column (n, 1) target into a flat (n,) array
        'target' : np.squeeze(Y_df.to_numpy()),
        'categoricalX' : categoricalX,
        'trainSize': train_size,
        'testSize': test_size,
        'featureNames': featureNames
    }
    

In [None]:
# X and Y are numpy array
def dataset_array_to_dict(name, X, Y, categoricalX= 'off', featureNames = np.array([])):
    """Package an array-based dataset into the common dataset dictionary.

    Args:
        name: dataset name; must be a key of ``dataset_train_test_size``.
        X: feature array, stored unchanged under 'attributes'.
        Y: target array; stored after ``np.squeeze`` under 'target'.
        categoricalX: flag stored as-is under 'categoricalX' (default 'off').
        featureNames: array stored as-is under 'featureNames'.

    Returns:
        dict with keys 'name', 'attributes', 'target', 'categoricalX',
        'trainSize', 'testSize' and 'featureNames' (in that order).
    """
    packed = {'name': name, 'attributes': X}
    packed['target'] = np.squeeze(Y)
    packed['categoricalX'] = categoricalX
    sizes = dataset_train_test_size[name]
    packed['trainSize'] = sizes[0]
    packed['testSize'] = sizes[1]
    packed['featureNames'] = featureNames
    return packed
    

In [None]:
def load_dataset_iris():
    """Load scikit-learn's iris dataset and wrap it with ``dataset_array_to_dict``.

    Prints the dataset name and the feature-matrix dimensions before returning.
    """
    features, labels = load_iris(return_X_y=True)
    n_instances, n_features = features.shape[0], features.shape[1]
    print('dataset: iris')
    print('instances = {}, features= {} '.format(n_instances, n_features))
    return dataset_array_to_dict('iris', features, labels)

In [None]:

def load_dataset_digits():
    """Load scikit-learn's digits dataset and wrap it with ``dataset_array_to_dict``.

    Prints the dataset name and the feature-matrix dimensions before returning.
    """
    features, labels = load_digits(return_X_y=True)
    n_instances, n_features = features.shape[0], features.shape[1]
    print('dataset: digits')
    print('instances = {}, features= {} '.format(n_instances, n_features))
    return dataset_array_to_dict('digits', features, labels)

In [None]:
def load_dataset_wine():
    """Load scikit-learn's wine dataset and wrap it with ``dataset_array_to_dict``.

    Prints the dataset name and the feature-matrix dimensions before returning.
    """
    features, labels = load_wine(return_X_y=True)
    n_instances, n_features = features.shape[0], features.shape[1]
    print('dataset: wine')
    print('instances = {}, features= {} '.format(n_instances, n_features))
    return dataset_array_to_dict('wine', features, labels)

In [None]:
def load_dataset_arrhythmia():
    """Load the Arrhythmia CSV from Google Drive.

    '?' entries are turned into NaN and the whole table is cast to float64.
    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('arrhythmia', ...)``.
    """
    table = (
        pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Arrhythmia/arrhythmia.data', sep=",", header=None)
        .replace(['?'], np.nan)   # '?' marks a missing value in this file
        .astype('float64')
    )
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: arrhythmia')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('arrhythmia', features, labels)

# print(check_numeric(load_dataset_arrhythmia()['attributes']))

In [None]:
# mix attribute
def load_dataset_australian():
    """Load the whitespace-separated Australian data file from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('australian', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Australian/australian.dat', sep=r'\s+', header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: australian')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('australian', features, labels)

In [None]:
def load_dataset_breastTissue():
    """Load the 'Data' sheet of the BreastTissue workbook from Google Drive.

    The first column is dropped, the last column is the target and the
    columns in between are the features.

    Returns:
        dict: the value produced by ``dataset_to_dict('breastTissue', ...)``.
    """
    sheet = pd.read_excel('/content/drive/My Drive/Thesis/Datasets/BreastTissue/BreastTissue.xls', sheet_name='Data')
    features, labels = sheet.iloc[:, 1:-1], sheet.iloc[:, -1]
    print('dataset: breastTissue')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('breastTissue', features, labels)
# load_dataset_breastTissue()

In [None]:
def load_dataset_dermatology():
    """Load the Dermatology CSV from Google Drive.

    '?' entries are turned into NaN and the whole table is cast to float64.
    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('dermatology', ...)``.
    """
    table = (
        pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Dermatology/dermatology.data', sep=",", header=None)
        .replace(['?'], np.nan)   # '?' marks a missing value in this file
        .astype('float64')
    )
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: dermatology')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('dermatology', features, labels)

# print(check_numeric(load_dataset_dermatology()['attributes']))    

In [None]:
# this dataset contains mixed data(categorical and numeric) so it can not be used for now

def load_dataset_german():
    """Load the whitespace-separated German data file from Google Drive.

    '?' entries are turned into NaN; no dtype conversion is applied.
    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('german', ...)``.
    """
    raw = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/German/german.data', sep=r'\s+', header=None)
    table = raw.replace(['?'], np.nan)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: german')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('german', features, labels)

# print(check_numeric(load_dataset_german()['attributes']))

In [None]:
def load_dataset_glass():
    """Load the Glass CSV from Google Drive.

    The first column is dropped, the last column is the target and the
    columns in between are the features.

    Returns:
        dict: the value produced by ``dataset_to_dict('glass', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Glass/glass.data', sep=",", header=None)
    features, labels = table.iloc[:, 1:-1], table.iloc[:, -1]
    print('dataset: glass')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('glass', features, labels)

In [None]:
def load_dataset_libras():
    """Load the Libras movement CSV from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('libras', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Libras/movement_libras.data', sep=",", header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: libras')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('libras', features, labels)

In [None]:
def load_dataset_lung():
    """Load the lung-cancer CSV from Google Drive.

    '?' entries are turned into NaN and the whole table is cast to float64.
    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('lung', ...)``.
    """
    table = (
        pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Lung/lung-cancer.data', sep=",", header=None)
        .replace(['?'], np.nan)   # '?' marks a missing value in this file
        .astype('float64')
    )
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: lung')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('lung', features, labels)

In [None]:
def load_dataset_madelon():
    """Load the Madelon training data and labels from Google Drive.

    Features and labels live in two separate whitespace-separated files; all
    columns of the data file are used as features and the label frame is
    passed on as the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('madelon', ...)``.
    """
    madelon_dir = '/content/drive/My Drive/Thesis/Datasets/Madelon/MADELON/'
    data_frame = pd.read_csv(madelon_dir + 'madelon_train.data', sep=r'\s+', header=None)
    labels = pd.read_csv(madelon_dir + 'madelon_train_labels.txt', sep=r'\s+', header=None)
    features = data_frame.iloc[:, :]
    print('dataset: madelon')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('madelon', features, labels)

In [None]:
def load_dataset_multipleFeature():
    """Load the six Multiple Features files and join them into one dataset.

    Each 'mfeat-*' text file is read with ``np.loadtxt`` and the resulting
    arrays are joined column-wise, in the order listed below. The target is
    built in code: labels 0.0 .. 9.0, each repeated for 200 consecutive rows.

    Returns:
        dict: the value produced by ``dataset_to_dict('multipleFeature', ...)``.
    """
    base_dir = '/content/drive/My Drive/Thesis/Datasets/MultipleFeature/'
    view_names = ['mfeat-fac', 'mfeat-fou', 'mfeat-kar', 'mfeat-mor', 'mfeat-pix', 'mfeat-zer']
    views = [np.loadtxt(base_dir + view_name + '.txt') for view_name in view_names]
    X = np.concatenate(views, axis=1)

    # 10 classes x 200 rows per class, as a flat float64 vector
    Y = np.repeat(np.arange(10, dtype=np.float64), 200)
    print('dataset: multipleFeature')

    features = pd.DataFrame(X)
    labels = pd.DataFrame(Y)

    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('multipleFeature', features, labels)

# print(load_dataset_multipleFeature()['attributes'].shape)

In [None]:
def load_dataset_parkinsons():
    """Load the Parkinsons CSV from Google Drive.

    The file is read without a header, then its first row and first column
    are discarded. In what remains, the column at position 16 is the target
    and all other columns, cast to float64, are the features.

    Returns:
        dict: the value produced by ``dataset_to_dict('parkinsons', ...)``.
    """
    label_position = 16  # position of the label column after dropping column 0
    raw = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Parkinsons/parkinsons.data', sep=",", header=None)
    body = raw.iloc[1:, 1:]
    feature_positions = [pos for pos in range(body.shape[1]) if pos != label_position]
    labels = body.iloc[:, label_position]
    features = body.iloc[:, feature_positions].astype('float64')
    print('dataset: parkinsons')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('parkinsons', features, labels)

# print(check_numeric(load_dataset_parkinsons()['attributes']))

In [None]:
def load_dataset_pima():
    """Load the Pima diabetes CSV (with header row) from Google Drive.

    The first rows of the raw table are printed. Every column except the last
    is a feature (cast to float64); the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('pima', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Pima/diabetes.csv', sep=",", header=0)
    print(table.head())
    labels = table.iloc[:, -1]
    features = table.iloc[:, :-1].astype('float64')

    print('dataset: pima')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('pima', features, labels)

# load_dataset_pima() 

In [None]:
def load_dataset_sonar():
    """Load the Sonar CSV from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('sonar', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Sonar/sonar.all-data', sep=",", header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: sonar')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('sonar', features, labels)

In [None]:
def load_dataset_yeast():
    """Load the Yeast data file from Google Drive.

    Columns are separated by runs of whitespace (regex ``\\s+``). The first
    column is dropped, the last column is the target and the columns in
    between are the features.

    Returns:
        dict: the value produced by ``dataset_to_dict('yeast', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Yeast/yeast.data', sep=r'\s+', header=None)
    features, labels = table.iloc[:, 1:-1], table.iloc[:, -1]
    print('dataset: yeast')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('yeast', features, labels)

In [None]:
def load_dataset_waveform():
    """Load the Waveform CSV from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('waveform', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Waveform/waveform.data', sep=",", header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: waveform')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('waveform', features, labels)

# Quick check: run the waveform loader so the cell displays the returned dictionary.
load_dataset_waveform()

dataset: waveform
instances = 5000, features= 21 


{'attributes': array([[-1.23, -1.56, -1.75, ...,  1.2 ,  0.24, -0.56],
        [-0.69,  2.43,  0.61, ...,  1.78,  0.6 ,  2.42],
        [-0.12, -0.94,  1.29, ..., -0.01, -0.79, -0.12],
        ...,
        [ 0.01, -1.99,  0.16, ...,  1.  ,  1.21, -0.27],
        [-0.4 ,  0.41, -0.48, ...,  2.08,  1.31,  1.37],
        [ 0.63, -0.07,  2.71, ...,  0.09,  0.01,  0.6 ]]),
 'categoricalX': 'off',
 'featureNames': array([], dtype=float64),
 'name': 'waveform',
 'target': array([2, 1, 0, ..., 1, 0, 1]),
 'testSize': 4600,
 'trainSize': 400}

In [None]:
def load_dataset_heart():
    """Load the single-space-separated Heart data file from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('heart', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Heart/heart.dat', sep=" ", header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: heart')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('heart', features, labels)

In [None]:
def load_dataset_segmentation():
    """Load the whitespace-separated Segmentation data file from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('segmentation', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Segmentation/segment.dat', sep=r'\s+', header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: segmentation')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('segmentation', features, labels)

In [None]:
def load_dataset_ionosphere():
    """Load the Ionosphere CSV from Google Drive.

    Every column except the last is a feature; the last column is the target.

    Returns:
        dict: the value produced by ``dataset_to_dict('ionosphere', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Ionosphere/ionosphere.data', sep=",", header=None)
    features, labels = table.iloc[:, :-1], table.iloc[:, -1]
    print('dataset: ionosphere')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('ionosphere', features, labels)

In [None]:
def load_dataset_ecoli():
    """Load the Ecoli data file from Google Drive.

    Columns are separated by runs of whitespace (regex ``\\s+``). The first
    column is dropped, the last column is the target and the columns in
    between are the features.

    Returns:
        dict: the value produced by ``dataset_to_dict('ecoli', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/Datasets/Ecoli/ecoli.data', sep=r'\s+', header=None)
    features, labels = table.iloc[:, 1:-1], table.iloc[:, -1]
    print('dataset: ecoli')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('ecoli', features, labels)

# Gene/microarray Datasets

In [None]:
def load_dataset_cns():
    """Load the tab-separated CNS gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('cns', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/CNS.txt', sep="\t", header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: cns')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('cns', features, labels)
# dc = load_dataset_cns()
# print(dc)

In [None]:
def load_dataset_colon():
    """Load the whitespace-separated colon gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('colon', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/colon.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: colon')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('colon', features, labels)
# dc = load_dataset_colon()
# print(dc)

In [None]:
def load_dataset_leukemia():
    """Load the whitespace-separated leukemia gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('leukemia', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/leukemia.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: leukemia')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('leukemia', features, labels)
# dc = load_dataset_leukemia()
# print(dc)

In [None]:
def load_dataset_leukemia3c():
    """Load the whitespace-separated LEUKEMIA3C gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('leukemia3c', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/LEUKEMIA3C.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: leukemia3c')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('leukemia3c', features, labels)
# dc = load_dataset_leukemia3c()
# print(dc)

In [None]:
def load_dataset_leukemia4c():
    """Load the whitespace-separated LEUKEMIA4C gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('leukemia4c', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/LEUKEMIA4C.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: leukemia4c')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('leukemia4c', features, labels)
# dc = load_dataset_leukemia4c()
# print(dc)

In [None]:
def load_dataset_lung2():
    """Load the whitespace-separated Lung gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('lung2', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/Lung.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: lung2')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('lung2', features, labels)
# dc = load_dataset_lung2()
# print(dc)

In [None]:
def load_dataset_lymphoma():
    """Load the whitespace-separated lymphoma gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('lymphoma', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/lymphoma.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: lymphoma')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('lymphoma', features, labels)
# dc = load_dataset_lymphoma()
# print(dc)

In [None]:
def load_dataset_mll():
    """Load the whitespace-separated MLL gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('mll', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/MLL.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: mll')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('mll', features, labels)
# dc = load_dataset_mll()
# print(dc)

In [None]:
def load_dataset_ovarian():
    """Load the whitespace-separated OVARIAN gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('ovarian', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/OVARIAN.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: ovarian')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('ovarian', features, labels)
# dc = load_dataset_ovarian()
# print(dc)

In [None]:
def load_dataset_srbct():
    """Load the whitespace-separated SRBCT gene data file from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features.

    Returns:
        dict: the value produced by ``dataset_to_dict('srbct', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data/SRBCT.txt', sep=r'\s+', header=None)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: srbct')
    print('instances = {}, features= {} '.format(*features.shape))
    return dataset_to_dict('srbct', features, labels)
# dc = load_dataset_srbct()
# print(dc)

# Large Gene Datasets

In [None]:
def load_dataset_merged_GDS3341():
    """Load the merged_GDS3341 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS3341', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS3341.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS3341')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS3341', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS3341()
# print(np.unique(dc['target']))
# print(dc)

In [None]:
def load_dataset_merged_GDS3610():
    """Load the merged_GDS3610 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS3610', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS3610.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS3610')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS3610', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS3610()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS3837():
    """Load the merged_GDS3837 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS3837', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS3837.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS3837')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS3837', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS3837()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS3858():
    """Load the merged_GDS3858 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS3858', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS3858.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS3858')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS3858', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS3858()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS4167():
    """Load the merged_GDS4167 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS4167', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS4167.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS4167')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS4167', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS4167()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS4168():
    """Load the merged_GDS4168 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS4168', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS4168.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS4168')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS4168', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS4168()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS4431():
    """Load the merged_GDS4431 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS4431', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS4431.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS4431')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS4431', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS4431()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS4824():
    """Load the merged_GDS4824 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS4824', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS4824.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS4824')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS4824', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS4824()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GDS5306():
    """Load the merged_GDS5306 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GDS5306', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GDS5306.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GDS5306')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GDS5306', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GDS5306()
# print(dc)
# print(np.unique(dc['target']))

In [None]:
def load_dataset_merged_GSE106291():
    """Load the merged_GSE106291 CSV (with header row) from Google Drive.

    Column 0 is the target; the remaining columns, cast to float64, are the
    features, and their header labels are forwarded as ``featureNames``.

    Returns:
        dict: the value produced by ``dataset_to_dict('merged_GSE106291', ...)``.
    """
    table = pd.read_csv('/content/drive/My Drive/Thesis/gene_data_large/merged_GSE106291.csv', sep=",", header=0)
    labels = table.iloc[:, 0]
    features = table.iloc[:, 1:].astype('float64')
    print('dataset: merged_GSE106291')
    print('instances = {}, features= {} '.format(*features.shape))
    column_labels = table.columns.to_numpy()[1:]  # skip the target column's header
    return dataset_to_dict('merged_GSE106291', features, labels, featureNames=column_labels)
# dc = load_dataset_merged_GSE106291()
# print(dc)
# print(np.unique(dc['target']))

## Dataset loading
`load_dataset(dataset)` takes one parameter, `dataset`: the name of the dataset to load (the valid names are listed by `get_dataset_names()`).
<br>
It returns a dictionary of the form: <br>

```
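{
        'name': dataset name,
        'attributes' : X,
        'target' : Y,
        'categoricalX' : 'on'/'off',
        'trainSize' : number of training instances,
        'testSize' : number of test instances,
        'featureNames' : array of feature names (empty if none were given)
}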
```



In [None]:
# Dispatch table: dataset name -> zero-argument loader function.
# load_dataset() looks a name up here and calls the loader;
# get_dataset_names() returns the keys of this table.
dataset_dict={
  'iris' : load_dataset_iris,
  'digits' : load_dataset_digits,
  'wine' : load_dataset_wine,
  'arrhythmia' : load_dataset_arrhythmia,
  # disabled -- its loader is marked "mix attribute"
  # 'australian' : load_dataset_australian,
  'breastTissue' : load_dataset_breastTissue,
  'dermatology' : load_dataset_dermatology,
  # disabled -- its loader notes the data is mixed categorical/numeric
  # 'german' : load_dataset_german,
  'glass' : load_dataset_glass,
  'libras' : load_dataset_libras,
  'lung' : load_dataset_lung,
  'madelon' : load_dataset_madelon,
  'multipleFeature' : load_dataset_multipleFeature,
  'parkinsons' : load_dataset_parkinsons,
  'pima' : load_dataset_pima,
  'sonar' : load_dataset_sonar,
  'yeast' : load_dataset_yeast,
  'waveform' : load_dataset_waveform,
  'heart' : load_dataset_heart,
  'segmentation' : load_dataset_segmentation,
  'ionosphere' : load_dataset_ionosphere,
  'ecoli' : load_dataset_ecoli,
  }

def get_dataset_names():
    """Return the keys of ``dataset_dict`` as a NumPy array, in insertion order."""
    return np.array(list(dataset_dict))
   
def load_dataset(dataset):
    """Load the dataset registered under ``dataset`` in ``dataset_dict``.

    Args:
        dataset: name of the dataset; must be a key of ``dataset_dict``.

    Returns:
        Whatever the registered loader function returns.

    Raises:
        KeyError: if ``dataset`` is not a key of ``dataset_dict``.
    """
    loader = dataset_dict[dataset]
    return loader()

In [None]:
# Dispatch table for the gene datasets: dataset name -> zero-argument loader.
# load_gene_dataset() looks a name up here and calls the loader;
# get_gene_dataset_names() returns the keys of this table.
gene_dataset_dict={
  'cns' : load_dataset_cns,
  'colon' : load_dataset_colon,
  'leukemia' : load_dataset_leukemia,
  'leukemia3c' : load_dataset_leukemia3c,
  'leukemia4c' : load_dataset_leukemia4c,
  'lung2' : load_dataset_lung2,
  'lymphoma' : load_dataset_lymphoma,
  'mll' : load_dataset_mll,
  'ovarian' : load_dataset_ovarian,
  'srbct' : load_dataset_srbct,
  'merged_GDS3341' : load_dataset_merged_GDS3341,
  'merged_GDS3610' : load_dataset_merged_GDS3610,
  'merged_GDS3837' : load_dataset_merged_GDS3837,
  'merged_GDS3858' : load_dataset_merged_GDS3858,
  'merged_GDS4167' : load_dataset_merged_GDS4167,
  'merged_GDS4168' : load_dataset_merged_GDS4168,
  'merged_GDS4431' : load_dataset_merged_GDS4431,
  'merged_GDS4824' : load_dataset_merged_GDS4824,
  'merged_GDS5306' : load_dataset_merged_GDS5306,
  'merged_GSE106291' : load_dataset_merged_GSE106291,
  }

def get_gene_dataset_names():
    """Return the keys of ``gene_dataset_dict`` as a NumPy array, in insertion order."""
    return np.array(list(gene_dataset_dict))
   
def load_gene_dataset(dataset):
    """Load the gene dataset registered under ``dataset`` in ``gene_dataset_dict``.

    Args:
        dataset: name of the dataset; must be a key of ``gene_dataset_dict``.

    Returns:
        Whatever the registered loader function returns.

    Raises:
        KeyError: if ``dataset`` is not a key of ``gene_dataset_dict``.
    """
    loader = gene_dataset_dict[dataset]
    return loader()

In [None]:
# datasets = get_dataset_names()
# print(datasets.shape)

# for dataset in datasets:
#   print(load_dataset(dataset)['attributes'].shape)

In [None]:
# load_dataset('iris')['target'].shape

In [None]:
# print(get_gene_dataset_names())
# print(load_gene_dataset('colon'))