In [1]:
import warnings

import numpy as np
import pandas as pd
import scipy.io
from scipy.io import arff
from sklearn.feature_selection import VarianceThreshold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PowerTransformer
# Accumulates one DataFrame per dataset; the loading cells below append to it.
dfs=[]

In [2]:
def to_df(mat):
    """Flatten a loaded .mat dictionary into one DataFrame.

    Every row of ``mat['X']`` is concatenated with the corresponding row of
    ``mat['Y']``. Column names are ``x0``, ``x1``, ... (one per entry in the
    first row of ``mat['X']``) followed by a final ``y``.
    """
    features, labels = mat['X'], mat['Y']
    # Rows are kept as plain Python lists so pandas infers the dtypes itself.
    rows = [[*feat_row, *label_row] for feat_row, label_row in zip(features, labels)]
    names = [f'x{idx}' for idx in range(len(features[0]))] + ['y']
    return pd.DataFrame(rows, columns=names)

In [3]:
# Load each MATLAB-format dataset and append its flattened DataFrame to `dfs`,
# in the order listed here.
for mat_path in (
    '../input/ml4-datasetsfiles/CLL-SUB-111.mat',
    '../input/ml4-datasetsfiles/ALLAML.mat',
    '../input/ml4-datasetsfiles/BASEHOCK.mat',
    '../input/ml4-datasetsfiles/COIL20.mat',
    '../input/ml4-datasetsfiles/Carcinom.mat',
):
    mat = scipy.io.loadmat(mat_path)
    dfs.append(to_df(mat))

In [4]:
# These two CSV files are appended exactly as read.
for plain_csv in (
    '../input/ml4-datasetsfiles/pone.0202167.s016.csv',
    '../input/ml4-datasetsfiles/pone.0202167.s017.csv',
):
    dfs.append(pd.read_csv(plain_csv))

# The remaining CSV files are read with 'Unnamed: 0' as the index, transposed,
# and have their dataset-specific label column renamed to "y".
for transposed_csv, label_name in (
    ('../input/ml4-datasetsfiles/bladderbatch.csv', "CancerClass"),
    ('../input/ml4-datasetsfiles/ayeastCC.csv', "Class"),
    ('../input/ml4-datasetsfiles/breastCancerVDX.csv', "oestrogenreceptorsClass"),
    ('../input/ml4-datasetsfiles/curatedOvarianData.csv', "GradeClass"),
    ('../input/ml4-datasetsfiles/leukemiasEset.csv', "LeukemiaTypeClass"),
):
    raw = pd.read_csv(transposed_csv, index_col='Unnamed: 0')
    dfs.append(raw.T.rename(columns={label_name: "y"}))

In [5]:
# Load each ARFF dataset; loadarff returns a tuple whose first element holds
# the records, which is what gets wrapped in a DataFrame and appended to `dfs`.
for arff_path in (
    '../input/ml4-datasetsfiles/Lung.arff',
    '../input/ml4-datasetsfiles/Lymphoma.arff',
    '../input/ml4-datasetsfiles/MLL.arff',
    '../input/ml4-datasetsfiles/SRBCT.arff',
    '../input/ml4-datasetsfiles/CNS.arff',
    '../input/ml4-datasetsfiles/pone.0202167.s011.arff',
    '../input/ml4-datasetsfiles/pone.0202167.s012.arff',
    '../input/ml4-datasetsfiles/pone.0202167.s015.arff',
):
    data = arff.loadarff(arff_path)
    dfs.append(pd.DataFrame(data[0]))

In [33]:
def variance_threshold_selector(data, threshold=0.001):
    """Drop the low-variance columns of ``data``.

    A ``VarianceThreshold`` selector is fitted on all columns of ``data``.
    Every column the selector rejects is dropped, except a column named
    ``'y'``, which is always kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is not modified.
    threshold : float, default 0.001
        Variance cut-off handed to ``VarianceThreshold``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the rejected columns.
    """
    var_thr = VarianceThreshold(threshold=threshold)
    var_thr.fit(data)
    # Build the index of retained labels once. Re-slicing data.columns inside
    # the comprehension would redo this work for every single column.
    kept = data.columns[var_thr.get_support()]
    concol = [column for column in data.columns
              if column != 'y' and column not in kept]
    return data.drop(concol, axis=1)

In [34]:
def category_transformer(df):
    """Replace every object-dtype column of ``df`` with integer codes, in place.

    Codes are assigned in order of first appearance within the column
    (first distinct value -> 0, next -> 1, ...). Missing entries (None/NaN)
    are not given a code; they are left as NaN so that a later imputation
    step can still see and fill them. A column containing missing entries
    therefore ends up with a float dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    None
    """
    for col in df.columns.to_list():
        if df[col].dtype == 'object':
            # factorize encodes the whole column in one pass instead of
            # searching the unique values once per cell.
            codes, _ = pd.factorize(df[col])
            if (codes == -1).any():
                # factorize flags missing entries with -1; turn them back
                # into NaN rather than leaving a fake "-1" category.
                codes = np.where(codes == -1, np.nan, codes)
            df[col] = codes

In [35]:
# Normalise the label column name to 'y' for every dataset.
# Candidate names are checked in this order; the first one present is renamed.
label_aliases = ('class', 'CLASS', 'type', ' class', 'Class')
dfs_class_col = []
for i, df in enumerate(dfs):
    cols = df.columns.to_list()
    if 'y' not in cols:
        alias = next((name for name in label_aliases if name in cols), None)
        if alias is not None:
            # rename returns a new frame; the entry in `dfs` is left as it was.
            df = df.rename(columns={alias: "y"})
        else:
            # Best effort: the frame is still passed on, but report which
            # dataset has no recognisable label column.
            print(f"dataset {i}: no label column found in {cols}")
    dfs_class_col.append(df)


In [36]:
# Encode the object-dtype columns of each frame; category_transformer works
# in place, so the frames inside `dfs_class_col` are modified directly.
for frame in dfs_class_col:
    category_transformer(frame)

In [37]:
def fill_missing(data):
    """Fill NaN entries with the most frequent value of their column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that may contain ``np.nan`` entries. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The imputed values, carrying the same column labels and index as
        ``data``.

    Raises
    ------
    ValueError
        If the imputer returns a different number of columns than ``data``
        has, so the original labels cannot be re-attached.
    """
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    # fit_transform fits the imputer itself; a separate fit() beforehand
    # only repeats the same pass over the data.
    filled = imputer.fit_transform(data)
    return pd.DataFrame(filled, columns=data.columns, index=data.index)

In [38]:
# Impute missing values in every dataset.
dfs_filled = list(map(fill_missing, dfs_class_col))

In [39]:
# Remove low-variance columns from every imputed dataset.
dfs_after_drops = list(map(variance_threshold_selector, dfs_filled))

In [40]:
def power_transform(data):
    """Apply a ``PowerTransformer`` to every column of ``data`` except ``'y'``.

    The transformed features keep the row index and column labels of
    ``data``, and the untouched ``'y'`` column is put back at its original
    position.

    This is a best-effort step: if anything goes wrong (no ``'y'`` column,
    a floating-point divide error, a transformer failure, ...) a warning is
    issued and ``data`` is returned unchanged.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the feature columns plus a ``'y'`` label column.

    Returns
    -------
    pandas.DataFrame
        The transformed frame, or ``data`` itself when the transform failed.
    """
    # Turn floating-point divide warnings into exceptions so that they
    # trigger the fallback below instead of passing silently.
    with np.errstate(divide='raise'):
        try:
            x = data.drop(columns=['y'])
            # fit_transform already fits; a separate fit() would only repeat
            # the same optimisation.
            transformed = PowerTransformer().fit_transform(x)
            # Carry the original labels over explicitly. A bare DataFrame
            # would get a fresh RangeIndex, and assigning the label Series
            # would then align on index and produce NaN for any frame
            # whose index is not 0..n-1.
            idf = pd.DataFrame(transformed, index=x.index, columns=x.columns)
            # Insert the labels by position (no index alignment) at the
            # slot 'y' occupied in `data`, so the column order is preserved
            # even when 'y' is not the last column.
            idf.insert(data.columns.get_loc('y'), 'y', data['y'].to_numpy())
            return idf
        except Exception as exc:
            # Catch Exception rather than everything, so KeyboardInterrupt
            # still propagates, and report the failure instead of hiding it.
            warnings.warn(f"power_transform: returning data unchanged ({exc!r})")
            return data

In [41]:
# Power-transform the feature columns of every filtered dataset.
dfs_normalized = list(map(power_transform, dfs_after_drops))

In [42]:
# Write each processed dataset to its own CSV file, numbered by its position
# in `dfs_normalized`.
for i, df in enumerate(dfs_normalized):
    out_name = f'dataset_{i}.csv'
    df.to_csv(out_name)