In [2]:
import pandas as pd
import numpy as np

## Data Preparation:
- Use only rows without NaN values. Rows containing NaN are dropped: the dataset is not very big to begin with, and dropping them does not remove very many rows.
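- Convert binary variables to a numeric representation, and one-hot encode categorical variables. We do not want to use a label encoder, since a label encoder would make it look as though the categories have a numeric order (for example, that one category is "greater" than another), which they do not.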

In [77]:
# Column names for the header-less CSV: attributes A1..A15 followed by the target.
cols = [f"A{idx}" for idx in range(1, 16)] + ['label']

In [89]:
# Read the header-less file, naming the columns from `cols`.
raw = pd.read_csv('crx.data', names=cols)

# Turn '?' entries into NaN, then keep only the rows that have no NaN at all.
df = raw.replace(to_replace='?', value=np.nan).dropna()

print(df.shape, "\n ------- \n")
print(df.head(2))

(653, 16) 
 ------- 

  A1     A2    A3 A4 A5 A6 A7    A8 A9 A10  A11 A12 A13    A14  A15 label
0  b  30.83  0.00  u  g  w  v  1.25  t   t    1   f   g  00202    0     +
1  a  58.67  4.46  u  g  q  h  3.04  t   t    6   f   g  00043  560     +


In [80]:
def to_binary(df, col):
    """Encode the distinct values of ``df[col]`` as consecutive integers.

    Codes are handed out in the order the values are returned by
    ``Series.unique()`` (order of first appearance): the first distinct value
    becomes 0, the next 1, and so on. Nothing here restricts the column to two
    distinct values; a column with more simply receives more codes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode. It is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.Series
        The column with every value replaced by its integer code.
    """
    codes = {value: code for code, value in enumerate(df[col].unique())}
    return df[col].map(codes)

In [81]:
# Preview the first rows of column A1.
df['A1'].head()

0    b
1    a
2    a
3    b
4    b
Name: A1, dtype: object

In [82]:
# Cast the numeric attributes to float in a single astype call.
float_cols = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
df = df.astype(dict.fromkeys(float_cols, float))

# Binarize: replace each value of these columns (target included) by its integer code.
for col in ('A1', 'A9', 'A10', 'A12', 'label'):
    df[col] = to_binary(df, col)

# Categorical attributes that get one indicator column per category.
onehot_cols = ['A4', 'A5', 'A6', 'A7', 'A13']

# One-hot encode, then swap the original categorical columns for the indicators.
indicators = pd.get_dummies(df[onehot_cols], dtype=int)
df = pd.concat([df.drop(columns=onehot_cols), indicators], axis=1)

In [83]:
# Display the dtype of every column in the frame.
df.dtypes

A1         int64
A2       float64
A3       float64
A8       float64
A9         int64
A10        int64
A11      float64
A12        int64
A14      float64
A15      float64
label      int64
A4_l       int64
A4_u       int64
A4_y       int64
A5_g       int64
A5_gg      int64
A5_p       int64
A6_aa      int64
A6_c       int64
A6_cc      int64
A6_d       int64
A6_e       int64
A6_ff      int64
A6_i       int64
A6_j       int64
A6_k       int64
A6_m       int64
A6_q       int64
A6_r       int64
A6_w       int64
A6_x       int64
A7_bb      int64
A7_dd      int64
A7_ff      int64
A7_h       int64
A7_j       int64
A7_n       int64
A7_o       int64
A7_v       int64
A7_z       int64
A13_g      int64
A13_p      int64
A13_s      int64
dtype: object

In [90]:
# Split the frame into a target vector and a feature matrix, both as NumPy
# arrays, which we can then convert to tensors.
labels = df['label'].to_numpy()
features = df.drop(columns='label').to_numpy()