# Read dataset

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Locate the raw data file and load it as a 2-D array of strings
dir_data = './data/'
raw_data = os.path.join(dir_data, 'crx.data')
# dtype=str: every field is read as text; numeric columns are cast later.
data = np.genfromtxt(raw_data, dtype=str, delimiter=",")
# Column names 'A1' .. 'A16', one per comma-separated field.
label = ['A%d' % number for number in range(1, 17)]

# Put the parsed text data into a DataFrame

In [2]:
# Handle missing values: every '?' field is overwritten with '0', in place
# in `data`.
# NOTE(review): any '?' that occurs in A2/A3/A8 becomes 0.0 after the cast
# below and therefore counts toward statistics computed on `df`.
data[data == '?'] = '0'
arrange_data = list(data)  # one row array per record

# Columns that hold continuous values and are cast from text to float32.
float_columns = dict.fromkeys(['A2', 'A3', 'A8'], 'float32')

df = pd.DataFrame(arrange_data)
df.columns = label
df = df.astype(float_columns)
df

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,A16
0,b,30.830000,0.000,u,g,w,v,1.250,t,t,01,f,g,00202,0,+
1,a,58.669998,4.460,u,g,q,h,3.040,t,t,06,f,g,00043,560,+
2,a,24.500000,0.500,u,g,q,h,1.500,t,f,0,f,g,00280,824,+
3,b,27.830000,1.540,u,g,w,v,3.750,t,t,05,t,g,00100,3,+
4,b,20.170000,5.625,u,g,w,v,1.710,t,f,0,f,s,00120,0,+
5,b,32.080002,4.000,u,g,m,v,2.500,t,f,0,t,g,00360,0,+
6,b,33.169998,1.040,u,g,r,h,6.500,t,f,0,t,g,00164,31285,+
7,a,22.920000,11.585,u,g,cc,v,0.040,t,f,0,f,g,00080,1349,+
8,b,54.419998,0.500,y,p,k,h,3.960,t,f,0,f,g,00180,314,+
9,b,42.500000,4.915,y,p,w,v,3.165,t,f,0,t,g,00052,1442,+


# Mean

In [3]:
# Column-wise mean of the numeric columns only. The remaining columns hold
# strings; without numeric_only=True, pandas >= 2.0 raises TypeError instead
# of silently skipping them.
df.mean(numeric_only=True)

A2    31.019157
A3     4.758725
A8     2.223407
dtype: float32

# Median

In [4]:
# Column-wise median of the numeric columns only. The remaining columns hold
# strings; without numeric_only=True, pandas >= 2.0 raises TypeError instead
# of silently skipping them.
df.median(numeric_only=True)

A2    28.17
A3     2.75
A8     1.00
dtype: float32

# Standard deviation

In [5]:
# Column-wise sample standard deviation of the numeric columns only. The
# remaining columns hold strings; without numeric_only=True, pandas >= 2.0
# raises TypeError instead of silently skipping them.
df.std(numeric_only=True)

A2    12.552082
A3     4.978165
A8     3.346511
dtype: float32

# Split training data & testing data

In [6]:
# Features are every column except the class label 'A16'; the label is the
# target vector.
crx_data = df.drop(columns=['A16'])
crx_label = df['A16']

# Hold out 33% of the rows for testing. stratify keeps the class ratio of
# A16 the same in both partitions; the fixed random_state makes the split
# (and everything derived from it) reproducible across kernel restarts.
train_data, test_data, train_label, test_label = train_test_split(
    crx_data,
    crx_label,
    test_size=0.33,
    stratify=crx_label,
    random_state=42,
)
train_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15
406,a,40.330002,8.125000,y,p,k,v,0.165,f,t,02,f,g,0,18
281,b,23.920000,0.585000,y,p,cc,h,0.125,f,f,0,f,g,00240,1
270,b,37.580002,0.000000,0,0,0,0,0.000,f,f,0,f,p,0,0
350,a,26.170000,2.000000,u,g,j,j,0.000,f,f,0,t,g,00276,1
106,b,28.750000,1.165000,u,g,k,v,0.500,t,f,0,f,s,00280,0
405,a,69.500000,6.000000,u,g,ff,ff,0.000,f,f,0,f,s,00000,0
111,a,24.500000,1.040000,y,p,ff,ff,0.500,t,t,03,f,g,00180,147
210,b,39.330002,5.875000,u,g,cc,h,10.000,t,t,14,t,g,00399,0
129,a,28.420000,3.500000,u,g,w,v,0.835,t,f,0,f,s,00280,0
473,b,19.170000,4.000000,y,p,i,v,1.000,f,f,0,t,g,00360,1000
