# Data Preprocessing
Below are the preprocessing procedures applied to each dataset.

###### dependencies

In [None]:
# third-party imports
import pandas as pd

# Sample Dataframe Manipulation

In [4]:
# votes = pd.DataFrame(pd.read_csv('voting-records/house-votes-84.data',header=None))
# votes[votes=="?"] = np.nan
# votes=votes.dropna(axis = 0)
# votes[votes=="y"] = 1
# votes[votes=="n"] = 0
# votes[votes=="democrat"] = 1
# votes[votes=="republican"] = 0
# votes.columns=["democrat", "handicapped-infants", "water-project-cost-sharing", "adoption-of-the-budget-resolution", 
#                "physician-fee-freeze", "el-salvador-aid", "religious-groups-in-schools", "anti-satellite-test-ban",
#                "aid-to-nicaraguan-contras", "mx-missile", "immigration", "synfuels-corporation-cutback", 
#                "education-spending", "superfund-right-to-sue", "crime", "duty-free-exports", 
#                "export-administration-act-south-africa"
#               ]
# votes.to_csv('voting-records-binary.csv',index=False,sep=',')

# Monk 1

In [28]:
# monk1: one-hot encode the MONK-1 training split and export it as a 0/1 CSV.
monk1 = pd.read_csv('data/monks-problems/monks-1.train', header=None, delimiter=" ")
# Columns 0 and 8 are discarded (presumably the empty field created by each
# line's leading space and the trailing sample id -- confirm against the raw file).
monk1 = monk1.drop(columns=[0, 8])
# Cast every value to a string so get_dummies treats all columns as categorical.
# astype(str) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
monk1 = monk1.astype(str)
# get_dummies returns bool columns on pandas >= 2.0, which would be written to
# the CSV as True/False; casting to int keeps the output as 0/1 on every version.
monk1 = pd.get_dummies(monk1, prefix=["class", "a1", "a2", "a3", "a4", "a5", "a6"]).astype(int)
# One indicator per variable is removed so the remaining ones are non-redundant.
monk1 = monk1.drop(columns=['class_0', 'a1_3', 'a2_3', 'a3_2', 'a4_3', 'a5_4', 'a6_2'])
# Fix the column order: attribute indicators first, the target (class_1) last.
columns = ["a1_1","a1_2","a2_1","a2_2","a3_1","a4_1","a4_2","a5_1","a5_2","a5_3","a6_1","class_1"]
monk1 = monk1[columns]

monk1.to_csv('data/preprocessed/monk1-train.csv',index=False,sep=';')
monk1; # remove semicolon to reveal the table

# Monk 2

In [29]:
# monk2: one-hot encode the MONK-2 training split and export it as a 0/1 CSV.
monk2 = pd.read_csv('data/monks-problems/monks-2.train', header=None, delimiter=" ")
# Columns 0 and 8 are discarded (presumably the empty field created by each
# line's leading space and the trailing sample id -- confirm against the raw file).
monk2 = monk2.drop(columns=[0, 8])
# Cast every value to a string so get_dummies treats all columns as categorical.
# astype(str) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
monk2 = monk2.astype(str)
# get_dummies returns bool columns on pandas >= 2.0, which would be written to
# the CSV as True/False; casting to int keeps the output as 0/1 on every version.
monk2 = pd.get_dummies(monk2, prefix=["class", "a1", "a2", "a3", "a4", "a5", "a6"]).astype(int)
# One indicator per variable is removed so the remaining ones are non-redundant.
monk2 = monk2.drop(columns=['class_0', 'a1_3', 'a2_3', 'a3_2', 'a4_3', 'a5_4', 'a6_2'])
# Fix the column order: attribute indicators first, the target (class_1) last.
columns = ["a1_1","a1_2","a2_1","a2_2","a3_1","a4_1","a4_2","a5_1","a5_2","a5_3","a6_1","class_1"]
monk2 = monk2[columns]

monk2.to_csv('data/preprocessed/monk2-train.csv',index=False,sep=';')
monk2; # remove semicolon to reveal the table

# Monk 3

In [30]:
# monk3: one-hot encode the MONK-3 training split and export it as a 0/1 CSV.
monk3 = pd.read_csv('data/monks-problems/monks-3.train', header=None, delimiter=" ")
# Columns 0 and 8 are discarded (presumably the empty field created by each
# line's leading space and the trailing sample id -- confirm against the raw file).
monk3 = monk3.drop(columns=[0, 8])
# Cast every value to a string so get_dummies treats all columns as categorical.
# astype(str) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
monk3 = monk3.astype(str)
# get_dummies returns bool columns on pandas >= 2.0, which would be written to
# the CSV as True/False; casting to int keeps the output as 0/1 on every version.
monk3 = pd.get_dummies(monk3, prefix=["class", "a1", "a2", "a3", "a4", "a5", "a6"]).astype(int)
# One indicator per variable is removed so the remaining ones are non-redundant.
monk3 = monk3.drop(columns=['class_0', 'a1_3', 'a2_3', 'a3_2', 'a4_3', 'a5_4', 'a6_2'])
# Fix the column order: attribute indicators first, the target (class_1) last.
columns = ["a1_1","a1_2","a2_1","a2_2","a3_1","a4_1","a4_2","a5_1","a5_2","a5_3","a6_1","class_1"]
monk3 = monk3[columns]

monk3.to_csv('data/preprocessed/monk3-train.csv',index=False,sep=';')
monk3; # remove semicolon to reveal the table

# Balance Scale

In [11]:
# Balance Scale: binarize the target, one-hot encode the attributes and export as a 0/1 CSV.
balance_scale = pd.read_csv('data/balance-scale/balance-scale.data', header=None, delimiter=",")
balance_scale.columns = ["class", "a1", "a2", "a3", "a4"]
# Binary target: True for rows labelled "B", False for every other label.
balance_scale['class'] = (balance_scale['class']=="B")

# Cast every value to a string so get_dummies treats all columns as categorical
# (the boolean target becomes 'True'/'False', hence class_True/class_False below).
# astype(str) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
balance_scale = balance_scale.astype(str)
# get_dummies returns bool columns on pandas >= 2.0, which would be written to
# the CSV as True/False; casting to int keeps the output as 0/1 on every version.
balance_scale = pd.get_dummies(balance_scale, prefix=["class", "a1", "a2", "a3", "a4"]).astype(int)
# Re-create a single 'class' column (appended as the last column) from the
# positive indicator, then drop both class indicators and one level per attribute.
balance_scale['class']=balance_scale['class_True']
balance_scale = balance_scale.drop(columns=['class_True','class_False', 'a1_5', 'a2_5','a3_5','a4_5'])

balance_scale.to_csv('data/preprocessed/balance-scale.csv',index=False,sep=';')
balance_scale; # remove semicolon to reveal the table

# Car Evaluation

In [31]:
# Car Evaluation: binarize the target, one-hot encode every column and export as a 0/1 CSV.
car = pd.read_csv('data/car/car.data', header=None, delimiter=",")
car.columns = ["a1", "a2", "a3", "a4", "a5", "a6", "class"]
# Binary target: 0 for rows labelled "unacc", 1 for every other label.
car['class'] = (car['class'] != "unacc").astype(int)
# Cast every value to a string so get_dummies treats all columns as categorical
# (the 0/1 target becomes class_0/class_1).
# astype(str) replaces DataFrame.applymap, which is deprecated since pandas 2.1.
car = car.astype(str)
# get_dummies returns bool columns on pandas >= 2.0, which would be written to
# the CSV as True/False; casting to int keeps the output as 0/1 on every version.
car = pd.get_dummies(car).astype(int)
# One indicator per variable is removed so the remaining ones are non-redundant.
car = car.drop(columns=['a1_vhigh', 'a2_vhigh', 'a3_5more', 'a4_more', 'a5_small', 'a6_med', 'class_0'])
car.to_csv('data/preprocessed/car-evaluation.csv',index=False,sep=';')
car; # remove semicolon to reveal the table

# COMPAS

In [32]:
# COMPAS is loaded from an already-binary CSV; here the age-range indicators are
# re-expressed as cumulative "younger than" indicators and the columns reordered.
compas = pd.read_csv('data/compas/compas-binary.csv')

# Each threshold column is the running sum of the age-range columns up to and
# including the paired bucket (first threshold == first bucket).
age_buckets = ['age:18-20', 'age:21-22', 'age:23-25', 'age:26-45']
age_thresholds = ['age:<21', 'age:<23', 'age:<26', 'age:<46']
running_total = None
for threshold, bucket in zip(age_thresholds, age_buckets):
    if running_total is None:
        running_total = compas[bucket]
    else:
        running_total = running_total + compas[bucket]
    compas[threshold] = running_total

# The original range indicators are no longer needed once the thresholds exist.
compas = compas.drop(columns=age_buckets + ['age:>45'])

# Final column order, with the target as the last column.
columns = [
    'sex:Female',
    'age:<21', 'age:<23', 'age:<26', 'age:<46',
    'juvenile-felonies:=0', 'juvenile-misdemeanors:=0', 'juvenile-crimes:=0',
    'priors:=0', 'priors:=1', 'priors:2-3', 'priors:>3',
    'recidivate-within-two-years:1',
]
compas = compas[columns]

compas.to_csv('data/preprocessed/compas-binary.csv', sep=';', index=False)

compas; # remove semicolon to reveal the table

In [None]:
# I see, I don't think it's necessary to store the partitioned data copies on disk
# We now have a library function to do cross-validation in memory, so the stuff below can go away

## 10 Folds

In [22]:
from sklearn.model_selection import KFold # import KFold

In [23]:
# Define the split: 10 shuffled folds, with a fixed random_state passed to the splitter.
kf = KFold(
    n_splits=10,
    shuffle=True,
    random_state=2018,
)

In [27]:
# Reload every preprocessed dataset from its ';'-separated CSV.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
compas    = pd.read_csv('data/preprocessed/compas-binary.csv', sep=";")
monk1     = pd.read_csv('data/preprocessed/monk1-train.csv', sep=";")
monk2     = pd.read_csv('data/preprocessed/monk2-train.csv', sep=";")
monk3     = pd.read_csv('data/preprocessed/monk3-train.csv', sep=";")
balance   = pd.read_csv('data/preprocessed/balance-scale.csv', sep=";")
tictactoe = pd.read_csv('data/preprocessed/tic-tac-toe.csv', sep=";")
car       = pd.read_csv('data/preprocessed/car-evaluation.csv', sep=";")
fico      = pd.read_csv('data/preprocessed/fico_binary.csv', sep=";")


In [5]:
# Export every train/test partition of `compas` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(compas), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    compas_train, compas_test = compas.iloc[train_index], compas.iloc[test_index]

    compas_train.to_csv('preprocessed/compas-binary.csv.train' + str(i) + '.csv', sep=';', index=False)
    compas_test.to_csv('preprocessed/compas-binary.csv.test' + str(i) + '.csv', sep=';', index=False)

In [6]:
# Export every train/test partition of `monk1` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(monk1), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    monk1_train, monk1_test = monk1.iloc[train_index], monk1.iloc[test_index]

    monk1_train.to_csv('preprocessed/monk1-train.csv.train' + str(i) + '.csv', sep=';', index=False)
    monk1_test.to_csv('preprocessed/monk1-train.csv.test' + str(i) + '.csv', sep=';', index=False)

In [7]:
# Export every train/test partition of `monk2` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(monk2), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    monk2_train, monk2_test = monk2.iloc[train_index], monk2.iloc[test_index]

    monk2_train.to_csv('preprocessed/monk2-train.csv.train' + str(i) + '.csv', sep=';', index=False)
    monk2_test.to_csv('preprocessed/monk2-train.csv.test' + str(i) + '.csv', sep=';', index=False)

In [8]:
# Export every train/test partition of `monk3` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(monk3), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    monk3_train, monk3_test = monk3.iloc[train_index], monk3.iloc[test_index]

    monk3_train.to_csv('preprocessed/monk3-train.csv.train' + str(i) + '.csv', sep=';', index=False)
    monk3_test.to_csv('preprocessed/monk3-train.csv.test' + str(i) + '.csv', sep=';', index=False)

In [9]:
# Export every train/test partition of `balance` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(balance), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    balance_train, balance_test = balance.iloc[train_index], balance.iloc[test_index]

    balance_train.to_csv('preprocessed/balance-scale.csv.train' + str(i) + '.csv', sep=';', index=False)
    balance_test.to_csv('preprocessed/balance-scale.csv.test' + str(i) + '.csv', sep=';', index=False)

In [10]:
# Export every train/test partition of `tictactoe` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(tictactoe), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    tictactoe_train, tictactoe_test = tictactoe.iloc[train_index], tictactoe.iloc[test_index]

    tictactoe_train.to_csv('preprocessed/tic-tac-toe.csv.train' + str(i) + '.csv', sep=';', index=False)
    tictactoe_test.to_csv('preprocessed/tic-tac-toe.csv.test' + str(i) + '.csv', sep=';', index=False)

In [11]:
# Export every train/test partition of `car` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(car), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    car_train, car_test = car.iloc[train_index], car.iloc[test_index]

    car_train.to_csv('preprocessed/car-evaluation.csv.train' + str(i) + '.csv', sep=';', index=False)
    car_test.to_csv('preprocessed/car-evaluation.csv.test' + str(i) + '.csv', sep=';', index=False)

In [12]:
# Export every train/test partition of `fico` as a numbered pair of CSV files.
# NOTE(review): files are written under 'preprocessed/', whereas the preprocessed
# inputs in this notebook are read from 'data/preprocessed/' -- confirm the target.
for i, (train_index, test_index) in enumerate(kf.split(fico), start=1):
    # The index arrays from kf.split are applied positionally via iloc.
    fico_train, fico_test = fico.iloc[train_index], fico.iloc[test_index]

    fico_train.to_csv('preprocessed/fico_binary.csv.train' + str(i) + '.csv', sep=';', index=False)
    fico_test.to_csv('preprocessed/fico_binary.csv.test' + str(i) + '.csv', sep=';', index=False)