In [1]:
from os import makedirs
from urllib.request import urlretrieve
from pandas import read_csv, Categorical, to_numeric
import sklearn.preprocessing as sklearn
import pandas as pd
import numpy as np

pd.set_option('display.max_rows', 10)

In [2]:
# Setup folders if they don't exist already
try:
    makedirs("raw-data")
    makedirs("raw-data/iris")
    makedirs("cleaned-data")
    makedirs("cleaned-data/iris")
except OSError as e:
    print("Folders already exist.")

In [3]:
# Get the Iris Data Set
urlretrieve("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data", "raw-data/iris/iris.data")
urlretrieve("http://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.names", "raw-data/iris/iris.names")

('raw-data/iris/iris.names', <http.client.HTTPMessage at 0x1d9e994acc8>)

In [4]:
# Put data sets into dataframes
df_iris = read_csv("raw-data/iris/iris.data",
                   header=None)
df_iris

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
def MinMaxScalar(df):
    for col in df.columns:
        maximum = df[col].max()
        minimum = df[col].min()
        for index, val in enumerate(df[col]):
            df[col][index] = (df[col][index] - minimum) / maximum
    return df

In [6]:
df_iris[[0, 1, 2, 3]] = MinMaxScalar(df_iris.copy()[[0, 1, 2, 3]])
df_iris[4] = Categorical(df_iris[4])
df_iris[4] = df_iris[4].cat.codes
df_iris = df_iris.sample(frac=1).reset_index(drop=True)
df_iris

Unnamed: 0,0,1,2,3,4
0,0.177215,0.181818,0.449275,0.48,1
1,0.075949,0.090909,0.333333,0.36,1
2,0.227848,0.227273,0.521739,0.52,1
3,0.101266,0.295455,0.101449,0.16,0
4,0.151899,0.500000,0.057971,0.04,0
...,...,...,...,...,...
145,0.253165,0.181818,0.594203,0.56,2
146,0.430380,0.181818,0.826087,0.76,2
147,0.113924,0.318182,0.057971,0.04,0
148,0.303797,0.113636,0.695652,0.68,2


In [7]:
# rows = df_iris.shape[0]
# df_iris.iloc[0:int(rows*0.7), :].to_json("cleaned-data/iris/iris-training.json", orient='values')
# df_iris.iloc[int(rows*0.7):, :].to_json("cleaned-data/iris/iris-testing.json", orient='values')
df_iris.to_json("cleaned-data/iris/iris-formatted.json", orient='values')

In [8]:
try:
    makedirs("raw-data/breast-cancer-wisconsin")
    makedirs("cleaned-data/breast-cancer-wisconsin")
except OSError as e:
    print("Folders already exist.")

In [9]:
# Get the Breast Cancer Wisconsin Data Set
urlretrieve("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data", "raw-data/breast-cancer-wisconsin/breast-cancer-wisconsin.data")
urlretrieve("http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.names", "raw-data/breast-cancer-wisconsin/breast-cancer-wisconsin.names")

('raw-data/breast-cancer-wisconsin/breast-cancer-wisconsin.names',
 <http.client.HTTPMessage at 0x1d9e99cf6c8>)

In [10]:
# Put data sets into dataframes
df_breastcancer = read_csv("raw-data/breast-cancer-wisconsin/breast-cancer-wisconsin.data",
                   header=None)
df_breastcancer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [11]:
# Convert Ouptut to Binary Classification (0: Benign, 1: Malignant) 
df_breastcancer[10] = df_breastcancer[10].map({2: 0, 4: 1})

df_breastcancer

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,0
695,841769,2,1,1,1,2,1,1,1,1,0
696,888820,5,10,10,3,7,3,8,10,2,1
697,897471,4,8,6,4,3,4,10,6,1,1


In [12]:
# Remove Id Column Corresponding To Entry
del df_breastcancer[0]

df_breastcancer

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,0
695,2,1,1,1,2,1,1,1,1,0
696,5,10,10,3,7,3,8,10,2,1
697,4,8,6,4,3,4,10,6,1,1


In [13]:
# Remove All Entries With Missing Data
pd.set_option('display.max_rows', 10)
df_breastcancer = df_breastcancer[~df_breastcancer[6].isin(['?'])]
df_breastcancer = df_breastcancer.astype(float)
df_breastcancer

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,0.0
1,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,0.0
2,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,0.0
3,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,0.0
4,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
694,3.0,1.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,0.0
695,2.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0,1.0,0.0
696,5.0,10.0,10.0,3.0,7.0,3.0,8.0,10.0,2.0,1.0
697,4.0,8.0,6.0,4.0,3.0,4.0,10.0,6.0,1.0,1.0


In [14]:
# Scaling Data
names = df_breastcancer.columns[0:10]
scaler = sklearn.MinMaxScaler()
scaled_df_breastcancer = scaler.fit_transform(df_breastcancer.iloc[:,0:10]) 
scaled_df_breastcancer = pd.DataFrame(scaled_df_breastcancer, columns=names)
scaled_df_breastcancer[10] = to_numeric(scaled_df_breastcancer[10], downcast="integer") 
scaled_df_breastcancer

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,0.444444,0.000000,0.000000,0.000000,0.111111,0.000000,0.222222,0.000000,0.000000,0
1,0.444444,0.333333,0.333333,0.444444,0.666667,1.000000,0.222222,0.111111,0.000000,0
2,0.222222,0.000000,0.000000,0.000000,0.111111,0.111111,0.222222,0.000000,0.000000,0
3,0.555556,0.777778,0.777778,0.000000,0.222222,0.333333,0.222222,0.666667,0.000000,0
4,0.333333,0.000000,0.000000,0.222222,0.111111,0.000000,0.222222,0.000000,0.000000,0
...,...,...,...,...,...,...,...,...,...,...
678,0.222222,0.000000,0.000000,0.000000,0.222222,0.111111,0.000000,0.000000,0.000000,0
679,0.111111,0.000000,0.000000,0.000000,0.111111,0.000000,0.000000,0.000000,0.000000,0
680,0.444444,1.000000,1.000000,0.222222,0.666667,0.222222,0.777778,1.000000,0.111111,1
681,0.333333,0.777778,0.555556,0.333333,0.222222,0.333333,1.000000,0.555556,0.000000,1


In [15]:
# Saving Cleaned Data for Training and Testing
# rows = scaled_df_breastcancer.shape[0]
# scaled_df_breastcancer.iloc[0:int(rows*0.7), :].to_json("cleaned-data/breast-cancer-wisconsin/breast-cancer-wisconsin-training.json", orient='values')
# scaled_df_breastcancer.iloc[int(rows*0.7):, :].to_json("cleaned-data/breast-cancer-wisconsin/breast-cancer-wisconsin-testing.json", orient='values')
scaled_df_breastcancer.to_json("cleaned-data/breast-cancer-wisconsin/breast-cancer-wisconsin-formatted.json", orient='values')