In [1]:
import pandas as pd
import numpy as np

from interpret.glassbox import ExplainableBoostingClassifier
from interpret import show

# Prepare training data
#
# The cut-offs below were picked by eye from the training split. Neither EBM nor
# a decision tree needs this preprocessing; it is applied only to collapse each
# measurement into a handful of integer codes.
PLANT_COLUMNS = ["sl", "sw", "pl", "pw", "target"]

# Per the author's inspection of the training data:
SL_LOW = 4.9    # sl strictly below this was only seen for setosa    -> code 0
SL_HIGH = 5.8   # sl strictly above this was only seen for virginica -> code 1
SW_HIGH = 3.8   # sw strictly above this was only seen for setosa    -> code 0
PL_SPLIT = 2    # pl strictly above this was only seen for virginica -> code 1
PW_SPLIT = 1    # pw strictly below this was only seen for setosa    -> code 0
AMBIGUOUS = 2   # code for a value that does not single out one class


def encode_plant_features(df):
    """Return a copy of ``df`` with every column replaced by an integer code.

    ``df`` must contain the columns ``sl``, ``sw``, ``pl``, ``pw`` and
    ``target``; the input frame itself is not modified.

    All comparisons are strict, so a value lying exactly on a threshold
    (e.g. sl == 4.9 or sl == 5.8) falls on the "otherwise" side:

    * ``sl``:     1 if > SL_HIGH, 0 if < SL_LOW, otherwise AMBIGUOUS
    * ``sw``:     0 if > SW_HIGH, otherwise AMBIGUOUS
    * ``pl``:     1 if > PL_SPLIT, otherwise 0
    * ``pw``:     0 if < PW_SPLIT, otherwise 1
    * ``target``: 0 for the label "setosa", 1 for any other value
    """
    return df.assign(
        sl=np.where(
            df["sl"] > SL_HIGH, 1, np.where(df["sl"] < SL_LOW, 0, AMBIGUOUS)
        ),
        sw=np.where(df["sw"] > SW_HIGH, 0, AMBIGUOUS),
        pl=np.where(df["pl"] > PL_SPLIT, 1, 0),
        pw=np.where(df["pw"] < PW_SPLIT, 0, 1),
        target=np.where(df["target"] == "setosa", 0, 1),
    )


# read_csv takes the first row of the file as the header; those names are then
# replaced with the short column names used throughout the notebook.
df_train = pd.read_csv("plant-train.csv")
df_train.columns = PLANT_COLUMNS
df_train = encode_plant_features(df_train)


In [2]:
# Lift pandas' row-display cap (for the rest of the session) so the whole
# frame is printed instead of a truncated preview.
pd.options.display.max_rows = None
print(df_train)

    sl  sw  pl  pw  target
0    2   2   0   0       0
1    2   2   0   0       0
2    0   2   0   0       0
3    0   2   0   0       0
4    2   2   0   0       0
5    2   0   0   0       0
6    0   2   0   0       0
7    2   2   0   0       0
8    0   2   0   0       0
9    2   0   0   0       0
10   2   0   0   0       0
11   2   0   0   0       0
12   2   2   0   0       0
13   2   2   0   0       0
14   2   2   0   0       0
15   2   2   0   0       0
16   2   2   0   0       0
17   0   2   0   0       0
18   2   2   0   0       0
19   0   2   0   0       0
20   2   2   0   0       0
21   2   2   0   0       0
22   2   2   0   0       0
23   2   2   0   0       0
24   0   2   0   0       0
25   0   2   0   0       0
26   2   2   0   0       0
27   2   0   0   0       0
28   2   0   0   0       0
29   2   2   0   0       0
30   2   2   0   0       0
31   2   2   0   0       0
32   2   2   0   0       0
33   0   2   0   0       0
34   2   2   0   0       0
35   2   2   0   0       0
3

In [3]:
# Prepare Test data
# Load the test split and give it the same short column names as the training frame.
df_test = pd.read_csv("plant-test.csv")
df_test.columns = ["sl", "sw", "pl", "pw", "target"]

# Re-code every column as a small integer, using the same cut-offs that are
# applied to the training data. EBM and Decision Tree do not need this step;
# the groupings come from eyeballing the data. All comparisons are strict, so a
# value sitting exactly on a cut-off takes the fallback code.
#   sl:     above 5.8 -> 1 (virginica), below 4.9 -> 0 (setosa), otherwise 2 (either)
#   sw:     above 3.8 -> 0 (setosa), otherwise 2 (either)
#   pl:     above 2   -> 1 (virginica), otherwise 0
#   pw:     below 1   -> 0 (setosa), otherwise 1
#   target: "setosa"  -> 0, anything else -> 1
df_test = df_test.assign(
    sl=np.select(
        [df_test["sl"] > 5.8, df_test["sl"] < 4.9],
        [1, 0],
        default=2,
    ),
    sw=np.where(df_test["sw"] > 3.8, 0, 2),
    pl=np.where(df_test["pl"] > 2, 1, 0),
    pw=np.where(df_test["pw"] < 1, 0, 1),
    target=np.where(df_test["target"] == "setosa", 0, 1),
)

In [None]:
# Print the integer-encoded test frame built in the previous cell. How many
# rows are shown depends on pandas' display.max_rows option at the time this runs.
print(df_test)

In [12]:
import pandas as pd

# Reload the raw training split. NOTE: this rebinds df_train, replacing the
# integer-encoded frame built in the first cell with the original measurements.
df_train = pd.read_csv("plant-train.csv")
df_train.columns = ["sl", "sw", "pl", "pw", "target"]

# The last column is the label; every column before it is a feature.
train_label = df_train.columns[-1]
train_cols = df_train.columns[:-1]
y_train = df_train[train_label]
X_train = df_train[train_cols]
print(X_train.index)

RangeIndex(start=0, stop=80, step=1)


In [14]:
from sklearn.preprocessing import MinMaxScaler

# Learn each feature's minimum and maximum from the training features.
scaler = MinMaxScaler().fit(X_train)

# By default the scaler returns a plain NumPy array, so the column names and
# row index are re-attached when wrapping the result in a DataFrame.
loan = pd.DataFrame(
    scaler.transform(X_train),
    index=X_train.index,
    columns=train_cols,
)
pd.options.display.max_rows = None  # print every row rather than a truncated preview
print(loan)

# Note: MinMaxScaler is a normalization technique that rescales each feature to the range 0 to 1. Other techniques standardize instead, using the mean and standard deviation.
# You can either normalize or standardize; here I am only going to normalize. With decision trees it is not really necessary, but it may matter more with EBM because
# a gradient is used? - Not sure how correct that is, but will check it. (TODO confirm: EBM also appears to fit small trees on binned features, in which case rescaling may not matter there either.)

          sl        sw        pl        pw
0   0.200000  0.590909  0.067797  0.041667
1   0.142857  0.363636  0.067797  0.041667
2   0.085714  0.454545  0.050847  0.041667
3   0.057143  0.409091  0.084746  0.041667
4   0.171429  0.636364  0.067797  0.041667
5   0.285714  0.772727  0.118644  0.125000
6   0.057143  0.545455  0.067797  0.083333
7   0.171429  0.545455  0.084746  0.041667
8   0.000000  0.318182  0.067797  0.041667
9   0.400000  0.818182  0.033898  0.041667
10  0.371429  1.000000  0.084746  0.125000
11  0.285714  0.772727  0.050847  0.125000
12  0.200000  0.590909  0.067797  0.083333
13  0.371429  0.727273  0.118644  0.083333
14  0.200000  0.727273  0.084746  0.083333
15  0.285714  0.545455  0.118644  0.041667
16  0.200000  0.681818  0.084746  0.125000
17  0.057143  0.636364  0.000000  0.041667
18  0.200000  0.500000  0.118644  0.166667
19  0.114286  0.545455  0.152542  0.041667
20  0.171429  0.363636  0.101695  0.041667
21  0.171429  0.545455  0.101695  0.125000
22  0.22857