In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# NOTE: plot_confusion_matrix was removed in scikit-learn 1.2; on newer
# versions use sklearn.metrics.ConfusionMatrixDisplay instead.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree

In [2]:
# Load the 'new' sheet of the workbook. The context manager closes the
# underlying file handle as soon as parsing is done instead of leaving it
# open for the lifetime of the kernel.
with pd.ExcelFile('../data/PCA_data.xlsx') as xl:
    df = xl.parse('new')

# Quick sanity check: first rows and overall (rows, columns) shape.
print(df.head())
print(df.shape)

   Amplitude  Duration  Accumulated dose         N        DGR
0       1000       168              1000  2.609411  10.090759
1       1000       168              1000  2.979771   5.136975
2       1000       168              1000  2.865221   6.164093
3       2000         4              2000  1.999982 -15.801756
4       2000         4              2000  2.760342 -14.282713
(42, 5)


In [3]:
# Confirm the dtype pandas inferred for every column.
df.dtypes

Amplitude             int64
Duration              int64
Accumulated dose      int64
N                   float64
DGR                 float64
dtype: object

In [4]:
# List the distinct Amplitude levels present in the data.
df['Amplitude'].unique()

array([1000, 2000,  500,  200], dtype=int64)

In [5]:
# List the distinct Duration levels present in the data.
df['Duration'].unique()

array([168,   4], dtype=int64)

In [6]:
# List the distinct Accumulated dose levels present in the data.
df['Accumulated dose'].unique()

array([1000, 2000,  500, 1500], dtype=int64)

In [7]:
# Scan the distinct N values for unexpected or missing (NaN) entries.
df['N'].unique()

array([2.60941124, 2.97977114, 2.86522079, 1.99998224, 2.76034188,
       2.02693844, 2.17550278, 2.12057209, 1.13963091, 2.87504005,
       3.13118458, 2.93488932, 3.76830077, 4.23490095, 3.91720724,
       2.74445581, 3.35874772, 2.40742111, 2.3934567 , 2.25557375,
       2.75702429, 2.57193875, 2.94725585, 2.03067064, 2.21811104,
       2.73235106, 2.18040442, 2.63888049, 2.36608624,        nan,
       3.49629664, 1.91922283, 3.15910721, 1.45922875, 1.83478403,
       2.3710568 , 3.31671357, 2.26910806, 3.03569031, 1.61825013,
       1.79612362, 1.91841853])

In [8]:
# Scan the distinct DGR values for unexpected or missing (NaN) entries.
df['DGR'].unique()

array([ 10.09075937,   5.1369751 ,   6.16409341, -15.80175557,
       -14.28271304,  -4.96772008,  -4.21029912,  -6.60111967,
        -2.45999912,   6.36253373,  -0.28819403,   5.55504115,
         8.05551088,   5.76040426,   7.50879956,  -1.18409955,
        11.62953285,  13.18704143,  12.49508536,  13.05067071,
        12.63552954,  11.48128541,  16.76895331,  15.96349639,
        17.86473089,  16.6559142 ,  13.59034112,  14.1150525 ,
        15.48771661,  16.99308128,  17.10418622,  16.1970621 ,
        16.54221405,  14.87595436,  17.97101142,  13.9852281 ,
        12.2109996 ,  12.35358121])

In [9]:
# Number of rows whose N value is missing. Series.isna() is the pandas-native
# null check (no `== True` comparison, and it also works for non-float
# dtypes); int() keeps the displayed result a plain Python integer.
int(df['N'].isna().sum())

1

In [10]:
# Show the rows whose N value is missing, using the boolean mask directly
# rather than comparing it to True.
df.loc[df['N'].isna()]

Unnamed: 0,Amplitude,Duration,Accumulated dose,N,DGR
29,500,168,1000,,13.187041


In [11]:
# Total number of rows before any filtering.
len(df)

42

In [12]:
# Drop the rows with a missing N value. The explicit .copy() makes the result
# an independent frame, so later edits to it never touch (or warn about) df.
df_no_missing_N = df.dropna(subset=['N']).copy()

In [13]:
# Inspect the frame with the missing-N rows removed.
df_no_missing_N

Unnamed: 0,Amplitude,Duration,Accumulated dose,N,DGR
0,1000,168,1000,2.609411,10.090759
1,1000,168,1000,2.979771,5.136975
2,1000,168,1000,2.865221,6.164093
3,2000,4,2000,1.999982,-15.801756
4,2000,4,2000,2.760342,-14.282713
5,2000,4,2000,2.026938,-4.96772
6,500,4,500,2.175503,-4.210299
7,500,4,500,2.120572,-6.60112
8,500,4,500,1.139631,-2.459999
9,1000,168,1000,2.87504,6.362534


In [14]:
# Row count after removing rows with a missing N value.
len(df_no_missing_N)

41

In [15]:
# The original frame is unchanged: the filtered rows were written to a
# separate copy (df_no_missing_N), not back into df.
len(df)

42

In [16]:
# Feature matrix for the DGR model. Columns are selected by name rather than
# by position so the cell does not silently break if the column order of the
# sheet changes.
X1 = df.loc[:, ['Amplitude', 'Duration', 'Accumulated dose']]
X1

# X1 keeps every row of df. Only X2 (built from df_no_missing_N) is one row
# shorter, because the row with a missing N value is dropped there.

Unnamed: 0,Amplitude,Duration,Accumulated dose
0,1000,168,1000
1,1000,168,1000
2,1000,168,1000
3,2000,4,2000
4,2000,4,2000
5,2000,4,2000
6,500,4,500
7,500,4,500
8,500,4,500
9,1000,168,1000


In [17]:
# Feature matrix for the N model, taken from the frame without missing N
# values. Columns are selected by name rather than by position so the cell
# does not depend on the column order of the sheet.
X2 = df_no_missing_N.loc[:, ['Amplitude', 'Duration', 'Accumulated dose']]
X2

Unnamed: 0,Amplitude,Duration,Accumulated dose
0,1000,168,1000
1,1000,168,1000
2,1000,168,1000
3,2000,4,2000
4,2000,4,2000
5,2000,4,2000
6,500,4,500
7,500,4,500
8,500,4,500
9,1000,168,1000


In [18]:
# X2 must have the same number of rows as df_no_missing_N.
len(X2)

41

In [19]:
# Targets: DGR is taken from the full frame, N from the frame with the
# missing-N row removed. Both are independent copies of the source column.
y1 = df.loc[:, 'DGR'].copy()
y2 = df_no_missing_N.loc[:, 'N'].copy()

In [20]:
# Display both target series side by side as a tuple.
y1,y2

(0     10.090759
 1      5.136975
 2      6.164093
 3    -15.801756
 4    -14.282713
 5     -4.967720
 6     -4.210299
 7     -6.601120
 8     -2.459999
 9      6.362534
 10     6.164093
 11    -0.288194
 12     5.555041
 13     8.055511
 14     5.760404
 15     7.508800
 16    -1.184100
 17    11.629533
 18    13.187041
 19    12.495085
 20    13.050671
 21    12.635530
 22    11.481285
 23    16.768953
 24    15.963496
 25    17.864731
 26    16.655914
 27    13.590341
 28    14.115053
 29    13.187041
 30    15.487717
 31    14.115053
 32    16.993081
 33    17.104186
 34    16.197062
 35    16.542214
 36    14.875954
 37    17.971011
 38    13.985228
 39    14.115053
 40    12.211000
 41    12.353581
 Name: DGR, dtype: float64,
 0     2.609411
 1     2.979771
 2     2.865221
 3     1.999982
 4     2.760342
 5     2.026938
 6     2.175503
 7     2.120572
 8     1.139631
 9     2.875040
 10    3.131185
 11    2.934889
 12    3.768301
 13    4.234901
 14    3.917207
 15    2.744456
 1

In [21]:
# Display the full original frame for reference.
df

Unnamed: 0,Amplitude,Duration,Accumulated dose,N,DGR
0,1000,168,1000,2.609411,10.090759
1,1000,168,1000,2.979771,5.136975
2,1000,168,1000,2.865221,6.164093
3,2000,4,2000,1.999982,-15.801756
4,2000,4,2000,2.760342,-14.282713
5,2000,4,2000,2.026938,-4.96772
6,500,4,500,2.175503,-4.210299
7,500,4,500,2.120572,-6.60112
8,500,4,500,1.139631,-2.459999
9,1000,168,1000,2.87504,6.362534


In [22]:
# Feature data types
#
# Amplitude        - categorical levels, stored as int
# Duration         - categorical levels, stored as int
# Accumulated dose - categorical levels, stored as int

X1.dtypes

Amplitude           int64
Duration            int64
Accumulated dose    int64
dtype: object

In [25]:
# Hold out a test split for each target (default 75% train / 25% test).
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42)
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

# DGR and N are continuous (float) targets, so these are regression problems.
# DecisionTreeClassifier rejects such targets with
# "ValueError: Unknown label type: 'continuous'", hence DecisionTreeRegressor.
# The clf_dt* names are kept so that any later cell referring to them still works.
clf_dt1 = DecisionTreeRegressor(random_state=42)
clf_dt1 = clf_dt1.fit(X1_train, y1_train)

clf_dt2 = DecisionTreeRegressor(random_state=42)
clf_dt2 = clf_dt2.fit(X2_train, y2_train)

In [None]:
# Inspect the held-out feature rows for the DGR model.
X1_test

In [None]:
# Re-check the feature dtypes.
X1.dtypes