In [2]:
import numpy as np
import pandas as pd

from sklearn import tree
from sklearn.metrics import zero_one_loss
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [3]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [4]:
%cd /content/gdrive/MyDrive/Colab Notebooks/stats208/data

/content/gdrive/MyDrive/Colab Notebooks/stats208/data


# Single Classification Tree

1. Split data into test set T and learning set L, L reasonably large
2. Construct classification tree with hyperparameters chosen from 10-fold CV
3. Compute missclassification rate es
4. The random division of the data is repeated 100 times and the reported es is the average over the 100 iterations. 

In [5]:
parameters={ "criterion": ["gini", "entropy", "log_loss"],
            "splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,15],
           # "min_samples_split":[1,2,3,4,5,6,7,8,9,10],
           #"min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
          # "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5],
           #"max_features":["log2","sqrt",None],
           # "min_impurity_decrease":
          # "ccp_alpha"
           #"max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] 
            }

In [11]:
def single_classifier_tree(data, n, cv_n=10):
  es_list = [] 

  for i in range(100):
    learning_set = data.sample(n=n, random_state=i)
    test_set = data[~data.index.isin(learning_set.index)]

    L_x = learning_set.loc[ : , learning_set.columns!='Class']
    L_y = learning_set['Class']
    T_x = test_set.loc[ : , learning_set.columns!='Class']
    T_y = test_set['Class']

    clf = tree.DecisionTreeClassifier()
    tuning_model = RandomizedSearchCV(clf, param_distributions=parameters, scoring='accuracy', cv=cv_n, verbose=0)
    tuning_model.fit(L_x, L_y)

    T_pred = tuning_model.predict(T_x)
    es = zero_one_loss(T_y, T_pred)
    es_list.append(es)
  
  return np.mean(es_list)

#Bagging Classification Trees

1. A bootstrap sample Lb is selected from L, and a tree grown using Lb and 10-fold CV. Repeat 50 times giving 50 tree classifiers. 
2. Predict the class of data xn by doing majority voting on the prediction of 50 classifiers. The proportion of times the estimated class differs from the true class is the bagging missclassification rate eb
3. The random division of the data is repeated 100 times and the reported eb is the average over the 100 iterations. 

In [12]:
def bagging_decision_trees(data, n, bootstrap_rep=50, cv_n=10):
  eb_list = [] 

  for j in range(100):
    learning_set = data.sample(n=n, random_state=j)
    test_set = data[~data.index.isin(learning_set.index)]

    T_x = test_set.loc[ : , learning_set.columns!='Class']
    T_y = test_set['Class']

    preds_df = pd.DataFrame() 

    for i in range(bootstrap_rep):
      Lb = learning_set.sample(frac=1, replace=True, random_state=i)
      Lb_x = Lb.loc[ : , learning_set.columns!='Class']
      Lb_y = Lb['Class']

      clf = tree.DecisionTreeClassifier()
      tuning_model = RandomizedSearchCV(clf, param_distributions=parameters, scoring='accuracy', cv=cv_n, verbose=0)
      tuning_model.fit(Lb_x, Lb_y)

      preds_df[str(i)] = list(tuning_model.predict(T_x))

    T_pred = list(preds_df.mode(axis='columns').iloc[:,0])
    eb = zero_one_loss(T_y, T_pred)
    eb_list.append(eb)
  
  return np.mean(eb_list)

# Waveform 

In [13]:
with open('waveform.names') as f:
    print(f.read())

1. Title: Waveform Database Generator (written in C)
 
2. Source:
   (a) Breiman,L., Friedman,J.H., Olshen,R.A., & Stone,C.J. (1984). 
       Classification and Regression Trees.  Wadsworth International
       Group: Belmont, California.  (see pages 43-49).
   (b) Donor: David Aha 
   (c) Date: 11/10/1988

3. Past Usage:
     1. CART book (above):
        -- Optimal Bayes classification rate: 86% accuracy
        -- CART decision tree algorithm: 72%
        -- Nearest Neighbor Algorithm: 78%
           -- 300 training and 5000 test instances

4. Relevant Information:
     -- 3 classes of waves
     -- 21 attributes, all of which include noise
     -- See the book for details (49-55, 169)
     -- waveform.data.Z contains 5000 instances

5. Number of Instances: chosen by user

6. Number of Attributes:
    -- 21 attributes with continuous values between 0 and 6

7. Attribute Information:
    -- Each class is generated from a combination of 2 of 3 "base" waves
    -- Each instance is gene

In [14]:
data =  pd.read_csv('waveform.data', header=None).reset_index()
data = data.drop(columns=['index'])

In [15]:
data.iloc[:, 21].value_counts()

2    1696
0    1657
1    1647
Name: 21, dtype: int64

In [16]:
data = data.rename(columns={ data.columns[len(list(data))-1]: "Class" })
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,Class
0,-1.23,-1.56,-1.75,-0.28,0.6,2.22,0.85,0.21,-0.2,0.89,...,2.89,7.75,4.59,3.15,5.12,3.32,1.2,0.24,-0.56,2
1,-0.69,2.43,0.61,2.08,2.3,3.25,5.52,4.55,2.97,2.22,...,1.24,1.89,1.88,-1.34,0.83,1.41,1.78,0.6,2.42,1
2,-0.12,-0.94,1.29,2.59,2.42,3.55,4.94,3.25,1.9,2.07,...,2.5,0.12,1.41,2.78,0.64,0.62,-0.01,-0.79,-0.12,0
3,0.86,0.29,2.19,-0.02,1.13,2.51,2.37,5.45,5.45,4.84,...,2.58,1.4,1.24,1.41,1.07,-1.43,2.84,-1.18,1.12,1
4,1.16,0.37,0.4,-0.59,2.66,1.0,2.69,4.06,5.34,3.53,...,4.3,1.84,1.73,0.21,-0.18,0.13,-0.21,-0.8,-0.68,1


In [None]:
single_classifier_tree(data, 3500)

0.23992000000000002

In [17]:
bagging_decision_trees(data, 3500, 10)

KeyboardInterrupt: ignored

# Heart

In [None]:
with open('heart-disease.names') as f:
    print(f.read())

Publication Request: 
   >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
   This file describes the contents of the heart-disease directory.

   This directory contains 4 databases concerning heart disease diagnosis.
   All attributes are numeric-valued.  The data was collected from the
   four following locations:

     1. Cleveland Clinic Foundation (cleveland.data)
     2. Hungarian Institute of Cardiology, Budapest (hungarian.data)
     3. V.A. Medical Center, Long Beach, CA (long-beach-va.data)
     4. University Hospital, Zurich, Switzerland (switzerland.data)

   Each database has the same instance format.  While the databases have 76
   raw attributes, only 14 of them are actually used.  Thus I've taken the
   liberty of making 2 copies of each database: one with all the attributes
   and 1 with the 14 attributes actually used in past experiments.

   The authors of the databases have requested:

      ...that any publications resulting from the use of th

In [None]:
data1 =  pd.read_csv('processed.hungarian.data', header=None)
data2 =  pd.read_csv('processed.va.data', header=None)
data3 =  pd.read_csv('processed.switzerland.data', header=None)
data4 =  pd.read_csv('processed.cleveland.data', header=None)

In [None]:
data = pd.concat([data1, data2, data3, data4], axis=0)

In [None]:
data.iloc[:, len(list(data))-1].value_counts()

0    411
1    265
2    109
3    107
4     28
Name: 13, dtype: int64

In [None]:
data = data.replace('?',0)

In [None]:
data = data.rename(columns={ data.columns[len(list(data))-1]: "Class" })
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,Class
0,28.0,1.0,2.0,130,132,0,2,185,0,0.0,0,0,0,0
1,29.0,1.0,2.0,120,243,0,0,160,0,0.0,0,0,0,0
2,29.0,1.0,2.0,140,0,0,0,170,0,0.0,0,0,0,0
3,30.0,0.0,1.0,170,237,0,1,170,0,0.0,0,0,6,0
4,31.0,0.0,2.0,100,219,0,1,150,0,0.0,0,0,0,0


In [None]:
single_classifier_tree(data, 920-170)

0.4110445661112642

In [None]:
bagging_decision_trees(data, 920-170)



0.426891844391979

# Breast Cancer

In [None]:
with open('breast-cancer-wisconsin.names') as f:
    print(f.read())

Citation Request:
   This breast cancer databases was obtained from the University of Wisconsin
   Hospitals, Madison from Dr. William H. Wolberg.  If you publish results
   when using this database, then please include this information in your
   acknowledgements.  Also, please cite one or more of:

   1. O. L. Mangasarian and W. H. Wolberg: "Cancer diagnosis via linear 
      programming", SIAM News, Volume 23, Number 5, September 1990, pp 1 & 18.

   2. William H. Wolberg and O.L. Mangasarian: "Multisurface method of 
      pattern separation for medical diagnosis applied to breast cytology", 
      Proceedings of the National Academy of Sciences, U.S.A., Volume 87, 
      December 1990, pp 9193-9196.

   3. O. L. Mangasarian, R. Setiono, and W.H. Wolberg: "Pattern recognition 
      via linear programming: Theory and application to medical diagnosis", 
      in: "Large-scale numerical optimization", Thomas F. Coleman and Yuying
      Li, editors, SIAM Publications, Philadelphia 199

In [None]:
col_names = ['Clump Thickness',
   'Uniformity of Cell Size',
   'Uniformity of Cell Shape',
   'Marginal Adhesion',
   'Single Epithelial Cell Size',
   'Bare Nuclei',
   'Bland Chromatin',
   'Normal Nucleoli', 
   'Mitoses',
   'Class']

In [None]:
data =  pd.read_csv('breast-cancer-wisconsin.data', names = col_names).reset_index()
data = data.drop(columns=['index'])
data = data.replace('?',0)
data.head()

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2


In [None]:
for col in col_names:
    data[col] = data[col].astype('category',copy=False)

# Ionosphere

In [None]:
with open('ionosphere.names') as f:
    print(f.read())

1. Title: Johns Hopkins University Ionosphere database

2. Source Information:
   -- Donor: Vince Sigillito (vgs@aplcen.apl.jhu.edu)
   -- Date: 1989
   -- Source: Space Physics Group
              Applied Physics Laboratory
              Johns Hopkins University
              Johns Hopkins Road
              Laurel, MD 20723 

3. Past Usage:
   -- Sigillito, V. G., Wing, S. P., Hutton, L. V., \& Baker, K. B. (1989).
      Classification of radar returns from the ionosphere using neural 
      networks. Johns Hopkins APL Technical Digest, 10, 262-266.

      They investigated using backprop and the perceptron training algorithm
      on this database.  Using the first 200 instances for training, which
      were carefully split almost 50% positive and 50% negative, they found
      that a "linear" perceptron attained 90.7%, a "non-linear" perceptron
      attained 92%, and backprop an average of over 96% accuracy on the 
      remaining 150 test instances, consisting of 123 "good" and 

In [None]:
data =  pd.read_csv('ionosphere.data', header=None).reset_index()
data = data.drop(columns=['index'])
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,34
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [None]:
data = data.rename(columns={ data.columns[len(list(data))-1]: "Class" })
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,25,26,27,28,29,30,31,32,33,Class
0,1,0,0.99539,-0.05889,0.85243,0.02306,0.83398,-0.37708,1.0,0.0376,...,-0.51171,0.41078,-0.46168,0.21266,-0.3409,0.42267,-0.54487,0.18641,-0.453,g
1,1,0,1.0,-0.18829,0.93035,-0.36156,-0.10868,-0.93597,1.0,-0.04549,...,-0.26569,-0.20468,-0.18401,-0.1904,-0.11593,-0.16626,-0.06288,-0.13738,-0.02447,b
2,1,0,1.0,-0.03365,1.0,0.00485,1.0,-0.12062,0.88965,0.01198,...,-0.4022,0.58984,-0.22145,0.431,-0.17365,0.60436,-0.2418,0.56045,-0.38238,g
3,1,0,1.0,-0.45161,1.0,1.0,0.71216,-1.0,0.0,0.0,...,0.90695,0.51613,1.0,1.0,-0.20099,0.25682,1.0,-0.32382,1.0,b
4,1,0,1.0,-0.02401,0.9414,0.06531,0.92106,-0.23255,0.77152,-0.16399,...,-0.65158,0.1329,-0.53206,0.02431,-0.62197,-0.05707,-0.59573,-0.04608,-0.65697,g


In [None]:
data.iloc[:, len(list(data))-1].value_counts()

g    225
b    126
Name: Class, dtype: int64

In [None]:
single_classifier_tree(data, 326)

0.11479999999999999

In [None]:
bagging_decision_trees(data, 326)

0.07400000000000001

# Diabetes

In [1]:
data =  pd.read_csv('diabetes.csv')
#data = data.drop(columns=['index'])
data.head()

NameError: ignored

In [None]:
data.iloc[:, len(list(data))-1].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [None]:
data = data.rename(columns={ data.columns[len(list(data))-1]: "Class" })
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [None]:
# to equalize the classes, the diabetes cases were duplicated giving a total sample size of 1036
data = data.loc[data.index.repeat(data.Class + 1)]
len(data)

1036

In [None]:
data.iloc[:, len(list(data))-1].value_counts()

1    536
0    500
Name: Class, dtype: int64

In [None]:
single_classifier_tree(data, 786)

0.2845149276926937

In [None]:
bagging_decision_trees(data, 786)

0.24099451058266644

# Glass

In [None]:
with open('glass.names') as f:
    print(f.read())

1. Title: Glass Identification Database

2. Sources:
    (a) Creator: B. German
        -- Central Research Establishment
           Home Office Forensic Science Service
           Aldermaston, Reading, Berkshire RG7 4PN
    (b) Donor: Vina Spiehler, Ph.D., DABFT
               Diagnostic Products Corporation
               (213) 776-0180 (ext 3014)
    (c) Date: September, 1987

3. Past Usage:
    -- Rule Induction in Forensic Science
       -- Ian W. Evett and Ernest J. Spiehler
       -- Central Research Establishment
          Home Office Forensic Science Service
          Aldermaston, Reading, Berkshire RG7 4PN
       -- Unknown technical note number (sorry, not listed here)
       -- General Results: nearest neighbor held its own with respect to the
             rule-based system

4. Relevant Information:n
      Vina conducted a comparison test of her rule-based system, BEAGLE, the
      nearest-neighbor algorithm, and discriminant analysis.  BEAGLE is 
      a product available 

In [None]:
data =  pd.read_csv('glass.data', header=None).reset_index()
data = data.drop(data.columns[[0,1]], axis=1)
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [None]:
data = data.rename(columns={ data.columns[len(list(data))-1]: "Class" })
data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,Class
0,1.52101,13.64,4.49,1.1,71.78,0.06,8.75,0.0,0.0,1
1,1.51761,13.89,3.6,1.36,72.73,0.48,7.83,0.0,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.0,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.0,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.0,0.0,1


In [None]:
data.iloc[:, len(list(data))-1].value_counts()

2    76
1    70
7    29
3    17
5    13
6     9
Name: Class, dtype: int64

In [None]:
single_classifier_tree(data, 214-20)



0.3385

In [None]:
bagging_decision_trees(data, 214-20)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m


0.22399999999999998

# Soybean

In [None]:
with open('soybean-large.names') as f:
    print(f.read())

1. Title: Large Soybean Database

2. Sources:
     (a) R.S. Michalski and R.L. Chilausky "Learning by Being Told and
         Learning from Examples: An Experimental Comparison of the Two
	 Methods of Knowledge Acquisition in the Context of Developing
	 an Expert System for Soybean Disease Diagnosis", International
	 Journal of Policy Analysis and Information Systems, Vol. 4,
	 No. 2, 1980.
     (b) Donor: Ming Tan & Jeff Schlimmer (Jeff.Schlimmer%cs.cmu.edu)
     (c) Date: 11 July 1988

3. Past Usage:
      1. See above.
      2. Tan, M., & Eshelman, L. (1988). Using weighted networks to represent
         classification knowledge in noisy domains.  Proceedings of the Fifth
         International Conference on Machine Learning (pp. 121-134). Ann Arbor,
         Michigan: Morgan Kaufmann.
         -- IWN recorded a 97.1% classification accuracy 
            -- 290 training and 340 test instances
      3. Fisher,D.H. & Schlimmer,J.C. (1988). Concept Simplification and
         Predictiv

In [18]:
data =  pd.read_csv('soybean-large.data', header=None).reset_index()
data = data.drop(data.columns[[0]], axis=1)
data.head() # note that the label is 0th column 

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [19]:
data = data.rename(columns={ data.columns[0]: "Class" })
data.head()

Unnamed: 0,Class,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,diaporthe-stem-canker,6,0,2,1,0,1,1,1,0,...,0,0,0,4,0,0,0,0,0,0
1,diaporthe-stem-canker,4,0,2,1,0,2,0,2,1,...,0,0,0,4,0,0,0,0,0,0
2,diaporthe-stem-canker,3,0,2,1,0,1,0,2,1,...,0,0,0,4,0,0,0,0,0,0
3,diaporthe-stem-canker,3,0,2,1,0,1,0,2,0,...,0,0,0,4,0,0,0,0,0,0
4,diaporthe-stem-canker,6,0,2,1,0,2,0,1,0,...,0,0,0,4,0,0,0,0,0,0


In [20]:
data = data.replace('?',0)

In [None]:
single_classifier_tree(data, 307-25)

In [None]:
bagging_decision_trees(data, 307-25, 100)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
