<a href="https://colab.research.google.com/github/marcelagga/MSC_Assignments/blob/main/smalldata.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [125]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from deepforest import CascadeForestClassifier
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [92]:
iris = datasets.load_iris()

In [93]:
df = pd.DataFrame()
data = pd.DataFrame(data= np.c_[iris['data'], iris['target']],
                     columns= iris['feature_names'] + ['target'])

In [145]:
def get_stratify_sample(df,col,frac):
    return data.groupby(col, group_keys=False).apply(lambda x: x.sample(frac=frac,random_state=42))


def results_clf(x,y,clf,model_type='other'):
    pred = clf.predict(x).flatten()
    if model_type == 'DNN':
      loss, accuracy = clf.evaluate(x, y, verbose=0)
      accuracy = round(accuracy,2)
    else:
      hits = sum(pred==y)
      total = len(y)
      accuracy = 100*round(hits/total,2)
    return accuracy
  
def get_results_by_size_model(data,clf,fracs,model_type=None):
  results = {}

  for frac in fracs:
    if model_type == 'DF':
      clf = CascadeForestClassifier(random_state=0,verbose=0)

    df_stratify = get_stratify_sample(data,'target',frac)
    x = df_stratify.iloc[:,:-1]
    y = df_stratify.iloc[:,-1]
    # ensure all data are floating point values
    x = x.astype('float32')
    # encode strings to integer
    y = LabelEncoder().fit_transform(y)
    x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                            test_size=0.33, 
                                                            random_state=42)

    n_features = x_train.shape[1]

    if model_type == 'DNN':
      # define model
      clf = Sequential()
      clf.add(Dense(10, activation='relu', kernel_initializer='he_normal', input_shape=(n_features,)))
      clf.add(Dense(8, activation='relu', kernel_initializer='he_normal'))
      clf.add(Dense(3, activation='softmax'))
      clf.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
      clf.fit(x_train, y_train, epochs=150, batch_size=32, verbose=0)
      # evaluate the model
      #loss, acc = model.evaluate(X_test, y_test, verbose=0)
      #print('Test Accuracy: %.3f' % acc)
    else:
      clf.fit(np.array(x_train),y_train)

    accuracy_train = results_clf(np.array(x_train),y_train,clf,model_type)
    accuracy_test = results_clf(np.array(x_test),y_test,clf,model_type)
    results[frac] = {'accuracy_train':accuracy_train,
                        'accuracy_test':accuracy_test}
  return results



In [146]:
fracs = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
get_results_by_size_model(data,None,fracs,'DNN')

{0.1: {'accuracy_test': 0.8, 'accuracy_train': 1.0},
 0.2: {'accuracy_test': 0.9, 'accuracy_train': 0.8},
 0.3: {'accuracy_test': 1.0, 'accuracy_train': 1.0},
 0.4: {'accuracy_test': 1.0, 'accuracy_train': 0.98},
 0.5: {'accuracy_test': 0.84, 'accuracy_train': 0.84},
 0.6: {'accuracy_test': 0.97, 'accuracy_train': 0.93},
 0.7: {'accuracy_test': 0.97, 'accuracy_train': 0.93},
 0.8: {'accuracy_test': 1.0, 'accuracy_train': 0.96},
 0.9: {'accuracy_test': 1.0, 'accuracy_train': 0.96},
 1: {'accuracy_test': 0.98, 'accuracy_train': 0.97}}

In [147]:
fracs = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
clf = DecisionTreeClassifier(random_state=42)
get_results_by_size_model(data,clf,fracs)

{0.1: {'accuracy_test': 80.0, 'accuracy_train': 100.0},
 0.2: {'accuracy_test': 90.0, 'accuracy_train': 100.0},
 0.3: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 0.4: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.5: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.6: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 0.7: {'accuracy_test': 94.0, 'accuracy_train': 100.0},
 0.8: {'accuracy_test': 92.0, 'accuracy_train': 100.0},
 0.9: {'accuracy_test': 89.0, 'accuracy_train': 100.0},
 1: {'accuracy_test': 96.0, 'accuracy_train': 100.0}}

In [148]:
clf = RandomForestClassifier(random_state=21)
get_results_by_size_model(data,clf,fracs)

{0.1: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.2: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.3: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 0.4: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.5: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.6: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 0.7: {'accuracy_test': 97.0, 'accuracy_train': 100.0},
 0.8: {'accuracy_test': 98.0, 'accuracy_train': 100.0},
 0.9: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 1: {'accuracy_test': 94.0, 'accuracy_train': 100.0}}

In [149]:
fracs = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1]
get_results_by_size_model(data,None,fracs,'DF')

{0.1: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.2: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.3: {'accuracy_test': 93.0, 'accuracy_train': 100.0},
 0.4: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.5: {'accuracy_test': 100.0, 'accuracy_train': 100.0},
 0.6: {'accuracy_test': 97.0, 'accuracy_train': 98.0},
 0.7: {'accuracy_test': 97.0, 'accuracy_train': 99.0},
 0.8: {'accuracy_test': 98.0, 'accuracy_train': 100.0},
 0.9: {'accuracy_test': 96.0, 'accuracy_train': 100.0},
 1: {'accuracy_test': 94.0, 'accuracy_train': 100.0}}

In [150]:
get_results_by_size_model(data,None,fracs,'DNN')

{0.1: {'accuracy_test': 0.8, 'accuracy_train': 0.9},
 0.2: {'accuracy_test': 0.6, 'accuracy_train': 0.7},
 0.3: {'accuracy_test': 0.47, 'accuracy_train': 0.6},
 0.4: {'accuracy_test': 0.2, 'accuracy_train': 0.45},
 0.5: {'accuracy_test': 0.96, 'accuracy_train': 0.96},
 0.6: {'accuracy_test': 0.67, 'accuracy_train': 0.67},
 0.7: {'accuracy_test': 1.0, 'accuracy_train': 0.94},
 0.8: {'accuracy_test': 0.98, 'accuracy_train': 0.96},
 0.9: {'accuracy_test': 0.96, 'accuracy_train': 0.96},
 1: {'accuracy_test': 0.98, 'accuracy_train': 0.97}}

In [45]:
data

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0
