<a href="https://colab.research.google.com/github/nisanuro/CNG562-Assignment-2/blob/naive/CNG562_Assignment2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import matplotlib.pyplot as plt  
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, cross_val_score
from sklearn import metrics, datasets, preprocessing
%matplotlib inline
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB, ComplementNB, MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [0]:
def randomOneHoldout(X_train, Y_train):
  """Split the given samples once into a fitting part and a held-out part.

  Delegates to ``train_test_split`` with 20% of the samples held out and a
  fixed ``random_state`` of 0, so the same inputs give the same partition.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test)``.
  """
  split_options = {"test_size": 0.2, "random_state": 0}
  fit_x, held_x, fit_y, held_y = train_test_split(X_train, Y_train, **split_options)
  return fit_x, held_x, fit_y, held_y

In [0]:
def stratifiedOneHoldout(X_train, Y_train):
  """Split the given samples once, stratifying the split on the labels.

  Delegates to ``train_test_split`` with 20% of the samples held out, a
  fixed ``random_state`` of 0, and ``stratify=Y_train``.

  Returns:
    Tuple ``(x_train, x_test, y_train, y_test)``.
  """
  split_options = {"test_size": 0.2, "random_state": 0, "stratify": Y_train}
  fit_x, held_x, fit_y, held_y = train_test_split(X_train, Y_train, **split_options)
  return fit_x, held_x, fit_y, held_y

In [0]:
def NaiveBayes(X, Y):
    """Print accuracy of five Naive Bayes variants under four validation schemes.

    The data is first split 80/20 (``random_state=0``); only the training
    part is used below. On that part each model is scored with 5-fold and
    10-fold cross-validation, then with a random and a stratified single
    holdout. Every accuracy is printed as a percentage.

    Args:
        X: Feature matrix.
        Y: Class labels aligned with ``X``.
    """
    X_train, _, Y_train, _ = train_test_split(X, Y, test_size=0.2, random_state=0)

    classifiers = [GaussianNB(), CategoricalNB(), BernoulliNB(), ComplementNB(), MultinomialNB()]

    def report(model, accuracy):
        # The text before the first 'N' of the estimator's string form is the
        # variant name (e.g. "GaussianNB()" -> "Gaussian").
        print(str(model).partition('N')[0] + " Naive Bayes Accuracy: ", accuracy*100)

    # k-fold cross-validation
    for heading, folds in (("\n5-Fold", 5), ("\n10-Fold", 10)):
        print(heading)
        for model in classifiers:
            fold_scores = cross_val_score(model, X_train, Y_train, cv=folds, scoring='accuracy')
            report(model, fold_scores.mean())

    # single holdout, random then stratified
    holdout_schemes = (
        ("\nRandom One Holdout", randomOneHoldout),
        ("\nStratified One Holdout", stratifiedOneHoldout),
    )
    for heading, make_split in holdout_schemes:
        fit_x, held_x, fit_y, held_y = make_split(X_train, Y_train)
        print(heading)
        for model in classifiers:
            model.fit(fit_x, fit_y)
            predicted = model.predict(held_x)
            report(model, metrics.accuracy_score(held_y, predicted))


In [0]:
def DecisionTree(X, Y):
    """Run the decision-tree tuning sequence and print a final test accuracy.

    The data is split 80/20 (``random_state=0``). On the training part the
    function prints, in order: a depth sweep under each validation scheme,
    a criterion/splitter/min_samples_split sweep, and a class-weight
    comparison. Finally a depth-5 tree is fitted on the training part and
    its accuracy on the held-out part is printed as a percentage.

    Args:
        X: Feature matrix.
        Y: Class labels aligned with ``X``.
    """
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0, test_size=0.2)

    # Depth sweep: the position of each heading is the scheme code passed to
    # tuningDepth (0 = 5-fold, 1 = 10-fold, 2 = random holdout, 3 = stratified holdout).
    scheme_headings = (
        "\n5-Fold: ",
        "\n10-Fold: ",
        "\nRandom One Holdout: ",
        "\nStratified One Holdout: ",
    )
    for scheme_code, heading in enumerate(scheme_headings):
        print(heading)
        tuningDepth(X_train, Y_train, scheme_code)

    # Continue with 5-fold validation and depth = 5.
    print("5-Fold, Depth=5\n")
    tuningSplit(X_train, Y_train)

    # Continue with criterion = 'gini', splitter = 'best', min_samples_split = 2
    # (all of them are the default values).
    print("5-Fold, depth = 5, criterion = 'gini', splitter = 'best, min_samples_split = 2\n")
    tuningClassWeight(X_train, Y_train)

    # Continue with class_weight = None (the default): final model and its
    # accuracy on the held-out part of the initial split.
    final_tree = DecisionTreeClassifier(max_depth=5, random_state=0)
    final_tree.fit(X_train, Y_train)
    held_out_accuracy = metrics.accuracy_score(Y_test, final_tree.predict(X_test))
    print("Accuracy: ", held_out_accuracy*100)




In [0]:
def tuningClassWeight(X_train, Y_train):
    """Compare a depth-5 decision tree with and without balanced class weights.

    For each variant the mean 5-fold cross-validated accuracy on the given
    data is printed as a percentage.

    Args:
        X_train: Feature matrix used for cross-validation.
        Y_train: Class labels aligned with ``X_train``.
    """
    # (printed label, class_weight value) for each variant, in report order.
    variants = (
        ("Class weight: None           Accuracy: ", None),
        ("Class weight: Balanced       Accuracy: ", 'balanced'),
    )

    for label, class_weight in variants:
        clf = DecisionTreeClassifier(max_depth=5, random_state=0, class_weight=class_weight)
        # No separate clf.fit() here: cross_val_score fits its own clones of
        # the estimator on each fold, so a prior fit would be discarded.
        cv = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
        print(label, cv.mean()*100)

In [0]:
def tuningSplit(X_train, Y_train):
    """Sweep split-related hyperparameters of a depth-5 decision tree.

    Two sweeps are printed, each reporting mean 5-fold cross-validated
    accuracy as a percentage:

    1. every combination of ``criterion`` in (gini, entropy) and
       ``splitter`` in (best, random);
    2. ``min_samples_split`` from 2 to 9 inclusive.

    Args:
        X_train: Feature matrix used for cross-validation.
        Y_train: Class labels aligned with ``X_train``.
    """
    def mean_cv_accuracy(**tree_params):
        # cross_val_score fits its own clones of the estimator on each fold,
        # so the tree is not fitted here beforehand.
        tree = DecisionTreeClassifier(max_depth=5, random_state=0, **tree_params)
        fold_scores = cross_val_score(tree, X_train, Y_train, cv=5, scoring='accuracy')
        return fold_scores.mean()*100

    for criterion in ("gini", "entropy"):
        for splitter in ("best", "random"):
            accuracy = mean_cv_accuracy(criterion=criterion, splitter=splitter)
            print("Criterion: ", criterion, "   Splitter: ", splitter, "   Accuracy: ", accuracy)

    for min_split in range(2, 10):
        accuracy = mean_cv_accuracy(min_samples_split=min_split)
        print("min_samples_split: ", min_split, "   Accuracy: ", accuracy)

In [0]:
def tuningDepth(X_train, Y_train, val):
    """Print decision-tree accuracy for max_depth 1..9 under one validation scheme.

    Args:
        X_train: Feature matrix.
        Y_train: Class labels aligned with ``X_train``.
        val: Validation scheme code:
            0 - 5-fold cross-validation,
            1 - 10-fold cross-validation,
            2 - random single holdout (``randomOneHoldout``),
            3 - stratified single holdout (``stratifiedOneHoldout``).
            Any other value prints "Invalid validation tech." once and
            returns without evaluating anything.

    Accuracies are printed as percentages, one line per depth.
    """
    if val not in (0, 1, 2, 3):
        # Reject an unknown scheme up front so the message is printed once,
        # not once per depth.
        print("Invalid validation tech.")
        return

    depths = range(1, 10)

    if val in (0, 1):
        folds = 5 if val == 0 else 10
        for depth in depths:
            clf = DecisionTreeClassifier(max_depth=depth, random_state=0)
            # cross_val_score fits its own clones of the estimator on each
            # fold, so the tree is not fitted here beforehand.
            cv = cross_val_score(clf, X_train, Y_train, cv=folds, scoring='accuracy')
            print("Depth: ", depth, " Accuracy: ", cv.mean()*100)
        return

    # Holdout schemes: the split does not depend on the depth, so it is
    # computed once and shared by every tree in the sweep.
    make_split = randomOneHoldout if val == 2 else stratifiedOneHoldout
    x_train, x_test, y_train, y_test = make_split(X_train, Y_train)

    for depth in depths:
        clf = DecisionTreeClassifier(max_depth=depth, random_state=0)
        clf.fit(x_train, y_train)
        score = clf.score(x_test, y_test)
        print("Depth: ", depth, " Accuracy: ", score*100)




In [0]:
def displayAccuracy(X, Y):
    """Print the accuracy report for the given features and labels.

    Only the decision-tree evaluation is currently run; the Naive Bayes
    evaluation is left commented out.

    Args:
        X: Feature matrix.
        Y: Class labels aligned with ``X``.
    """
    # Disabled for now:
    # NaiveBayes(X, Y)

    DecisionTree(X, Y)

In [511]:
if __name__ == '__main__':

  # Load the iris dataset shipped with scikit-learn.
  iris = datasets.load_iris()
  X, Y = iris.data, iris.target

  # Two preprocessed variants of the features. They are only referenced by
  # the disabled runs further down.
  mean_removal = preprocessing.scale(X)                # mean removal
  l1_norm = preprocessing.normalize(X, norm="l1")      # L1 normalization

  # Print the accuracy report for the raw (unpreprocessed) features.
  print("\nRaw: ")
  displayAccuracy(X, Y)

  # The runs on the preprocessed features are parked in a string literal,
  # which is evaluated and discarded without executing its contents.
  '''
  print("\nL1 Normalization: ")
  displayAccuracy(l1_norm,Y)
  print("\nMean Removal: ")
  displayAccuracy(mean_removal,Y)'''

  # Tabular view of the dataset: one column per feature plus the class label.
  df = pd.DataFrame(X, columns=iris.feature_names)
  df['target'] = Y


Raw: 

5-Fold: 
Depth:  1  Accuracy:  69.16666666666667
Depth:  2  Accuracy:  94.16666666666667
Depth:  3  Accuracy:  93.33333333333333
Depth:  4  Accuracy:  93.33333333333333
Depth:  5  Accuracy:  94.16666666666667
Depth:  6  Accuracy:  94.16666666666667
Depth:  7  Accuracy:  94.16666666666667
Depth:  8  Accuracy:  94.16666666666667
Depth:  9  Accuracy:  94.16666666666667

10-Fold: 
Depth:  1  Accuracy:  69.16666666666667
Depth:  2  Accuracy:  93.33333333333333
Depth:  3  Accuracy:  92.5
Depth:  4  Accuracy:  92.5
Depth:  5  Accuracy:  93.33333333333333
Depth:  6  Accuracy:  93.33333333333333
Depth:  7  Accuracy:  93.33333333333333
Depth:  8  Accuracy:  93.33333333333333
Depth:  9  Accuracy:  93.33333333333333

Random One Holdout: 
Depth:  1  Accuracy:  70.83333333333334
Depth:  2  Accuracy:  87.5
Depth:  3  Accuracy:  91.66666666666666
Depth:  4  Accuracy:  91.66666666666666
Depth:  5  Accuracy:  91.66666666666666
Depth:  6  Accuracy:  91.66666666666666
Depth:  7  Accuracy:  91.6666

In [512]:
# Show the iris DataFrame assembled in the previous cell.
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
