# Load Packages

In [44]:
# scikit-learn, scikit-image are needed
# If those packages are missing, install them with the following command
# pip install scikit-learn scikit-image

import os

import sklearn.datasets
import sklearn.linear_model
import sklearn.svm
import sklearn.tree
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics

import skimage.io
import skimage.transform
import skimage.color

import numpy as np

import matplotlib.pyplot as plt 
%matplotlib inline

# Load Additional Packages (if you want to use other modules in Scikit Learn)

In [45]:
# Load additional scikit-learn packages here if you need them
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,accuracy_score, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Load Data Points (Do not modify the following block)

In [None]:
# Read every training image, resize it to image_size x image_size, convert it
# to grayscale, and flatten the result into one feature row per image.
image_size = 64  # side length, in pixels, that every image is resized to
labels = ['glioma_tumor','meningioma_tumor','no_tumor','pituitary_tumor']

images = []  # one resized grayscale image per file
y = []       # label (folder name) of each image, in the same order as `images`
for i in labels:
    # Each label has its own sub-folder under ./tumor_dataset/Training.
    folderPath = os.path.join('./tumor_dataset/Training',i)
    # NOTE(review): os.listdir returns entries in arbitrary order, so the row
    # order of X (and hence any seeded train/test split made from it) may
    # differ between machines -- confirm whether that matters here.
    for j in os.listdir(folderPath):
        img = skimage.io.imread(os.path.join(folderPath,j),)
        img = skimage.transform.resize(img,(image_size,image_size))
        # NOTE(review): presumably every file has colour channels; verify that
        # rgb2gray accepts all images in the dataset.
        img = skimage.color.rgb2gray(img)
        images.append(img)
        y.append(i)
        
images = np.array(images)

# Flatten each image into a vector of image_size**2 values (one row per image).
X = images.reshape((-1, image_size**2))
y = np.array(y)

In [None]:
# Show one example image per label: scan the samples in order and display the
# first sample that matches each label in turn, stopping once every label in
# `labels` has been shown.
j = 0  # index into `labels` of the label whose example is still to be shown
for i in range(len(y)):
    # Exact equality, not `in`: on two strings `a in b` is a substring test,
    # which would also match a label that is merely contained in another one.
    if y[i] == labels[j]:
        plt.imshow(images[i])
        plt.title("[Index:{}] Label:{}".format(i, y[i]))
        plt.show()
        j += 1
    if j >= len(labels):
        break

In [None]:
# Hold out 30% of the samples for testing. The split is stratified on the
# labels and uses a fixed seed so that it is repeatable for a given X and y.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Classification with Scikit Learn Library (Programming Assignment)
### Variable Explanation (Do not change variable names)
- 'X_train' holds the feature vectors of the training dataset
- 'y_train' holds the target labels of the training dataset
- 'X_test' holds the feature vectors of the test dataset
- 'y_test' holds the target labels of the test dataset
- 'y_pred' is initialized as a zero vector; fill 'y_pred' with the predicted labels

### Find the best model and hyperparameter for tumor classification
- Find the best random seed as well and fix it to reproduce your result on other computers.

In [None]:
#TODO
#1. Create a classification object in scikit learn package (such as perceptron, logistic regression, or other classification algorithms)
#2. Fit the object to training dataset
#3. Predict the label of test data point (X_test)
# - Do not change the variable name "y_pred"

# ---------------------------------------------------------------------------
# Optional exploration: random-forest accuracy as a function of forest size.
# Disabled by default because it fits 100 forests (1, 11, ..., 991 trees),
# which makes a full re-run of the notebook very slow.
# The forests are seeded so the printed numbers are repeatable, and they are
# bound to `rf_model` so they never shadow the final classifier.
# The test-set score printed here is for inspection only: picking
# hyper-parameters from it would make the final reported accuracy optimistic.
# ---------------------------------------------------------------------------
RUN_RF_SWEEP = False
RF_SWEEP_SEED = 42

if RUN_RF_SWEEP:
    for n_trees in range(1, 1001, 10):
        rf_model = RandomForestClassifier(
            n_estimators=n_trees,
            max_features='sqrt',
            min_samples_leaf=5,
            min_samples_split=7,
            random_state=RF_SWEEP_SEED,
        )
        rf_model.fit(X_train, y_train)
        print(n_trees,"번째훈련 세트 정확도: {:.3f}".format(rf_model.score(X_train, y_train)) )
        print("테스트 세트 정확도: {:.3f}".format(rf_model.score(X_test, y_test)) )

# ---------------------------------------------------------------------------
# Final model: a hard-voting ensemble of one 1-nearest-neighbour classifier,
# four RBF-kernel SVMs (differing only in gamma) and five 3-tree extra-trees
# forests (differing only in their seed).
# ---------------------------------------------------------------------------
SVC_GAMMAS = (1.2, 1.4, 1.0, 1.8)
XTC_SEEDS = (1898, 7567, 9244, 5147, 6161)

estimators = [('KNN1', KNeighborsClassifier(n_neighbors=1))]
# Only C and gamma are passed: `coef0` and `degree` are ignored by the RBF
# kernel, and probability estimates are not used by hard voting (enabling them
# only adds an internal cross-validation to every fit).
estimators += [
    ('SVC{}'.format(k), SVC(kernel='rbf', C=10, gamma=gamma))
    for k, gamma in enumerate(SVC_GAMMAS, start=1)
]
# Every forest has a fixed seed, so the whole ensemble is reproducible.
estimators += [
    ('XTC{}'.format(k),
     ExtraTreesClassifier(n_estimators=3, criterion='entropy', random_state=seed))
    for k, seed in enumerate(XTC_SEEDS, start=1)
]

vote = VotingClassifier(estimators=estimators, voting='hard')
vote.fit(X_train, y_train)

# `model` is kept as an alias of the final classifier so that any later cell
# referring to `model` uses the fitted ensemble.
model = vote

y_pred = vote.predict(X_test)



In [None]:
# Predict with the fitted voting ensemble (the final model) rather than with
# whatever estimator the generic name `model` happens to be bound to.
y_pred = vote.predict(X_test)

### Print accuracy (do not modify the following block)

In [None]:
# Print the accuracy score of y_pred against y_test, formatted to two decimals.
print('Accuracy: %.2f' % sklearn.metrics.accuracy_score(y_test, y_pred))