## This file is for analysis wavelengths

In [1]:
import numpy as np
import os
from time import time
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
#from sklearn.datasets import make_moons, make_circles, make_classification
#from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from sklearn.metrics import roc_auc_score

### Try to find best bands for classify Healthy and disease

In [302]:
label_to_name = {0:'Healthy', 1: 'Disease'}
name_to_label = {'Healthy':0, 'Disease':1}

In [303]:
full_data = pd.read_excel("full_data.xlsx")
full_data.head()

Unnamed: 0,label,394.6,396.7,398.7,400.8,402.8,404.9,406.9,409,411,...,866.7,868.8,870.9,872.9,875,877,879.1,881.1,883.2,885.2
0,0,0.04827,0.04468,0.04008,0.03521,0.0303,0.02578,0.02212,0.01985,0.01826,...,0.3162,0.3168,0.3175,0.318,0.3184,0.3187,0.319,0.3194,0.3195,0.3194
1,0,0.05322,0.04898,0.04387,0.03805,0.03224,0.02683,0.02242,0.0195,0.01754,...,0.3379,0.3388,0.3393,0.3402,0.3408,0.3412,0.3415,0.3417,0.3419,0.3416
2,0,0.0471,0.04375,0.03963,0.03504,0.03022,0.0259,0.02229,0.01998,0.01833,...,0.3627,0.3634,0.3638,0.3643,0.3648,0.3651,0.3651,0.3649,0.3651,0.3648
3,0,0.04965,0.04648,0.0423,0.03775,0.03321,0.0289,0.0252,0.02265,0.0204,...,0.3373,0.3383,0.339,0.3399,0.3406,0.3412,0.3419,0.3423,0.3429,0.3432
4,0,0.04562,0.04221,0.03784,0.03332,0.02895,0.02493,0.02176,0.02012,0.01829,...,0.2413,0.242,0.2431,0.2438,0.2444,0.2452,0.2458,0.2463,0.2465,0.2471


#### Create X and label for training, test

Xarray-like of shape (n_samples, n_features)
Training vector, where n_samples is the number of samples and n_features is the number of features.

yarray-like of shape (n_samples, n_output) or (n_samples,), default=None

In [347]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
label

array([0, 0, 0, ..., 1, 1, 1])

In [348]:
cols = full_data.columns.tolist()
cols = cols[1:]

In [349]:
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x

array([[0.01768],
       [0.0179 ],
       [0.01819],
       ...,
       [0.02279],
       [0.02006],
       [0.02306]])

In [350]:
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.3, random_state=42)

In [351]:
y_train.shape

(2992,)

In [352]:
# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'kernel':['rbf','sigmod','linear'],'C': [1e2,5e2,1e3, 5e3, 1e4],
              'gamma': [0.001, 0.005, 0.01,0.05,0.1,0.2]}
print("Set Grid parameters")
clf = GridSearchCV(
    SVC(class_weight='balanced',probability=True), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit




done in 331.124s
Best estimator found by grid search:
SVC(C=500.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.05, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


In [353]:
clf.classes_

array([0, 1])

In [354]:
clf.score(x_test,y_test)

0.7287607170693686

In [344]:
y_pred = clf.predict_proba(x_test)
y_pred.shape

(1283, 2)

In [355]:
y_pred

array([[0.75      , 0.25      ],
       [0.90728477, 0.09271523],
       [0.83333333, 0.16666667],
       ...,
       [0.83870968, 0.16129032],
       [0.90728477, 0.09271523],
       [1.        , 0.        ]])

In [359]:
y_pred[:10,:], y_pred[-10:-1,:]

(array([[0.75      , 0.25      ],
        [0.90728477, 0.09271523],
        [0.83333333, 0.16666667],
        [1.        , 0.        ],
        [0.78947368, 0.21052632],
        [1.        , 0.        ],
        [0.82954545, 0.17045455],
        [1.        , 0.        ],
        [0.78070175, 0.21929825],
        [0.78070175, 0.21929825]]), array([[1.        , 0.        ],
        [0.90728477, 0.09271523],
        [1.        , 0.        ],
        [0.        , 1.        ],
        [1.        , 0.        ],
        [1.        , 0.        ],
        [1.        , 0.        ],
        [0.83870968, 0.16129032],
        [0.90728477, 0.09271523]]))

In [356]:
roc_auc_score(y_test,y_pred[:,1]), roc_auc_score(y_test,y_pred[:,0])

(0.7730597800293019, 0.22694021997069808)

### Decision Tree

In [360]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,1)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.3, random_state=42)

In [361]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    DecisionTreeClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit
done in 1.387s
Best estimator found by grid search:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=19,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [362]:
score = clf.score(x_test, y_test)
score

0.8620420888542478

In [363]:
y_pred = clf.predict_proba(x_test)

In [365]:
y_pred[:10,:], y_pred[-10:-1,:]

(array([[0.75      , 0.25      ],
        [0.90728477, 0.09271523],
        [0.83333333, 0.16666667],
        [1.        , 0.        ],
        [0.78947368, 0.21052632],
        [1.        , 0.        ],
        [0.82954545, 0.17045455],
        [1.        , 0.        ],
        [0.78070175, 0.21929825],
        [0.78070175, 0.21929825]]), array([[1.        , 0.        ],
        [0.90728477, 0.09271523],
        [1.        , 0.        ],
        [0.        , 1.        ],
        [1.        , 0.        ],
        [1.        , 0.        ],
        [1.        , 0.        ],
        [0.83870968, 0.16129032],
        [0.90728477, 0.09271523]]))

In [391]:
y_pred[(y_pred[:,1] == 1)].size

234

In [393]:
y_test.sum()

294

In [366]:
roc_auc_score(y_test,y_pred[:,1]), roc_auc_score(y_test,y_pred[:,0])

(0.7730597800293019, 0.22694021997069808)

#### Random Forests

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
# To train each wavelength and get scores
result_scores = []
for col in cols:
    # Xarray-like of shape (n_samples, n_features) Training vector, 
    # where n_samples is the number of samples and n_features is the number of features.
    x = full_data[col].tolist()
    x = np.asarray(x).reshape(-1,1)

    x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=100, max_depth=19)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)

    result_score = dict()
    result_score['WaveLength'] = col
    result_score['score']= score
    result_scores.append(result_score)

fd_result_score = pd.DataFrame(result_scores)

In [None]:
fd_result_score.to_excel('full_data_random_forest_score.xlsx')

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_estimators':[50,100,150],'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
score = clf.score(x_test, y_test)
score

#### Gaussian process classification (GPC) based on Laplace approximation

#### this method is very slow 

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
kernel = 1.0 * RBF(1.0)
clf = GaussianProcessClassifier(kernel=kernel, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### AdaBoostClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = AdaBoostClassifier(n_estimators=100, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### KNeighborsClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Gaussian Naive Bayes (GaussianNB)

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = GaussianNB()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Quadratic Discriminant Analysis

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

### Classify data with label from 0-7

In [None]:
full_data = pd.read_excel("full_data_with_class.xlsx")

In [None]:
full_data.tail()

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_estimators':[50,100,150],'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test, y_test)

In [None]:
result_scores = []
for col in cols:
    # Xarray-like of shape (n_samples, n_features) Training vector, 
    # where n_samples is the number of samples and n_features is the number of features.
    x = full_data[col].tolist()
    x = np.asarray(x).reshape(-1,1)

    x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

    clf = RandomForestClassifier(n_estimators=150,criterion='entropy', max_depth=19)
    clf.fit(x_train, y_train)
    score = clf.score(x_test, y_test)

    result_score = dict()
    result_score['WaveLength'] = col
    result_score['score']= score
    result_scores.append(result_score)

fd_result_score = pd.DataFrame(result_scores)

In [None]:
fd_result_score.to_excel('full_data_with_class_random_forest_score.xlsx')

## Classify only disease data

In [None]:
full_data = pd.read_excel("full_disease_data.xlsx")
full_data.tail()

### Random Forest

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = RandomForestClassifier()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_estimators':[50,100,150],'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test, y_test)

### The result of only disease data isn't good, so try to other algorithm

### SVM

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)

cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)

In [None]:
# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e2,5e2,1e3, 5e3, 1e4],
              'gamma': [0.001, 0.005, 0.01,0.05,0.1,0.2]}
print("Set Grid parameters")
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), param_grid, cv=5)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test,y_test)

### decision Tree

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,1)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    DecisionTreeClassifier(), param_grid, cv=5)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test,y_test)

#### Gaussian process classification (GPC) based on Laplace approximation

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
kernel = 1.0 * RBF(1.0)
clf = GaussianProcessClassifier(kernel=kernel, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### AdaBoostClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = AdaBoostClassifier(n_estimators=100, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### KNeighborsClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Gaussian Naive Bayes (GaussianNB)

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = GaussianNB()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Quadratic Discriminant Analysis

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=42)

In [None]:
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

## Try for single disease file

In [None]:
full_data = pd.read_excel("sc16_29_new.xlsx")
full_data.tail()

In [None]:
health_rows = full_data[full_data['label'] == 0]
disease_rows = full_data[full_data['label'] != 0]
length = disease_rows['label'].count()
selected_rows = health_rows.sample(length)
new_pd = selected_rows.append(disease_rows, ignore_index=True)

### Random Forest

In [None]:
label = new_pd['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = new_pd.columns.tolist()
cols = cols[1:]
x = new_pd[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.3, random_state=99)

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_estimators':[50,100,150],'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test, y_test)

### SVM

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=46)

In [None]:
# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'C': [1e2,5e2,1e3, 5e3, 1e4],
              'gamma': [0.001, 0.005, 0.01,0.05,0.1,0.2]}
print("Set Grid parameters")
clf = GridSearchCV(
    SVC(kernel='rbf', class_weight='balanced'), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test,y_test)

### decision Tree

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,1)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=88)

In [None]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    DecisionTreeClassifier(), param_grid, cv=5)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

In [None]:
clf.score(x_test,y_test)

#### Gaussian process classification (GPC) based on Laplace approximation

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=55)

In [None]:
kernel = 1.0 * RBF(1.0)
clf = GaussianProcessClassifier(kernel=kernel, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### AdaBoostClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=66)

In [None]:
clf = AdaBoostClassifier(n_estimators=100, random_state=46)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### KNeighborsClassifier

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=66)

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Gaussian Naive Bayes (GaussianNB)

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=33)

In [None]:
clf = GaussianNB()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

#### Quadratic Discriminant Analysis

In [None]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)
cols = full_data.columns.tolist()
cols = cols[1:]
x = full_data[cols[10]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state=11)

In [None]:
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [None]:
y_test

#### Note, seems the score are good enough.The reason is maybe each file only has two class: 0 and other value.
Another thing is when I split train and test dataset, the value of <b>random_state</b> changed, the final score will changed. 
The reason is maybe dataset isn't large enough.

#### Use Ada boost for each file 

In [None]:
file_list = ['sc1_new.xlsx', 'sc2_4_new.xlsx', 'sc5_7_new.xlsx', 'sc8_15_new.xlsx','sc16_29_new.xlsx','sc30_49_new.xlsx','sc50_70_new.xlsx']
#file_list = ['sc50_70_new.xlsx']

In [None]:
for file in file_list:
    full_data = pd.read_excel(file)
    health_rows = full_data[full_data['label'] == 0]
    disease_rows = full_data[full_data['label'] != 0]
    length = disease_rows['label'].count()
    selected_rows = health_rows.sample(length)
    new_pd = selected_rows.append(disease_rows, ignore_index=True)
    label = new_pd['label'].tolist()
    label = np.asarray(label).reshape(-1,)
    
    cols = new_pd.columns.tolist()
    cols = cols[1:]
    
    # Process each wavelenth
    result_scores = []
    for col in cols:
        x = full_data[col].tolist()
        x = np.asarray(x).reshape(-1,1)
        x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.3, random_state=66)
        
        #t0 = time()
        param_grid = {'C': [1e2,5e2,1e3, 5e3, 1e4],
                      'gamma': [0.001, 0.005, 0.01,0.05,0.1,0.2]}
        #print("Set Grid parameters")
        clf = GridSearchCV(
            SVC(kernel='rbf', class_weight='balanced'), param_grid, cv=10)
        #print("Start to fit")
        clf = clf.fit(x_train, y_train)
        #print(clf.best_estimator_)
        
        #clf = AdaBoostClassifier(n_estimators=100, random_state=46)
        #clf.fit(x_train, y_train)
        score = clf.score(x_test, y_test)
        
        result_score = dict()
        result_score['WaveLength'] = col
        result_score['score']= score
        result_score['best_parms'] = clf.best_estimator_
        result_scores.append(result_score)
        
    new_file = file.split('.')[0] + '_svm_result.xlsx'
    result_pd = pd.DataFrame(result_scores)
    result_pd.to_excel(new_file)

### Process SC 50-70 new data

In [10]:
file = "More H VS. SC 50-70.xlsx"
full_data = pd.read_excel(file)
full_data.head()

Unnamed: 0,label,394.6,396.7,398.7,400.8,402.8,404.9,406.9,409,411,...,866.7,868.8,870.9,872.9,875,877,879.1,881.1,883.2,885.2
0,0,0.04827,0.04468,0.04008,0.03521,0.0303,0.02578,0.02212,0.01985,0.01826,...,0.3162,0.3168,0.3175,0.318,0.3184,0.3187,0.319,0.3194,0.3195,0.3194
1,0,0.05322,0.04898,0.04387,0.03805,0.03224,0.02683,0.02242,0.0195,0.01754,...,0.3379,0.3388,0.3393,0.3402,0.3408,0.3412,0.3415,0.3417,0.3419,0.3416
2,0,0.0471,0.04375,0.03963,0.03504,0.03022,0.0259,0.02229,0.01998,0.01833,...,0.3627,0.3634,0.3638,0.3643,0.3648,0.3651,0.3651,0.3649,0.3651,0.3648
3,0,0.04965,0.04648,0.0423,0.03775,0.03321,0.0289,0.0252,0.02265,0.0204,...,0.3373,0.3383,0.339,0.3399,0.3406,0.3412,0.3419,0.3423,0.3429,0.3432
4,0,0.04562,0.04221,0.03784,0.03332,0.02895,0.02493,0.02176,0.02012,0.01829,...,0.2413,0.242,0.2431,0.2438,0.2444,0.2452,0.2458,0.2463,0.2465,0.2471


In [11]:
label = full_data['label'].tolist()
label = np.asarray(label).reshape(-1,)

In [12]:
cols = full_data.columns.tolist()
cols = cols[1:]

In [13]:
x = full_data.iloc[:,1:]
x = x[cols[100]].tolist()
x = np.asarray(x).reshape(-1,1)
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.3, random_state=42)

### SVM

In [11]:
# #############################################################################
# Train a SVM classification model

print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'kernel':['rbf','sigmoid'],'C': [1e2,5e2,1e3, 5e3, 1e4],
              'gamma': [0.001, 0.005, 0.01,0.05,0.1,0.2]}
print("Set Grid parameters")
clf = GridSearchCV(
    SVC(class_weight='balanced',probability=True), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit
done in 15.876s
Best estimator found by grid search:
SVC(C=1000.0, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.2, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)




In [12]:
clf.score(x_test,y_test)

0.6972477064220184

In [13]:
health_rows = full_data[full_data['label'] == 0]
disease_rows = full_data[full_data['label'] != 0]
length = disease_rows['label'].count()
selected_rows = health_rows.sample(length)
new_pd = selected_rows.append(disease_rows, ignore_index=True)

In [15]:
length

338

### Random forest

In [6]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_estimators':[50,100,150],'max_depth': range(1,20), 'criterion':['gini','entropy']}
print("Set Grid parameters")
clf = GridSearchCV(
    RandomForestClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit
done in 110.443s
Best estimator found by grid search:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)




In [7]:
clf.score(x_test,y_test)

0.7201834862385321

### Gaussian Naive Bayes (GaussianNB)

In [14]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {}
print("Set Grid parameters")
clf = GridSearchCV(
    GaussianNB(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit
done in 0.018s
Best estimator found by grid search:
GaussianNB(priors=None, var_smoothing=1e-09)


In [15]:
clf.score(x_test,y_test)

0.6788990825688074

### KNeighborsClassifier

In [None]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train, y_train)
clf.score(x_test, y_test)

In [16]:
print("Fitting the classifier to the training set")
t0 = time()
param_grid = {'n_neighbors':range(1,20)}
print("Set Grid parameters")
clf = GridSearchCV(
    KNeighborsClassifier(), param_grid, cv=10)
print("Start to fit")
clf = clf.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)

Fitting the classifier to the training set
Set Grid parameters
Start to fit
done in 0.600s
Best estimator found by grid search:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=19, p=2,
           weights='uniform')




In [17]:
clf.score(x_test,y_test)

0.7110091743119266

In [22]:
_tmp = full_data.drop(['label'],axis=1)
_tmp.head()

Unnamed: 0,394.6,396.7,398.7,400.8,402.8,404.9,406.9,409.0,411.0,413.1,...,866.7,868.8,870.9,872.9,875.0,877.0,879.1,881.1,883.2,885.2
0,0.04827,0.04468,0.04008,0.03521,0.0303,0.02578,0.02212,0.01985,0.01826,0.01732,...,0.3162,0.3168,0.3175,0.318,0.3184,0.3187,0.319,0.3194,0.3195,0.3194
1,0.05322,0.04898,0.04387,0.03805,0.03224,0.02683,0.02242,0.0195,0.01754,0.01761,...,0.3379,0.3388,0.3393,0.3402,0.3408,0.3412,0.3415,0.3417,0.3419,0.3416
2,0.0471,0.04375,0.03963,0.03504,0.03022,0.0259,0.02229,0.01998,0.01833,0.0179,...,0.3627,0.3634,0.3638,0.3643,0.3648,0.3651,0.3651,0.3649,0.3651,0.3648
3,0.04965,0.04648,0.0423,0.03775,0.03321,0.0289,0.0252,0.02265,0.0204,0.02007,...,0.3373,0.3383,0.339,0.3399,0.3406,0.3412,0.3419,0.3423,0.3429,0.3432
4,0.04562,0.04221,0.03784,0.03332,0.02895,0.02493,0.02176,0.02012,0.01829,0.01795,...,0.2413,0.242,0.2431,0.2438,0.2444,0.2452,0.2458,0.2463,0.2465,0.2471


In [47]:
corr = _tmp.corr()
corr.iloc[110:130,120]

620.4    0.995917
622.5    0.996764
624.5    0.997391
626.6    0.997862
628.6    0.998318
630.7    0.998706
632.7    0.999060
634.8    0.999387
636.8    0.999680
638.9    0.999905
640.9    1.000000
643      0.999887
645      0.999571
647.1    0.999045
649.1    0.998381
651.2    0.997638
653.3    0.996808
655.3    0.995870
657.4    0.994779
659.4    0.993491
Name: 640.9, dtype: float64

In [33]:
from sklearn.cluster import KMeans

row = corr.iloc[1,:].tolist()
row = np.asarray(row).reshape(-1,1)
kmeans = KMeans(n_clusters=20, random_state=0).fit(row)
label_1 = kmeans.labels_

In [34]:
row = corr.iloc[10,:].tolist()
row = np.asarray(row).reshape(-1,1)
kmeans = KMeans(n_clusters=20, random_state=0).fit(row)
label_10 = kmeans.labels_

In [40]:
clusters_1 = dict()
for index, band in zip(label_1, cols):
    if index in clusters_1:
        clusters_1[index].append(band)
    else:
        clusters_1[index] = [band]

clusters_1

{0: [636.8,
  638.9,
  640.9,
  643,
  645,
  647.1,
  649.1,
  651.2,
  653.3,
  655.3,
  657.4,
  698.4,
  700.5,
  731.3],
 1: [474.7,
  476.7,
  478.8,
  480.8,
  482.9,
  484.9,
  487,
  489,
  491.1,
  493.1,
  495.2,
  497.2,
  499.3,
  501.3,
  503.4,
  505.5,
  507.5,
  509.6,
  511.6,
  513.7,
  515.7,
  517.8,
  519.8,
  521.9,
  523.9,
  526,
  528],
 2: [751.8,
  753.8,
  755.9,
  757.9,
  760,
  762.1,
  764.1,
  766.2,
  813.4,
  815.4,
  817.5,
  819.5,
  821.6,
  823.6,
  825.7,
  827.7,
  829.8,
  831.8,
  833.9,
  836,
  838,
  840.1,
  842.1,
  844.2,
  846.2,
  848.3,
  850.3,
  852.4,
  854.4,
  856.5,
  858.5,
  860.6,
  862.6,
  864.7,
  866.7],
 3: [400.8],
 4: [406.9,
  431.6,
  433.6,
  435.7,
  437.7,
  439.8,
  441.8,
  443.9,
  445.9,
  448,
  450,
  452.1],
 5: [608.1, 610.1, 612.2, 614.2, 616.3, 706.6, 708.7, 725.1],
 6: [548.6,
  550.6,
  552.7,
  554.7,
  556.8,
  558.8,
  560.9,
  562.9,
  565,
  567,
  569.1,
  571.1,
  573.2],
 7: [659.4,
  661.5,
 

In [41]:
clusters_10 = dict()
for index, band in zip(label_10, cols):
    if index in clusters_10:
        clusters_10[index].append(band)
    else:
        clusters_10[index] = [band]

clusters_10

{0: [622.5,
  624.5,
  626.6,
  628.6,
  630.7,
  632.7,
  634.8,
  636.8,
  638.9,
  640.9,
  643,
  645,
  647.1,
  649.1,
  651.2,
  653.3,
  655.3,
  657.4],
 1: [753.8,
  755.9,
  757.9,
  760,
  762.1,
  817.5,
  819.5,
  821.6,
  823.6,
  825.7,
  827.7,
  829.8,
  831.8,
  833.9,
  836,
  838,
  840.1,
  842.1,
  844.2,
  846.2,
  848.3,
  850.3,
  852.4,
  854.4,
  856.5,
  858.5,
  860.6,
  862.6],
 2: [404.9,
  476.7,
  478.8,
  480.8,
  482.9,
  484.9,
  487,
  489,
  491.1,
  493.1,
  495.2,
  497.2,
  499.3,
  501.3,
  503.4],
 3: [729.2, 731.3],
 4: [398.7,
  536.2,
  538.3,
  540.3,
  542.4,
  544.5,
  546.5,
  548.6,
  550.6,
  552.7,
  554.7,
  556.8,
  558.8,
  560.9,
  562.9,
  565,
  567,
  569.1,
  571.1,
  573.2,
  575.2,
  577.3,
  579.4,
  581.4,
  583.5,
  585.5,
  587.6,
  589.6,
  591.7,
  593.7,
  595.8,
  597.8],
 5: [394.6, 714.8, 716.9, 718.9],
 6: [400.8, 519.8, 521.9, 523.9, 526],
 7: [406.9,
  409,
  443.9,
  445.9,
  448,
  450,
  452.1,
  454.1,
  4

### Try Stacking

In [48]:
from sklearn.ensemble import StackingClassifier

from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

ImportError: cannot import name 'StackingClassifier'

In [1]:
import pandas as pd

In [3]:
comb_6 = pd.read_csv('comb_6_test.csv')


0    15890699
1    15890699
2    15890699
3    15890699
4    15890699
5    15890699
dtype: int64

In [12]:
total_row = comb_6.count()[0]
_num = 1000000
split_num = int(total_row / _num) + 1

In [18]:
_tmp = comb_6.iloc[10:20]
_tmp.to_csv('comb_6_tmp.csv',header=False,index=False)

In [19]:
_tmp

Unnamed: 0,0,1,2,3,4,5
10,0,1,2,3,4,16
11,0,1,2,3,4,17
12,0,1,2,3,4,18
13,0,1,2,3,4,19
14,0,1,2,3,4,20
15,0,1,2,3,4,21
16,0,1,2,3,4,22
17,0,1,2,3,4,23
18,0,1,2,3,4,24
19,0,1,2,3,4,25


In [20]:
for idx in range(split_num):
    _split = comb_6.iloc[idx*_num: (idx+1)*_num]
    _split.to_csv('comb_6_'+ str(idx+1)+'.csv', header=False, index=False)