In [0]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; the project folder used later
# in this notebook sits underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

In [0]:
import pandas as pd
import numpy as np
import os
import glob

In [0]:
# Make the project folder on the mounted Drive the working directory.
project_dir = '/content/drive/My Drive/Colab Notebooks/Miniprojects/PriyaPrajna'
os.chdir(project_dir)

# Prefix (with trailing slash) that the file glob patterns are appended to.
datapath = '{}/Data/'.format(os.getcwd())

In [0]:
# Glob patterns, one per (class folder, file extension) pair, with the class
# folder varying slowest.
class_folder_patterns = ["/Malignant*/", "/Premalignant*/", "/Normal*/"]
file_name_patterns = ["*PRN"]

TypeList = []
for folder_pattern in class_folder_patterns:
    for name_pattern in file_name_patterns:
        TypeList.append(folder_pattern + name_pattern)
TypeList

In [0]:
# Read every file matched by the patterns in TypeList and assemble one table
# with a column per file, named after the file's stem.
# Each file is parsed as whitespace-separated text with two columns; only the
# second one (position 1) is kept and the last line of the file is skipped.
spectra = []
for ftype in TypeList:
    # sorted() fixes the column order; glob yields files in filesystem order,
    # which is not guaranteed to be the same from run to run.
    for file in sorted(glob.iglob(datapath + os.path.normpath(ftype))):
        colname = os.path.splitext(os.path.basename(file))[0]
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True.
        tempdf = pd.read_csv(file, sep=r'\s+', usecols=[1], engine='python',
                             skipfooter=1, names=('Wavelength', colname),
                             dtype={'Wavelength': np.float64, colname: np.float64})
        spectra.append(tempdf)

# Concatenate once instead of growing the frame inside the loop (which copies
# the accumulated data on every iteration). With no matching files the result
# is an empty frame, as before.
df = pd.concat(spectra, axis=1) if spectra else pd.DataFrame()

In [0]:
# Preview the first five rows of the assembled table.
df.head(n=5)

In [0]:
# Put one sample per row: the former column names (file stems) become the index.
df = df.T  # .T is an attribute, equivalent to calling df.transpose()
label_classes = ["M", "P", "N"]
# The class letter is taken from the first character of each sample name.
df['Label'] = df.index.str.get(0)
# NOTE(review): df2 is a second name for the same object, not a copy, so later
# in-place changes to df are visible through df2 as well - confirm this is intended.
df2 = df

In [0]:
# Encode each class letter as its position in label_classes (M -> 0, P -> 1, N -> 2).
# The codes are derived from the sample names in the index rather than from the
# current 'Label' column, so this cell can be re-run safely; applying
# label_classes.index to an already-encoded column fails with "0 is not in list".
label_codes = {letter: code for code, letter in enumerate(label_classes)}
first_letters = df.index.str[0]
unexpected_letters = [letter for letter in first_letters.unique()
                      if letter not in label_codes]
if unexpected_letters:
    # Same exception type that list.index raised, but naming the offenders.
    raise ValueError("Sample names with an unknown class prefix: %r"
                     % (unexpected_letters,))
df['Label'] = [label_codes[letter] for letter in first_letters]

In [0]:
# Split the samples by class code: 0 = malignant, 1 = premalignant, 2 = normal.
sample_names = df.index.to_numpy()
feature_columns = df.columns != 'Label'
malignant_rows = df['Label'] == 0

# Per-class sample counts, their total, and the number of feature columns
# (every column except 'Label').
n_M = int(malignant_rows.sum())
n_PM = int((df['Label'] == 1).sum())
n_N = int((df['Label'] == 2).sum())
n = n_M + n_PM + n_N
p = df.shape[1] - 1

# Malignant feature rows with the sample name prepended as column 0.
X_M = np.column_stack((sample_names[malignant_rows],
                       df.loc[malignant_rows, feature_columns].to_numpy()))
X_M

In [0]:
# Premalignant (code 1) and normal (code 2) feature rows, each with the sample
# name prepended as column 0.
premalignant_rows = df['Label'] == 1
normal_rows = df['Label'] == 2
non_label_columns = df.columns != 'Label'

X_PM = np.column_stack((sample_names[premalignant_rows],
                        df.loc[premalignant_rows, non_label_columns]))
X_N = np.column_stack((sample_names[normal_rows],
                       df.loc[normal_rows, non_label_columns]))

In [0]:
# Class codes for each group, kept two-dimensional (rows x 1) and selected with
# the same row filters as the feature blocks.
y_M, y_PM, y_N = (
    df.loc[df['Label'] == class_code, ['Label']].to_numpy()
    for class_code in (0, 1, 2)
)

In [0]:
# Stack the three classes row-wise in the order malignant, premalignant, normal;
# y is flattened to one class code per row of X.
X = np.vstack((X_M, X_PM, X_N))
y = np.vstack((y_M, y_PM, y_N)).ravel()

In [0]:
# Show the assembled feature array and the class-code vector.
for heading, payload in (("Independent variables:\n", X),
                         ("\n\nOutput variable:\n", y)):
    print(heading, payload)

# 1D-CNN

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [0]:
# Column 0 of X holds the sample names, which makes X an object-dtype array.
# Drop that column first and cast the remaining values to float so the network
# receives a numeric array, then append a channel axis of length 1:
# (samples, features) -> (samples, features, 1).
X_cnn = X[:, 1:].astype(np.float32)[..., np.newaxis]
print(X_cnn.shape)

In [0]:
from keras.utils import to_categorical
# One-hot encode the integer class codes. num_classes is pinned to the number
# of entries in label_classes so the encoded width stays fixed even when the
# highest class code does not occur in y (otherwise it is inferred as max(y)+1).
y_cnn = to_categorical(y, num_classes=len(label_classes))
print(y_cnn.shape)

In [0]:
from keras import layers
from keras import regularizers
from keras import optimizers
from keras.callbacks import EarlyStopping
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import GlobalAveragePooling1D
from keras.layers import AveragePooling1D
from keras.layers import Dense, Flatten, Dropout
from keras.layers import BatchNormalization
from keras.models import Sequential

In [0]:
# Length of axis 1 of X_cnn; create_model() reads this for its input shape.
number_of_features = int(X_cnn.shape[1])

In [0]:
# Function to create 1D CNN model
def create_model(n_features=None, n_classes=3, learning_rate=1e-04,
                 dropout_rate=0.1):
  """Build and compile the 1D CNN classifier.

  Architecture: two blocks of Conv1D (3 filters, kernel size 16, ReLU) ->
  MaxPooling1D(3) -> BatchNormalization, followed by Flatten, Dropout, a
  64-unit ReLU dense layer and a softmax output layer.

  All parameters are optional; calling create_model() with no arguments builds
  the same network as before these parameters existed.

  Args:
    n_features: Length of each input sequence. When None, the module-level
      number_of_features is read at call time.
    n_classes: Number of units in the softmax output layer.
    learning_rate: Step size passed to the Adam optimizer.
    dropout_rate: Rate of the Dropout layer placed after Flatten.

  Returns:
    A Sequential model compiled with categorical cross-entropy loss and the
    accuracy metric.
  """
  if n_features is None:
    n_features = number_of_features

  network = Sequential()

  network.add(Conv1D(3, 16, activation='relu', input_shape=(n_features, 1)))
  network.add(MaxPooling1D(3))
  network.add(BatchNormalization())

  network.add(Conv1D(3, 16, activation='relu'))
  network.add(MaxPooling1D(3))
  network.add(BatchNormalization())

  network.add(Flatten())
  network.add(Dropout(dropout_rate))
  # An L2 factor of 0.0 adds no penalty; the argument is kept as a tuning hook.
  network.add(layers.Dense(units=64, activation='relu',
                           kernel_regularizer=regularizers.l2(0.0)))

  network.add(layers.Dense(units=n_classes, activation='softmax'))

  # Compile neural network
  try:
    opt = optimizers.Adam(learning_rate=learning_rate)
  except TypeError:
    # Older Keras releases reject `learning_rate` and only accept `lr`.
    opt = optimizers.Adam(lr=learning_rate)
  network.compile(optimizer=opt,
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
  return network

In [0]:
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [0]:
## Grid search to find optimal batch size and epochs
from sklearn.model_selection import StratifiedKFold

# Create model
model = KerasClassifier(build_fn = create_model, verbose = 0)
# Define the grid search parameters
batch_size = [8, 16, 32]
epochs = [100, 500]
param_grid = dict(batch_size = batch_size, epochs = epochs)
# X_cnn is ordered class by class and y_cnn is one-hot encoded. For such a
# target GridSearchCV's default splitter is a plain, unshuffled KFold, which
# would produce folds dominated by a single class. Build stratified, shuffled
# splits from the integer codes in y instead; the fixed seed keeps the folds
# identical across runs.
cv_splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 0)
cv_splits = list(cv_splitter.split(X_cnn, y))
grid = GridSearchCV(estimator = model, param_grid = param_grid, cv = cv_splits)
grid_result = grid.fit(X_cnn, y_cnn)
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))