In [2]:
# Multiclass classification: baseline classifiers (XGBoost, SGD, SVC) compared with a neural network

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors as c
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import seaborn as sns
import pandas as pd


In [3]:
# Read the labelled training CSV and the separate test CSV. header=None makes
# pandas treat the first row as data and number the columns positionally.
df_training, df_testing = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('./dataset/datasetC.csv', './dataset/datasetCTest.csv')
)

# Every column except the last is a feature; the last column is the target.
n_feature_cols = df_training.shape[1] - 1
X_training = df_training.iloc[:, :n_feature_cols]
y_training = df_training.iloc[:, n_feature_cols]

In [4]:
from xgboost import XGBClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Candidate classifiers, keyed by the name the training loop prints.
# XGBoost and SVC are wrapped one-vs-rest; SGDClassifier is used directly.
models = dict(
    xgboost=OneVsRestClassifier(XGBClassifier(random_state=42)),
    SGD=SGDClassifier(random_state=42),
    svc=OneVsRestClassifier(SVC()),
)

# Labels are passed through unchanged (they are NOT shifted to start at zero).
# Hold out 20% of the labelled rows for evaluation; the split is seeded.
x_train, x_test, y_train, y_test = train_test_split(
    X_training,
    y_training,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: the statistics are learned from the training
# split only and then re-applied, unchanged, to the held-out split.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)


In [6]:
# Persist the fitted scaler so that new data can later be transformed with
# exactly the same statistics that were learned from the training split.
import pickle

# The context manager guarantees the file is flushed and closed as soon as the
# dump finishes, instead of leaving the handle open until garbage collection.
with open("myscaler.pkl", "wb") as scaler_file:
    pickle.dump(scaler, scaler_file)

In [7]:
# Fit every candidate classifier on the scaled training split and report its
# accuracy on the held-out split. Iterating over items() pairs each name with
# its estimator directly, without rebuilding key/value lists on every pass.
for model_name, model in models.items():
    model.fit(x_train_scaled, y_train)

    # Predict the held-out rows
    y_test_pred = model.predict(x_test_scaled)

    # Held-out accuracy
    accuracy = accuracy_score(y_test, y_test_pred)

    print(model_name)

    print('- Accuracy : {:.4f}'.format(accuracy))

    # print('\n') emits two line breaks, leaving a blank gap between models
    print('\n')


xgboost
- Accuracy : 0.7970


SGD
- Accuracy : 0.7590


svc
- Accuracy : 0.8330




### Neural Networks

In [8]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, BatchNormalization
from keras.utils import to_categorical, set_random_seed
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# masks and batch shuffling are repeatable across "Restart & Run All".
set_random_seed(42)

# to_categorical uses the raw label value as the one-hot column index, so the
# encoding needs (largest label + 1) columns. The width is computed once over
# both splits and passed explicitly: letting each call infer its own width
# would give the held-out targets a different shape from the model output
# whenever one split happens to lack the highest class.
num_classes = int(max(y_train.max(), y_test.max())) + 1
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)

# Create a Sequential model
model_nn = Sequential()

# First hidden layer: 30 ReLU units fed by one input per feature column
model_nn.add(Dense(30, input_dim=x_train_scaled.shape[1], activation='relu'))
model_nn.add(BatchNormalization())

# Dropout after the first hidden layer
model_nn.add(Dropout(0.5))

# Second hidden layer: 20 ReLU units
model_nn.add(Dense(20, activation='relu'))
model_nn.add(BatchNormalization())

# Dropout after the second hidden layer
model_nn.add(Dropout(0.5))

# Output layer: one softmax unit per one-hot column
model_nn.add(Dense(num_classes, activation='softmax'))

# Compile the model
model_nn.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE(review): fit() below receives no validation data, so both callbacks
# react to the *training* loss only.
# Stop once the training loss has not improved for 10 epochs
early_stopping = EarlyStopping(monitor='loss', patience=10)

# Halve the learning rate after 2 epochs without improvement, down to 1e-5
lr_reduction = ReduceLROnPlateau(monitor='loss', patience=2, verbose=1, factor=0.5, min_lr=0.00001)

# Train the model
model_nn.fit(x_train_scaled, y_train_categorical, epochs=100, batch_size=10, callbacks=[early_stopping, lr_reduction])

# Evaluate on the held-out split, using targets encoded to the same width as
# the training targets
loss, accuracy = model_nn.evaluate(x_test_scaled, y_test_categorical)

print('Test accuracy:', accuracy)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 18: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 26: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 33: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 34/100
Epoch 35/100
Epoch 35: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 36/100
Epoch 37/100
Epoch 37: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 38/100
Epoch 39/100
Epoch 39: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 40/100
Epoch 41/100
Epoch 41: Redu

In [11]:
# Write the trained network to disk so a later cell can reload it
model_nn.save(filepath='model_nn.keras')

In [None]:
import pickle

from keras.models import load_model

# Reload the network that was saved to disk
loaded_model = load_model('model_nn.keras')

# New data = df_testing
# NOTE(review): the whole frame is used as features, i.e. this assumes the
# test CSV has no label column — confirm against the dataset.
x_test_new = df_testing

# Restore the scaler that was pickled after fitting on the training split, so
# the new data gets exactly the same transform. The file is opened in a
# context manager so the handle is closed right after loading.
# SECURITY: pickle.load can execute arbitrary code; only load a file that this
# notebook produced itself.
with open("myscaler.pkl", "rb") as scaler_file:
    scaler = pickle.load(scaler_file)
x_test_new_scaled = scaler.transform(x_test_new)

# Per-class probabilities for every row of the new data
predictions = loaded_model.predict(x_test_new_scaled)

In [None]:
# Convert the per-class probabilities to a class index using argmax
predicted_labels = np.argmax(predictions, axis=1)

# The network is trained on to_categorical(y_train) with the labels left
# unshifted, so output index k already corresponds to original label k.
# No offset is applied: adding 1 would move every prediction to the next class
# and push the top class outside the histogram bins below.
predicted_labels_original = predicted_labels

# Plot the distribution of predicted classes; bin edges sit at k - 0.5 so each
# bar is centred on one output index 0 .. n_outputs - 1
plt.hist(predicted_labels_original, bins=np.arange(predictions.shape[1] + 1) - 0.5, edgecolor='black')
plt.xlabel('Class')
plt.ylabel('Frequency')
plt.title('Distribution of predicted classes')
plt.show()

In [None]:
# Write the predicted labels to disk in NumPy's .npy format
np.save(file='labels22.npy', arr=predicted_labels_original)

In [None]:
# Read the saved predictions back from disk as a sanity check
loaded_labels = np.load(file='labels22.npy')

# Show up to the first 400 predicted labels
print(loaded_labels[:400])