In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_predict,cross_val_score
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
import seaborn as sns; sns.set()
import time
from sklearn.feature_selection import VarianceThreshold
from imblearn.pipeline import Pipeline
from sklearn import neighbors


### Εισαγωγή dataset από αρχείο CSV

In [None]:
# Load the Dry Bean dataset from a CSV file, resolved relative to the current
# working directory.
data = pd.read_csv("./Dry_Bean.csv")

## Εξαγωγή πληροφοριών του dataset

In [None]:
# Dataset dimensions: one row per sample; one column is set aside for the
# label, so it is not counted as a feature.
n_samples, n_columns = data.shape
n_features = n_columns - 1

In [None]:
# Separate the target column from the predictors.
y = data[["Class"]]  # single-column DataFrame (still 2-D at this point)
x = data.drop(columns='Class')

## train-test split

In [None]:
# 80/20 train/test split; the fixed random_state makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Feature scaling is not applied at this stage; the pipeline section further
# down adds a StandardScaler step.

# Flatten the single-column label frames into 1-D arrays.
y_train = np.ravel(y_train)
y_test = np.ravel(y_test)

# Re-assemble the full dataset in split order (train rows first, then test rows).
# Stacking along axis 0 keeps the 2-D layout without hard-coding the number of
# rows and columns, so this cell keeps working if the dataset changes.
x = np.concatenate((np.asarray(x_train), np.asarray(x_test)), axis=0)
y = np.concatenate((y_train, y_test))

In [None]:
# Fit / predict durations of the out-of-the-box runs, keyed by classifier name.
train_time_base, pred_time_base = {}, {}
# Predictions of every non-optimized classifier, keyed by classifier name.
predictions = {}

## Εκτέλεση MLP Out-of-the-box

In [None]:
# Baseline: MLP built with no arguments, trained on the unmodified training split.
clf = MLPClassifier()
# Time the training step.
start_time = time.time()
clf.fit(x_train, y_train)
train_time_base["MLP"] = time.time() - start_time
# Time the prediction step on the held-out test split.
start_time = time.time()
preds = clf.predict(x_test)
pred_time_base["MLP"] = time.time() - start_time
# Keep the predictions so they can be scored next to the other classifiers later.
predictions["MLP"] = preds
print(classification_report(y_test, preds))

In [None]:
# 10-fold cross-validation of the baseline MLP on the training split, run once
# per metric (accuracy first, then F1).
# NOTE(review): F1 is 'f1_weighted' here, while the SVM and dummy CV cells use
# 'f1_macro' - confirm the difference is intended.
scores_MLP_acc, scores_MLP_f1 = (
    cross_val_score(clf, x_train, y_train, cv=10, n_jobs=-1, scoring=metric)
    for metric in ('accuracy', 'f1_weighted')
)

In [None]:
# Bar chart of the mean cross-validated accuracy and F1 of the baseline MLP.
ax = plt.gca()
ax.bar(['accuracy', 'f1'], [np.mean(scores_MLP_acc), np.mean(scores_MLP_f1)])
ax.set_xlabel("scoring method")
ax.set_ylabel("score")
ax.set_title("10 fold CV MLP")
plt.show()

## Εκτέλεση SVM Out-of-the-box

In [None]:
from sklearn.svm import SVC # "Support vector classifier"
# Baseline: SVC built with no arguments, trained on the unmodified training split.
model = SVC()
# Time the training step.
start_time = time.time()
model.fit(x_train, y_train)
train_time_base["SVM"] = time.time() - start_time
# Time the prediction step on the held-out test split.
start_time = time.time()
preds =model.predict(x_test)
pred_time_base["SVM"] = time.time() - start_time
# Keep the predictions so they can be scored next to the other classifiers later.
predictions["SVM"] = preds
print(classification_report(y_test, preds))


In [None]:
# 10-fold cross-validation of the baseline SVM on the training split, run once
# per metric (accuracy first, then macro-averaged F1).
scores_SVM_acc, scores_SVM_f1 = (
    cross_val_score(model, x_train, y_train, cv=10, n_jobs=-1, scoring=metric)
    for metric in ('accuracy', 'f1_macro')
)

In [None]:
# Bar chart of the mean cross-validated accuracy and F1 of the baseline SVM.
ax = plt.gca()
ax.bar(['accuracy', 'f1'], [np.mean(scores_SVM_acc), np.mean(scores_SVM_f1)])
ax.set_xlabel("scoring method")
ax.set_ylabel("score")
ax.set_title("10 fold CV SVM")
plt.show()

## Dummy Classifiers for scale

In [None]:
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix
from sklearn.metrics import precision_recall_fscore_support,f1_score

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Per-classifier test-set scores, filled in by metrics_info().
accuracy = {}
f1 = {}

def metrics_info(classifier_name, preds):
  """Score `preds` against the global `y_test` and record both results.

  The accuracy is stored in `accuracy[classifier_name]` and the macro-averaged
  F1 in `f1[classifier_name]`; an existing entry for the same name is replaced.

  Args:
    classifier_name: Key under which the two scores are stored.
    preds: Predicted labels, compared against `y_test`.
  """
  # Both dicts are only mutated, never rebound, so `global` is not needed.
  accuracy[classifier_name] = accuracy_score(y_test, preds)
  f1[classifier_name] = f1_score(y_test, preds, average="macro")
def bar_plot(classifiers, scores, title):
  """Draw one bar per classifier on a 10x6 figure and show it.

  Args:
    classifiers: Labels used as x-axis tick labels, one per bar.
    scores: Bar heights, in the same order as `classifiers`.
    title: Title of the figure.
  """
  fig, ax = plt.subplots(figsize=(10, 6))
  positions = np.arange(len(classifiers))
  ax.bar(positions, scores, align='center', alpha=0.9)
  # Rotated tick labels keep long classifier names from overlapping.
  ax.set_xticks(positions)
  ax.set_xticklabels(classifiers, rotation=45)
  ax.set_ylabel('Score')
  ax.set_title(title)
  plt.show()

In [None]:
# Dummy classifiers used as a reference point for the MLP and SVM scores.
# The two randomized strategies get a fixed seed (the same value the
# train/test split uses) so the reported scores are reproducible across runs.
dc_uniform = DummyClassifier(strategy="uniform", random_state=0)
dc_constant_horoz = DummyClassifier(strategy="constant", constant="HOROZ")
dc_constant_sira = DummyClassifier(strategy="constant", constant="SIRA")
dc_constant_barbunya = DummyClassifier(strategy="constant", constant="BARBUNYA")
dc_constant_seker = DummyClassifier(strategy="constant", constant="SEKER")
dc_constant_bombay = DummyClassifier(strategy="constant", constant="BOMBAY")
dc_constant_cali = DummyClassifier(strategy="constant", constant="CALI")
dc_constant_dermason = DummyClassifier(strategy="constant", constant="DERMASON")
dc_most_frequent = DummyClassifier(strategy="most_frequent")
dc_stratified = DummyClassifier(strategy="stratified", random_state=0)

# Display label -> estimator. The insertion order is the order in which the
# classifiers appear in the score table and the bar charts below.
dummy_classifiers = {
  "DC Uniform": dc_uniform,
  "DC Constant horoz": dc_constant_horoz,
  "DC Constant sira": dc_constant_sira,
  "DC Constant barbunya": dc_constant_barbunya,
  "DC Constant seker": dc_constant_seker,
  "DC Constant bombay": dc_constant_bombay,
  "DC Constant cali": dc_constant_cali,
  "DC Constant dermason": dc_constant_dermason,
  "DC Most Frequent": dc_most_frequent,
  "DC Stratified": dc_stratified,
}

# Fit on the train set and predict on the test set, timing each step separately.
for label, dummy in dummy_classifiers.items():
  start_time = time.time()
  dummy.fit(x_train, y_train)
  train_time_base[label] = time.time() - start_time

  start_time = time.time()
  predictions[label] = dummy.predict(x_test)
  pred_time_base[label] = time.time() - start_time

# Score the two real baselines first, then every dummy classifier.
for label in ["MLP", "SVM", *dummy_classifiers]:
  metrics_info(label, predictions[label])

classifier_labels = list(accuracy.keys())
classifier_accuracy = list(accuracy.values())
classifier_f1 = list(f1.values())

# Named `score_table` rather than `data`: reusing `data` would overwrite the
# DataFrame loaded with pd.read_csv earlier in the notebook and break any
# re-run of the cells that read it.
score_table = pd.DataFrame(
  [classifier_accuracy, classifier_f1],
  index=["Accuracy Score", "F1 Score"],
  columns=classifier_labels,
)
print(score_table)

# Scores are plotted as percentages.
bar_plot(classifier_labels, [score * 100 for score in classifier_accuracy], "Accuracy Score (Out of the box)")
bar_plot(classifier_labels, [score * 100 for score in classifier_f1], "F1 Score (Out of the box)")



In [None]:
# Dummy classifier built with no arguments; it is cross-validated in the next cell.
dum = DummyClassifier()

In [None]:
# 10-fold cross-validation of the dummy classifier on the training split, run
# once per metric (accuracy first, then macro-averaged F1).
scores_DUM_acc, scores_DUM_f1 = (
    cross_val_score(dum, x_train, y_train, cv=10, n_jobs=-1, scoring=metric)
    for metric in ('accuracy', 'f1_macro')
)

In [None]:
# Bar chart of the mean cross-validated accuracy and F1 of the dummy classifier.
ax = plt.gca()
ax.bar(['accuracy', 'f1'], [np.mean(scores_DUM_acc), np.mean(scores_DUM_f1)])
ax.set_xlabel("scoring method")
ax.set_ylabel("score")
ax.set_title("10 fold CV DUMMY")
plt.show()

## Improvement

### Preprocessing

In [None]:
# Variance filter: fitted on the training split only, then the same column
# selection (boolean mask) is applied to the test split.
selector = VarianceThreshold(threshold=0.00001)
train_reduced = selector.fit(x_train).transform(x_train)
mask = selector.get_support()
test_reduced = np.asarray(x_test)[:, mask]
print(f"Features Used = {test_reduced.shape[1]}")

In [None]:
# Re-train a default MLP on the variance-filtered features.
# The results go under their own key so the out-of-the-box "MLP" entries in
# train_time_base / pred_time_base / predictions are not overwritten; otherwise
# re-running the baseline comparison cell would score this model instead of
# the baseline.
clf = MLPClassifier()
start_time = time.time()
clf.fit(train_reduced, y_train)
train_time_base["MLP (VarianceThreshold)"] = time.time() - start_time
start_time = time.time()
preds = clf.predict(test_reduced)
pred_time_base["MLP (VarianceThreshold)"] = time.time() - start_time
predictions["MLP (VarianceThreshold)"] = preds
print(classification_report(y_test, preds))

In [None]:
# Re-train a default SVC on the variance-filtered features.
# The results go under their own key so the out-of-the-box "SVM" entries in
# train_time_base / pred_time_base / predictions are not overwritten; otherwise
# re-running the baseline comparison cell would score this model instead of
# the baseline.
model = SVC()
start_time = time.time()
model.fit(train_reduced, y_train)
train_time_base["SVM (VarianceThreshold)"] = time.time() - start_time
start_time = time.time()
preds = model.predict(test_reduced)
pred_time_base["SVM (VarianceThreshold)"] = time.time() - start_time
predictions["SVM (VarianceThreshold)"] = preds
print(classification_report(y_test, preds))

### Pipeline

In [None]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler # φέρνουμε τον StandarScaler ως transformer που έχει .transform kai ΄όχι ως scale()
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.decomposition import PCA

# Pre-processing steps shared by both pipelines, in execution order:
# variance filter -> standardization -> random over-sampling -> PCA.
selector = VarianceThreshold(threshold=0.0001)
scaler = StandardScaler()
ros = RandomOverSampler()
pca = PCA()
preprocessing_steps = [
    ('selector', selector),
    ('scaler', scaler),
    ('sampler', ros),
    ('pca', pca),
]

# --- MLP pipeline: time the fit and the prediction separately ---
pipe = Pipeline(steps=[*preprocessing_steps, ('clf', clf)], memory='tmp')
timer = time.time()
pipe.fit(x_train, y_train)
demo_fit_time = time.time() - timer
timer = time.time()
pred_mlp = pipe.predict(x_test)
demo_pred_time = time.time() - timer
print(classification_report(y_test, pred_mlp))

# --- SVM pipeline: same pre-processing, different final estimator ---
pipe_svm = Pipeline(steps=[*preprocessing_steps, ('clf', model)], memory='tmp')
timer = time.time()
pipe_svm.fit(x_train, y_train)
demo_fit_time_svm = time.time() - timer
timer = time.time()
pred_svm = pipe_svm.predict(x_test)
demo_pred_time_svm = time.time() - timer
print(classification_report(y_test, pred_svm))


### GridSearch


In [None]:
from sklearn.model_selection import GridSearchCV

In [270]:
# Candidate variance thresholds: 0, 5e-05, 1e-04 and 1.5e-04.
# Built from an integer range scaled by the step, because np.arange with a
# fractional step derives its length from a floating-point division and NumPy's
# documentation warns the result can be inconsistent. .tolist() yields plain
# Python floats, so the printed grid looks the same on every NumPy version.
vthreshold = (np.arange(4) * 0.00005).tolist()
print(vthreshold)
# Candidate numbers of PCA components to keep.
n_components = [9,10,11,12]

[0.0, 5e-05, 0.0001, 0.00015000000000000001]


In [271]:
# Grid search over the variance threshold and the number of PCA components of
# the MLP pipeline, scored with weighted F1 under 10-fold cross-validation.
estimator_mlp_f1 = GridSearchCV(
    pipe,
    {
        'selector__threshold': vthreshold,
        'pca__n_components': n_components,
    },
    cv=10,
    scoring='f1_weighted',
    n_jobs=-1,
)
estimator_mlp_f1.fit(x_train, y_train)



In [273]:
# Score of the best parameter combination found by the grid search
# (the search was configured with scoring='f1_weighted').
estimator_mlp_f1.best_score_


0.9304946542283187

In [274]:
# Shell escape: run the nvidia-smi command-line tool and show its output.
# NOTE(review): nothing in the visible cells uses the GPU - confirm this cell is still needed.
!nvidia-smi

Wed Nov 23 23:00:53 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 516.40       Driver Version: 516.40       CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA GeForce ... WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   52C    P0    28W /  N/A |      0MiB /  6144MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces