In [85]:
import pandas as pd
import requests
import numpy as np

import matplotlib.pyplot as plt

In [86]:
data = pd.read_csv('https://raw.githubusercontent.com/nilsonsales/mlclass-2022/master/03_Validation/abalone_dataset.csv')

data.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
0,M,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,3
1,I,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,1
2,I,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,1
3,M,0.55,0.45,0.17,0.81,0.317,0.157,0.22,3
4,I,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,1


In [134]:
# Check if the classes are balanced
data['type'].value_counts()

1    1078
3    1051
2    1003
Name: type, dtype: int64

In [160]:
# Remove outliers
data_mod = data.copy()
Q1 = data_mod.quantile(0.25)
Q3 = data_mod.quantile(0.75)
IQR = Q3 - Q1

data_mod = data_mod[~((data_mod < (Q1 - 1.5 * IQR)) |(data_mod > (Q3 + 1.5 * IQR))).any(axis=1)]

In [185]:
X = data_mod[ data_mod.columns[:-1] ]
y = data_mod['type']

# Enconding the sex to a OneHotEncoding format
X = pd.get_dummies(X)

X.shape

(3011, 10)

In [175]:
# from sklearn.preprocessing import PowerTransformer

# normalizer = PowerTransformer().fit(X)

# X_mod = pd.DataFrame(normalizer.transform(X), columns = X.columns)

# X_mod.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M
0,0.002786,0.037969,0.300298,-0.13726,-0.36735,-0.118401,0.163386,-0.665907,-0.704115,1.32946
1,-0.231265,-0.406049,-0.677322,-0.586984,-0.612197,-0.554721,-0.453637,-0.665907,1.420223,-0.752185
2,0.149014,0.400576,0.888888,0.114795,-0.045116,-0.076347,0.003192,-0.665907,-0.704115,1.32946
3,0.002786,-0.01995,0.300298,-0.433376,0.1717,-0.312381,0.043676,-0.665907,1.420223,-0.752185
4,1.87191,2.216477,1.972958,1.916002,1.247082,2.206303,2.104631,1.501711,-0.704115,-0.752185


In [192]:
# Estratégia pra usar PCA
from sklearn.decomposition import PCA  # Make an instance of the Model

X_mod = X

#pca = PCA(n_components='mle')
pca = PCA()

pca.fit(X_mod)
X_mod = pca.transform(X_mod)

X_mod.shape

(3011, 10)

In [193]:
from sklearn.model_selection import train_test_split

# Splitting our data into train and validation (calling it test)
X_train, X_test, y_train, y_test = train_test_split(X_mod, y, test_size=0.2, random_state=0)

In [194]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score


def select_best_model(X_train, X_test, y_train, y_test):
    model_1 = LogisticRegression(random_state=42, max_iter=3000)
    model_2 = SVC(kernel='linear', probability=False)
    model_3 = SVC(kernel='rbf', C=14, gamma=0.5)
    #model_4 = RandomForestClassifier(max_depth=90, min_samples_split=12, n_estimators=42)
    model_4 = RandomForestClassifier(n_estimators=84, min_samples_split=10, min_samples_leaf=10, max_features='sqrt', max_depth=20, bootstrap=True)
    model_5 = MLPClassifier(solver='adam', alpha=0.0001, max_iter=1000, random_state=42)

    models = [model_1, model_2, model_3, model_4, model_5]
    
    best_model = None
    best_accuracy = 0

    print("\n####### Training Models #######")

    for model in models:
        print("\nModel: ", type(model).__name__)
        model.fit(X_train, y_train)
        train_accuracy = np.mean(cross_val_score(model, X_train, y_train, cv=3))
        print(f'Train accuracy: {train_accuracy}')

        y_pred = model.predict(X_test)
        val_accuracy = accuracy_score(y_test, y_pred)
        confusion = confusion_matrix(y_test, y_pred)
        print(f"Val accuracy: {val_accuracy}\nConfusion matrix:\n{confusion}")
        
        if val_accuracy > best_accuracy:
            best_model = model
            best_accuracy = val_accuracy

    print("The best model is: ", type(best_model).__name__, "with validation accuracy of ", best_accuracy)
    return best_model


model = select_best_model(X_train, X_test, y_train, y_test)



####### Training Models #######

Model:  LogisticRegression
Train accuracy: 0.6274910482200476
Val accuracy: 0.615257048092869
Confusion matrix:
[[165  43   6]
 [ 49  89  56]
 [ 28  50 117]]

Model:  SVC
Train accuracy: 0.6204362485235645
Val accuracy: 0.615257048092869
Confusion matrix:
[[164  47   3]
 [ 54  96  44]
 [ 28  56 111]]

Model:  SVC
Train accuracy: 0.6424375963370942
Val accuracy: 0.6517412935323383
Confusion matrix:
[[150  55   9]
 [ 26 119  49]
 [ 15  56 124]]

Model:  RandomForestClassifier
Train accuracy: 0.6490814267775973
Val accuracy: 0.6583747927031509
Confusion matrix:
[[164  38  12]
 [ 35 106  53]
 [ 14  54 127]]

Model:  MLPClassifier
Train accuracy: 0.6611284159878427
Val accuracy: 0.6749585406301825
Confusion matrix:
[[158  47   9]
 [ 25 121  48]
 [ 12  55 128]]
The best model is:  MLPClassifier with validation accuracy of  0.6749585406301825


In [195]:
from sklearn.model_selection import GridSearchCV

'''
    Based on: https://datascience.stackexchange.com/a/36087/97065
'''

def find_best_mlp(X_train, X_test, y_train, y_test):
    mlp = MLPClassifier(max_iter=1000, solver='adam')

    parameter_space = {
        'hidden_layer_sizes': [(50,40,40), (60,40), (50,50), (100,)], #[(50,50,50), (50,100,50), (100,)],
        #'activation': ['tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.05],
        'learning_rate': ['constant','adaptive'],
    }


    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
    clf.fit(X_train, y_train)

    # Best parameter set
    print('Best parameters found:\n', clf.best_params_)

    # All results
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.5f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return clf

mlp = find_best_mlp(X_train, X_test, y_train, y_test)

Best parameters found:
 {'alpha': 0.01, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant'}
0.65615 (+/-0.004) for {'alpha': 0.0001, 'hidden_layer_sizes': (50, 40, 40), 'learning_rate': 'constant'}
0.65781 (+/-0.008) for {'alpha': 0.0001, 'hidden_layer_sizes': (50, 40, 40), 'learning_rate': 'adaptive'}
0.65740 (+/-0.016) for {'alpha': 0.0001, 'hidden_layer_sizes': (60, 40), 'learning_rate': 'constant'}
0.66196 (+/-0.015) for {'alpha': 0.0001, 'hidden_layer_sizes': (60, 40), 'learning_rate': 'adaptive'}
0.65407 (+/-0.009) for {'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'constant'}
0.65739 (+/-0.006) for {'alpha': 0.0001, 'hidden_layer_sizes': (50, 50), 'learning_rate': 'adaptive'}
0.65283 (+/-0.015) for {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.65822 (+/-0.013) for {'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive'}
0.66238 (+/-0.017) for {'alpha': 0.001, 'hidden_layer_sizes': (50, 40, 40), 'le

In [196]:
# Checking the accuracy again using the best parameters and use the validation set

model2 = MLPClassifier(max_iter=1000)
model2.set_params(**mlp.best_params_)

model2.fit(X_train, y_train)
accuracy = np.mean(cross_val_score(model2, X_train, y_train, cv=3))
print('Train accuracy: ', accuracy)

y_pred = model.predict(X_test)
val_accuracy = accuracy_score(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)
print(f"Val accuracy: {val_accuracy}\nConfusion matrix:\n{confusion}")

Train accuracy:  0.6611294511748855
Val accuracy: 0.6749585406301825
Confusion matrix:
[[158  47   9]
 [ 25 121  48]
 [ 12  55 128]]


In [35]:
# Using ALL the data available for a final model
# Based on: https://datascience.stackexchange.com/questions/33008/is-it-always-better-to-use-the-whole-dataset-to-train-the-final-model

# model3 = MLPClassifier(max_iter=1000)
# model3.set_params(**mlp.best_params_)

# model3.fit(X_mod, y)

MLPClassifier(alpha=0.05, hidden_layer_sizes=(60, 40), learning_rate='adaptive',
              max_iter=900)

In [197]:
data_app = pd.read_csv('https://raw.githubusercontent.com/nilsonsales/mlclass-2022/master/03_Validation/abalone_app.csv')

# Apply the same tranformation to the test set
data_app = pd.get_dummies(data_app)

#data_app

# Apply PCA if used
data_app = pca.transform(data_app)

data_app.shape

(1045, 10)

In [198]:
print(' - Aplicando modelo e enviando para o servidor')

y_pred = model2.predict(data_app)

# Enviando previsões realizadas com o modelo para o servidor
URL = "https://aydanomachado.com/mlclass/03_Validation.php"

#TODO Substituir pela sua chave aqui
DEV_KEY = "720pster"

# json para ser enviado para o servidor
data_json = {'dev_key':DEV_KEY,
             'predictions':pd.Series(y_pred).to_json(orient='values')}

# Enviando requisição e salvando o objeto resposta
r = requests.post(url = URL, data = data_json)

# Extraindo e imprimindo o texto da resposta
pastebin_url = r.text
print(" - Resposta do servidor:\n", r.text, "\n")

 - Aplicando modelo e enviando para o servidor
 - Resposta do servidor:
 {"status":"success","dev_key":"720pster","accuracy":0.6526315789473685,"old_accuracy":0.6622009569378} 



In [None]:
# {'activation': 'relu', 'alpha': 0.01, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant'} -> "accuracy":0.6602870813397129