In [14]:
import pandas as pd
import requests
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Fetch the abalone dataset (features plus the `type` target) from the course repository.
data = pd.read_csv(
    'https://raw.githubusercontent.com/nilsonsales/mlclass-2022/master/03_Validation/abalone_dataset.csv'
)

# Preview the first five rows as a quick sanity check.
data.head(n=5)

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,type
0,M,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,3
1,I,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,1
2,I,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,1
3,M,0.55,0.45,0.17,0.81,0.317,0.157,0.22,3
4,I,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,1


In [3]:
# Number of rows per target class.
data.loc[:, 'type'].value_counts()

1    1078
3    1051
2    1003
Name: type, dtype: int64

In [4]:
# The target is the `type` column; the features are every column except the last one.
y = data['type']
X = data.iloc[:, :-1]

# One-hot encode the categorical columns (here: `sex`) so the models receive numeric input.
X = pd.get_dummies(X)

X.head()

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M
0,0.535,0.42,0.15,0.6995,0.2575,0.153,0.24,0,0,1
1,0.51,0.38,0.115,0.5155,0.215,0.1135,0.166,0,1,0
2,0.185,0.13,0.045,0.029,0.012,0.0075,0.0095,0,1,0
3,0.55,0.45,0.17,0.81,0.317,0.157,0.22,0,0,1
4,0.535,0.415,0.15,0.5765,0.3595,0.135,0.225,0,1,0


In [17]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Grid search over the split criterion and the maximum depth of a decision tree,
# scored with 5-fold cross-validation on the training split.
# The tree is seeded so the search selects the same parameters on every run;
# an unseeded tree can break ties between equally good splits differently.
tree_para = {'criterion': ['gini', 'entropy'], 'max_depth': list(range(1, 15))}
clf = GridSearchCV(DecisionTreeClassifier(random_state=42), tree_para, cv=5)
clf.fit(X_train, y_train)
params = clf.best_params_

print(params)


{'criterion': 'gini', 'max_depth': 7}


In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix


def select_best_model(X_train, X_test, y_train, y_test):
    """Fit several classifiers and return the one with the best CV accuracy.

    Each candidate is fitted on the training data, scored with the mean of a
    3-fold ``cross_val_score`` over that same training data, and its
    confusion matrix on the test data is printed. The test data is used for
    reporting only; the winner is chosen by the cross-validated accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels.
    X_test, y_test
        Held-out features and labels, used for the printed confusion matrix.

    Returns
    -------
    The fitted estimator with the highest mean cross-validated accuracy
    (the earliest candidate wins ties), or ``None`` if no candidate scored
    above 0.
    """
    candidates = [
        LogisticRegression(random_state=42, max_iter=3000),
        SVC(kernel='linear', probability=False),
        SVC(kernel='rbf', C=14, gamma=0.5),
        RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=-1),
        MLPClassifier(solver='adam', alpha=0.0001, max_iter=800, random_state=42),
    ]

    winner, top_score = None, 0

    print("\n####### Training Models #######")

    for estimator in candidates:
        print("\nModel: ", type(estimator).__name__)

        # Fit on the full training split; predict() below uses this fit.
        estimator.fit(X_train, y_train)

        fold_scores = cross_val_score(estimator, X_train, y_train, cv=3)
        score = np.mean(fold_scores)
        print('Accuracy: ', score)

        test_confusion = confusion_matrix(y_test, estimator.predict(X_test))
        print("Confusion matrix:\n{}".format(test_confusion))

        # Strict comparison: a later candidate must beat, not equal, the current best.
        if score > top_score:
            winner, top_score = estimator, score

    print("The best model is: ", type(winner).__name__, "with an accuracy of ", top_score)
    return winner


# Compare the candidate classifiers and keep the winner in `model`.
model = select_best_model(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)



####### Training Models #######

Model:  LogisticRegression
Accuracy:  0.6373148686043387
Confusion matrix:
[[253  54  12]
 [ 83 127  86]
 [ 40  73 212]]

Model:  SVC
Accuracy:  0.628645316042951
Confusion matrix:
[[247  62  10]
 [ 76 146  74]
 [ 36  96 193]]

Model:  SVC
Accuracy:  0.6537351098451486
Confusion matrix:
[[240  69  10]
 [ 64 159  73]
 [ 19  88 218]]

Model:  RandomForestClassifier
Accuracy:  0.6145100537825834
Confusion matrix:
[[234  65  20]
 [ 62 151  83]
 [ 21 101 203]]

Model:  MLPClassifier
Accuracy:  0.6683451080336562
Confusion matrix:
[[249  55  15]
 [ 65 146  85]
 [ 19  82 224]]
The best model is:  MLPClassifier with an accuracy of  0.6683451080336562


In [46]:
from sklearn.model_selection import GridSearchCV

def find_best_mlp(X_train, X_test, y_train, y_test):
    """Grid-search MLP hyperparameters with 3-fold CV and return the search.

    Based on: https://datascience.stackexchange.com/a/36087/97065

    The best parameter set is printed first, followed by the mean test score
    (with twice its standard deviation) of every evaluated combination.

    Parameters
    ----------
    X_train, y_train
        Training features and labels the grid search is fitted on.
    X_test, y_test
        Not used by this function; accepted so existing call sites that pass
        all four splits keep working.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    # Seeded so the search is reproducible between runs, matching the seeded
    # MLP used in select_best_model.
    mlp = MLPClassifier(max_iter=800, solver='adam', random_state=42)

    parameter_space = {
        'hidden_layer_sizes': [(50, 50, 50), (50, 100, 50), (100,)],
        'activation': ['tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.05],
        'learning_rate': ['constant', 'adaptive'],
    }

    clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
    clf.fit(X_train, y_train)

    # Best parameter set
    print('Best parameters found:\n', clf.best_params_)

    # All results: mean score, +/- two standard deviations, and the parameters.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))

    return clf

# Tune the MLP. Note: this rebinds `model`, replacing the estimator returned by
# select_best_model with the fitted grid-search object.
model = find_best_mlp(
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

Best parameters found:
 {'activation': 'relu', 'alpha': 0.001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant'}
0.661 (+/-0.018) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant'}
0.664 (+/-0.017) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'adaptive'}
0.659 (+/-0.022) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'constant'}
0.658 (+/-0.025) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive'}
0.657 (+/-0.015) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'constant'}
0.661 (+/-0.013) for {'activation': 'tanh', 'alpha': 0.0001, 'hidden_layer_sizes': (100,), 'learning_rate': 'adaptive'}
0.661 (+/-0.018) for {'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (50, 50, 50), 'learning_rate': 'constant'}


In [47]:
data_app = pd.read_csv('https://raw.githubusercontent.com/nilsonsales/mlclass-2022/master/03_Validation/abalone_app.csv')

# One-hot encode like the training features, then align to the training columns.
# get_dummies only creates columns for categories present in this file, so a
# missing category or a different column order would otherwise give the model
# a frame that does not match the one it was fitted on.
data_app = pd.get_dummies(data_app).reindex(columns=X.columns, fill_value=0)

data_app

Unnamed: 0,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,sex_F,sex_I,sex_M
0,0.600,0.480,0.175,1.2290,0.4125,0.2735,0.4150,0,0,1
1,0.545,0.385,0.150,1.1185,0.5425,0.2445,0.2845,1,0,0
2,0.645,0.520,0.180,1.2850,0.5775,0.3520,0.3170,1,0,0
3,0.640,0.510,0.170,1.3715,0.5670,0.3070,0.4090,0,0,1
4,0.655,0.540,0.215,1.5555,0.6950,0.2960,0.4440,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1040,0.430,0.350,0.105,0.3660,0.1705,0.0855,0.1100,0,1,0
1041,0.475,0.360,0.125,0.4470,0.1695,0.0810,0.1400,1,0,0
1042,0.500,0.405,0.150,0.5965,0.2530,0.1260,0.1850,1,0,0
1043,0.380,0.275,0.095,0.2425,0.1060,0.0485,0.2100,0,1,0


In [49]:
import os

print(' - Aplicando modelo e enviando para o servidor')

# Predict labels for the application set with the current `model`.
y_pred = model.predict(data_app)

# Send the model's predictions to the course server.
URL = "https://aydanomachado.com/mlclass/03_Validation.php"

# Key sent as `dev_key` with the submission. Prefer supplying it through the
# MLCLASS_DEV_KEY environment variable so it does not have to live in the
# notebook; the literal is kept only as a backward-compatible fallback.
DEV_KEY = os.environ.get("MLCLASS_DEV_KEY", "720pster")

# Form payload for the server; the predictions are serialized as a JSON array.
data_json = {'dev_key': DEV_KEY,
             'predictions': pd.Series(y_pred).to_json(orient='values')}

# Send the request and keep the response object. The timeout prevents an
# unresponsive server from hanging the kernel indefinitely.
r = requests.post(url=URL, data=data_json, timeout=30)

# Extract and print the response text.
# `pastebin_url` is not used anywhere else in the visible notebook; it is kept
# in case a later cell reads it.
pastebin_url = r.text
print(" - Resposta do servidor:\n", r.text, "\n")

# Raise on 4xx/5xx after printing, so a failed submission is not mistaken for success.
r.raise_for_status()

 - Aplicando modelo e enviando para o servidor
 - Resposta do servidor:
 {"status":"success","dev_key":"720pster","accuracy":0.6545454545454545,"old_accuracy":0} 

