In [1]:
import pandas as pd
import numpy as np
import functools as ft
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.model_selection import cross_validate

In [2]:
# Fetching data
carcass_df = pd.read_csv('UnderSampledDataSet.csv')
carcass_df.shape

(47212, 12)

In [3]:
# Sample 40% of the dataset, 
# making it quicker to train 
# and especially to grid search parameters 
carcass_df = carcass_df.sample(frac=0.40)
carcass_df.shape

(18885, 12)

In [4]:
# Removing data not used in training models 
X = carcass_df.copy()
X = X.drop(['Group', 
            'Killnumber', 
            'KillDate', 
            'WeightWarm', 
            'ClassificationTime', 
            'PartCassation'], axis=1)

# Normalizing the numerical data 
X[['Weight', 'AgeMonths']] = preprocessing.normalize(
    X[['Weight', 'AgeMonths']])
X.dtypes

AnimalType      int64
Class           int64
Fat             int64
Weight        float64
AgeMonths     float64
Remark         object
dtype: object

In [5]:
# One Hot Encoding for categorical data

# Using One Hot Encoding to add the top ten remarks as binary features 
remark_sorted = X['Remark'].value_counts()
top_remarks = []
nr_of_remarks = 12
for key in range(nr_of_remarks):
    top_remarks.append(remark_sorted.keys()[key])
    
df_encoded = pd.get_dummies(X['Remark'], columns=['Remark'])
for col in df_encoded.columns:
    if col in top_remarks:
        X['Remark',col] = df_encoded[col]

X = X.drop(['Remark'], axis=1)

# Using One Hot Encoding to add the top ten Classes as binary features 
class_sorted = X['Class'].value_counts()
top_class = []
nr_of_classes = 8
for key in range(nr_of_classes):
    top_class.append(class_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Class'], columns=['Class'])
for col in df_encoded.columns:
    if col in top_class:
        X['Class',col] = df_encoded[col]

X = X.drop(['Class'], axis=1)

# Using One Hot Encoding to add the top ten Fats as binary features 
fat_sorted = X['Fat'].value_counts()
top_fat = []
nr_of_fats = 8
for key in range(nr_of_fats):
    top_fat.append(fat_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Fat'], columns=['Fat'])
for col in df_encoded.columns:
    if col in top_fat:
        X['Fat',col] = df_encoded[col]

X = X.drop(['Fat'], axis=1)

# Using One Hot Encoding to add AnimalType as binary features 
df_encoded = pd.get_dummies(X['AnimalType'], columns=['AnimalType'])
for col in df_encoded.columns:
    X['AnimalType',col] = df_encoded[col]

X = X.drop(['AnimalType'], axis=1)
X.dtypes

Weight                     float64
AgeMonths                  float64
(Remark, 113\n589)            bool
(Remark, 338\n588)            bool
(Remark, 338\n589)            bool
(Remark, 339\n588)            bool
(Remark, 339\n589)            bool
(Remark, 339\n589\n703)       bool
(Remark, 339\n590)            bool
(Remark, 340\n589)            bool
(Remark, 588)                 bool
(Remark, 589)                 bool
(Remark, 590)                 bool
(Remark, 77\n589)             bool
(Class, 4)                    bool
(Class, 5)                    bool
(Class, 6)                    bool
(Class, 7)                    bool
(Class, 8)                    bool
(Class, 9)                    bool
(Class, 10)                   bool
(Class, 11)                   bool
(Fat, 3)                      bool
(Fat, 4)                      bool
(Fat, 5)                      bool
(Fat, 6)                      bool
(Fat, 7)                      bool
(Fat, 8)                      bool
(Fat, 9)            

In [6]:
# Converting the dataset into matrix
X = X.to_numpy()
X

array([[0.9988647294655031, 0.04763666896212888, False, ..., False, True,
        False],
       [0.9955997501374558, 0.0937077239411758, False, ..., False, False,
        False],
       [0.9969729329127248, 0.07774941182671039, False, ..., False,
        False, False],
       ...,
       [0.998369129364278, 0.057088365998804694, False, ..., False,
        False, True],
       [0.9994320773914238, 0.03369751743174582, False, ..., False, True,
        False],
       [0.9947449602428345, 0.10238390533419689, False, ..., False,
        False, False]], dtype=object)

In [7]:
y = carcass_df['Group'].to_numpy()
y

array([0, 4, 6, ..., 8, 7, 2], dtype=int64)

In [8]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2)

Using rules of thumb for determining the number of neurons for the hidden layer: 

Somewhere in between the size of the input layer and the output layer
- Using this the conclusion would be between 11-36,

The mean of neurons in the input and output layer.
- The mean is 11+36/2 = 23.5  neurons 

Less than twice the size of the input layer.
- Less than 72, meaning 1-71

2/3 the size of the input layer, plus the size of the output layer.
- Approximately 35

Using these rules of thumb the optimal number of neurons to use in the hidden layer would be between 1 and 71, but most likely somewhere between 20-71. The exact number is then determined through a gridsearch

In [9]:
H_param = {
    'hidden_layer_sizes': [*range(20, 71, 1)],
}
print(H_param)

{'hidden_layer_sizes': [20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70]}


In [10]:
model = MLPClassifier(max_iter = 10000)

In [11]:
optimal_param = GridSearchCV(model, H_param)

In [12]:
optimal_param.fit(X_train, y_train)

In [13]:
optimal_param.best_params_

{'hidden_layer_sizes': 65}

Using grid search we found that the optimum number of neurons with the standard parameters is 65.

We then continued with grid-searches to determine which other parameters were suitable for the model (the parameters used were chosen from what the previous grid search said and what was suitable to test from the documentation, https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPClassifier.html):

Why we are splitting up the grid search is because of bad computers, which tend to crash. Which lost us about a week of progress after crashing when running all parameters at once. 

In [14]:
H_param_2 = {
    'hidden_layer_sizes': [*range(60, 70, 1)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['adaptive', 'constant', 'invscaling'],
    'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]
}
print(H_param_2)

{'hidden_layer_sizes': [60, 61, 62, 63, 64, 65, 66, 67, 68, 69], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['sgd', 'adam'], 'learning_rate': ['adaptive', 'constant', 'invscaling'], 'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]}


In [15]:
model_2 = MLPClassifier(max_iter = 10000)

In [16]:
optimal_param_2 = GridSearchCV(model_2, H_param_2)

In [17]:
optimal_param_2.fit(X_train, y_train)

In [18]:
optimal_param_2.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': 69,
 'learning_rate': 'constant',
 'learning_rate_init': 0.001,
 'solver': 'adam'}

In [20]:
H_param_3 = {
    'hidden_layer_sizes': [*range(69, 75, 1)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['adaptive', 'constant', 'invscaling'],
    'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]
}
print(H_param_3)

{'hidden_layer_sizes': [69, 70, 71, 72, 73, 74], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['sgd', 'adam'], 'learning_rate': ['adaptive', 'constant', 'invscaling'], 'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]}


In [21]:
model_3 = MLPClassifier(max_iter = 10000)

In [22]:
optimal_param_3 = GridSearchCV(model_3, H_param_3)

In [23]:
optimal_param_3.fit(X_train, y_train)

In [24]:
optimal_param_3.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': 74,
 'learning_rate': 'adaptive',
 'learning_rate_init': 0.001,
 'solver': 'adam'}

In [26]:
H_param_4 = {
    'hidden_layer_sizes': [*range(74, 85, 1)],
    'activation': ['tanh'],
    'solver': ['adam'],
    'learning_rate': ['adaptive', 'constant'],
    'learning_rate_init': [0.001, 0.0005]
}
print(H_param_4)

{'hidden_layer_sizes': [74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84], 'activation': ['tanh'], 'solver': ['adam'], 'learning_rate': ['adaptive', 'constant'], 'learning_rate_init': [0.001, 0.0005]}


In [27]:
model_4 = MLPClassifier(max_iter = 10000)

In [28]:
optimal_param_4 = GridSearchCV(model_4, H_param_4)

In [29]:
optimal_param_4.fit(X_train, y_train)

In [30]:
optimal_param_4.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': 82,
 'learning_rate': 'adaptive',
 'learning_rate_init': 0.001,
 'solver': 'adam'}

In [31]:
model_optimal = MLPClassifier(hidden_layer_sizes = (82), 
                              activation = 'tanh',
                              solver = 'adam', #defult
                              learning_rate = 'adaptive', 
                              learning_rate_init = 0.001, #defult
                              max_iter = 10000)
print(model_optimal)

MLPClassifier(activation='tanh', hidden_layer_sizes=82,
              learning_rate='adaptive', max_iter=10000)


In [34]:
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cross_score = cross_validate(model_optimal,
                    X, y, 
                    scoring=scoring, 
                    cv=10)
scores_df = pd.DataFrame.from_dict(cross_score)
print(model_optimal)
display(scores_df)
scores_df.describe()

MLPClassifier(activation='tanh', hidden_layer_sizes=82,
              learning_rate='adaptive', max_iter=10000)


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,134.146452,0.023036,0.960296,0.960881,0.960303,0.960247
1,108.681172,0.025089,0.967178,0.967584,0.967268,0.967295
2,102.967775,0.023547,0.955532,0.955685,0.955588,0.955527
3,97.560298,0.033123,0.96612,0.966867,0.966178,0.966233
4,116.490302,0.023549,0.95765,0.95803,0.957824,0.957681
5,127.546346,0.024059,0.95286,0.953272,0.952921,0.952851
6,112.035768,0.023006,0.957627,0.958077,0.957791,0.957695
7,99.553696,0.022006,0.954979,0.955628,0.954946,0.955011
8,105.733011,0.019998,0.959746,0.960305,0.959667,0.959766
9,66.755984,0.018003,0.957627,0.957893,0.95772,0.95763


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,107.14708,0.023542,0.958962,0.959422,0.959021,0.958994
std,18.436429,0.003952,0.00461,0.004681,0.004613,0.004647
min,66.755984,0.018003,0.95286,0.953272,0.952921,0.952851
25%,100.407216,0.022256,0.956056,0.956237,0.956121,0.956053
50%,107.207091,0.023292,0.957638,0.958054,0.957808,0.957688
75%,115.376669,0.023932,0.960159,0.960737,0.960144,0.960127
max,134.146452,0.033123,0.967178,0.967584,0.967268,0.967295


In [35]:
H_param_5 = {
    'hidden_layer_sizes': [(65)],
    'activation': ['logistic', 'tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'learning_rate': ['adaptive', 'constant', 'invscaling'],
    'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]
}
print(H_param_5)

{'hidden_layer_sizes': [65], 'activation': ['logistic', 'tanh', 'relu'], 'solver': ['sgd', 'adam'], 'learning_rate': ['adaptive', 'constant', 'invscaling'], 'learning_rate_init': [0.1, 0.05, 0.01, 0.005, 0.001, 0.0005]}


In [36]:
model_5 = MLPClassifier(max_iter = 10000)

In [37]:
optimal_param_5 = GridSearchCV(model_5, H_param_5)

In [38]:
optimal_param_5.fit(X_train, y_train)

In [39]:
optimal_param_5.best_params_

{'activation': 'tanh',
 'hidden_layer_sizes': 65,
 'learning_rate': 'adaptive',
 'learning_rate_init': 0.0005,
 'solver': 'adam'}

In [42]:
model_optimal_5 = MLPClassifier(hidden_layer_sizes = (65), 
                              activation = 'tanh',
                              solver = 'adam', #defult
                              learning_rate = 'adaptive', 
                              learning_rate_init = 0.0005,
                              max_iter = 10000)

In [43]:
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
cross_score_5 = cross_validate(model_optimal_5,
                    X, y, 
                    scoring=scoring, 
                    cv=10)
scores_df_5 = pd.DataFrame.from_dict(cross_score_5)
print(model_optimal_5)
display(scores_df_5)
scores_df_5.describe()

MLPClassifier(activation='tanh', hidden_layer_sizes=65,
              learning_rate='adaptive', learning_rate_init=0.0005,
              max_iter=10000)


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,116.352921,0.014043,0.955532,0.956033,0.955639,0.955517
1,109.86065,0.016001,0.959767,0.960275,0.95982,0.959802
2,131.765019,0.015015,0.961355,0.961758,0.961373,0.961482
3,112.271793,0.015,0.962414,0.963189,0.962492,0.962628
4,115.831649,0.018,0.956061,0.956488,0.956291,0.95616
5,107.879806,0.013005,0.949153,0.949705,0.949191,0.949315
6,113.029346,0.014006,0.956568,0.957057,0.956738,0.956635
7,134.645397,0.012633,0.959216,0.960176,0.959276,0.95933
8,99.739345,0.013042,0.954449,0.955081,0.954435,0.954494
9,112.34152,0.01803,0.962394,0.962878,0.962522,0.962505


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,115.371745,0.014878,0.957691,0.958264,0.957778,0.957787
std,10.52267,0.001954,0.00417,0.004196,0.004175,0.004184
min,99.739345,0.012633,0.949153,0.949705,0.949191,0.949315
25%,110.463436,0.013283,0.955664,0.956147,0.955802,0.955678
50%,112.685433,0.014521,0.957892,0.958616,0.958007,0.957983
75%,116.222603,0.015754,0.960958,0.961387,0.960984,0.961062
max,134.645397,0.01803,0.962414,0.963189,0.962522,0.962628


In [53]:
print('Neurons 82:')
print(scores_df.describe())
print('Neurons 65:')
print(scores_df_5.describe())

Neurons 82:
         fit_time  score_time  test_accuracy  test_precision_macro  \
count   10.000000   10.000000      10.000000             10.000000   
mean   107.147080    0.023542       0.958962              0.959422   
std     18.436429    0.003952       0.004610              0.004681   
min     66.755984    0.018003       0.952860              0.953272   
25%    100.407216    0.022256       0.956056              0.956237   
50%    107.207091    0.023292       0.957638              0.958054   
75%    115.376669    0.023932       0.960159              0.960737   
max    134.146452    0.033123       0.967178              0.967584   

       test_recall_macro  test_f1_macro  
count          10.000000      10.000000  
mean            0.959021       0.958994  
std             0.004613       0.004647  
min             0.952921       0.952851  
25%             0.956121       0.956053  
50%             0.957808       0.957688  
75%             0.960144       0.960127  
max             0.967

In [56]:
0.958962 - 0.957691 # comparing test_accuracy

0.0012710000000000221

Comparing the test_accuracy between using 65 or 82 neurons shows that 82 is 0.0013 units or 0.13% better. 
Which I don't see as worth it considering the risk of overfitting the models.
This is also why I have chosen to go with these parameters for my final model:

hidden_layer_sizes = (65), 

activation = 'tanh',

solver = 'adam', #defult

learning_rate = 'adaptive', 

learning_rate_init = 0.0005,

max_iter = 10000