In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from collections import defaultdict
from tqdm import tqdm

In [2]:
# Fetching data
carcass_df = pd.read_csv('UnderSampledDataSet.csv')

In [3]:
# Removing data not used in training models 
X = carcass_df.copy()
X = X.drop(['Group', 
            'Killnumber', 
            'KillDate', 
            'WeightWarm', 
            'ClassificationTime', 
            'PartCassation'], axis=1)

# Normalizing the numerical data 
X[['Weight', 'AgeMonths']] = preprocessing.normalize(
    X[['Weight', 'AgeMonths']])
X.dtypes

AnimalType      int64
Class           int64
Fat             int64
Weight        float64
AgeMonths     float64
Remark         object
dtype: object

In [4]:
# One Hot Encoding for categorical data

# Using One Hot Encoding to add the top ten remarks as binary features 
remark_sorted = X['Remark'].value_counts()
top_remarks = []
nr_of_remarks = 12
for key in range(nr_of_remarks):
    top_remarks.append(remark_sorted.keys()[key])
    
df_encoded = pd.get_dummies(X['Remark'], columns=['Remark'])
for col in df_encoded.columns:
    if col in top_remarks:
        X['Remark',col] = df_encoded[col]

X = X.drop(['Remark'], axis=1)

# Using One Hot Encoding to add the top ten Classes as binary features 
class_sorted = X['Class'].value_counts()
top_class = []
nr_of_classes = 8
for key in range(nr_of_classes):
    top_class.append(class_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Class'], columns=['Class'])
for col in df_encoded.columns:
    if col in top_class:
        X['Class',col] = df_encoded[col]

X = X.drop(['Class'], axis=1)

# Using One Hot Encoding to add the top ten Fats as binary features 
fat_sorted = X['Fat'].value_counts()
top_fat = []
nr_of_fats = 8
for key in range(nr_of_fats):
    top_fat.append(fat_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Fat'], columns=['Fat'])
for col in df_encoded.columns:
    if col in top_fat:
        X['Fat',col] = df_encoded[col]

X = X.drop(['Fat'], axis=1)

# Using One Hot Encoding to add AnimalType as binary features 
df_encoded = pd.get_dummies(X['AnimalType'], columns=['AnimalType'])
for col in df_encoded.columns:
    X['AnimalType',col] = df_encoded[col]

X = X.drop(['AnimalType'], axis=1)
X.dtypes

Weight                float64
AgeMonths             float64
(Remark, 113\n589)       bool
(Remark, 338\n588)       bool
(Remark, 338\n589)       bool
(Remark, 339\n588)       bool
(Remark, 339\n589)       bool
(Remark, 339\n590)       bool
(Remark, 340\n589)       bool
(Remark, 588)            bool
(Remark, 589)            bool
(Remark, 590)            bool
(Remark, 77\n589)        bool
(Remark, 77\n590)        bool
(Class, 4)               bool
(Class, 5)               bool
(Class, 6)               bool
(Class, 7)               bool
(Class, 8)               bool
(Class, 9)               bool
(Class, 10)              bool
(Class, 11)              bool
(Fat, 3)                 bool
(Fat, 4)                 bool
(Fat, 5)                 bool
(Fat, 6)                 bool
(Fat, 7)                 bool
(Fat, 8)                 bool
(Fat, 9)                 bool
(Fat, 10)                bool
(AnimalType, 212)        bool
(AnimalType, 216)        bool
(AnimalType, 218)        bool
(AnimalTyp

In [5]:
X = X.to_numpy()
y = carcass_df['Group'].to_numpy()

In [6]:
# Setting k-fold parameters used in the cross validation
kf = KFold(n_splits=10, random_state=42, shuffle=True)
print(f"kfold parametrar: {kf}")

kfold parametrar: KFold(n_splits=10, random_state=42, shuffle=True)


In [7]:
# Setting the parameters for the model, 
# these where derived from using rule of thumb and gridsearch 
model_ann = MLPClassifier(hidden_layer_sizes = (65), 
                              activation = 'tanh',
                              solver = 'adam', #defult
                              learning_rate_init = 0.0005,
                              max_iter = 10000)

In [8]:
# Using cross validation to train and test models on set of performance mesurements 
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
result_ann = pd.DataFrame()
nr_of_iterations = range(10)
# The cross validation is set to using k-fold and a 1 to 10 split,
# which is then repeated 10 times resulting in 100 models beeing trained and evaluated.
for i in tqdm(nr_of_iterations, total=len(nr_of_iterations), ncols = 100, 
              desc ="Cross validating the model"):
    cross_score = cross_validate(model_ann,
                        X, y, 
                        scoring=scoring, 
                        cv=kf)
    scores_df = pd.DataFrame.from_dict(cross_score)
    frames = [result_ann, scores_df]
    result_ann = pd.concat(frames)
display(result_ann)

Cross validating the model: 100%|███████████████████████████████| 10/10 [5:41:47<00:00, 2050.78s/it]


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,215.905516,0.037150,0.971199,0.971397,0.971114,0.971115
1,242.096690,0.027196,0.968657,0.968536,0.968712,0.968476
2,217.867082,0.025042,0.977759,0.977689,0.977630,0.977628
3,204.602905,0.031605,0.976700,0.976762,0.976666,0.976618
4,233.168952,0.024975,0.979030,0.979174,0.979198,0.979125
...,...,...,...,...,...,...
5,178.971858,0.023999,0.977123,0.977583,0.977272,0.977361
6,206.566121,0.028998,0.971616,0.971646,0.971956,0.971743
7,182.737361,0.028549,0.973523,0.973490,0.973607,0.973413
8,217.842721,0.023031,0.978394,0.978661,0.978308,0.978417


In [9]:
result_ann.describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,205.046973,0.027055,0.975131,0.975204,0.975139,0.975095
std,19.394421,0.002587,0.003191,0.003295,0.003191,0.003259
min,135.250707,0.022957,0.966963,0.966764,0.96713,0.966781
25%,192.600247,0.024983,0.973152,0.973039,0.972947,0.972933
50%,205.079201,0.026992,0.975114,0.975245,0.975051,0.975084
75%,216.602284,0.029,0.9776,0.977818,0.977633,0.977608
max,255.587891,0.03715,0.981572,0.981696,0.981835,0.981737


In [10]:
result_ann.to_csv('ANNTestResults.csv', index=False)

In [12]:
from sklearn.svm import SVC
# Best Parameters: {'C': 10, 'degree': 7, 'gamma': 1, 'kernel': 'poly'}
# Best New Hyperparamters {'C': 1000, 'degree': 3, 'gamma': 1, 'kernel': 'poly'}

# Setting the parameters for the model, 
model_svc = SVC(kernel ='poly', C = 1000, degree = 3, gamma = 1)

In [13]:
# Using cross validation to train and test models on set of performance mesurements 
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
result_svc = pd.DataFrame()
nr_of_iterations = range(10)
# The cross validation is set to using k-fold and a 1 to 10 split,
# which is then repeated 10 times resulting in 100 models beeing trained and evaluated.
for i in tqdm(nr_of_iterations, total=len(nr_of_iterations), ncols = 100, desc ="Cross validating the model"):
# for i in nr_of_iterations:
    cross_score = cross_validate(model_svc,
                        X, y, 
                        scoring=scoring, 
                        cv=kf)
    scores_df = pd.DataFrame.from_dict(cross_score)
    frames = [result_svc, scores_df]
    result_svc = pd.concat(frames)
display(result_svc)

Cross validating the model: 100%|███████████████████████████████████| 10/10 [04:13<00:00, 25.38s/it]


Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
0,2.085543,0.560759,0.975858,0.975799,0.976093,0.975881
1,1.918244,0.586406,0.982211,0.982123,0.982135,0.982107
2,1.954855,0.564415,0.982419,0.982588,0.982547,0.982542
3,1.918971,0.562357,0.977971,0.978050,0.978092,0.978050
4,2.526785,0.634236,0.980936,0.981059,0.981001,0.980988
...,...,...,...,...,...,...
5,1.913908,0.561453,0.982419,0.982666,0.982163,0.982385
6,1.941688,0.609773,0.977547,0.977492,0.977732,0.977579
7,2.046996,0.558432,0.981995,0.981922,0.982028,0.981945
8,1.996761,0.563291,0.982207,0.982304,0.982175,0.982205


In [14]:
result_svc.describe()

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision_macro,test_recall_macro,test_f1_macro
count,100.0,100.0,100.0,100.0,100.0,100.0
mean,1.97157,0.562257,0.980365,0.980397,0.980387,0.980357
std,0.173835,0.036161,0.002298,0.002349,0.002205,0.002282
min,1.842425,0.527683,0.975858,0.975799,0.976093,0.975881
25%,1.885949,0.542104,0.977971,0.97805,0.978092,0.97805
50%,1.922948,0.554934,0.981466,0.981491,0.981514,0.981466
75%,1.997233,0.567327,0.982211,0.982304,0.982163,0.982205
max,3.117307,0.808861,0.982419,0.982666,0.982547,0.982542


In [15]:
result_svc.to_csv('SVMTestResults.csv', index=False)