In [1]:
import pandas as pd
import numpy as np
import functools as ft
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn import preprocessing
from numpy import array

In [2]:
# Fetching data
carcass_df = pd.read_csv('UnderSampledDataSet.csv')

# Removing data not used in training models 
X = carcass_df.copy()
X = X.drop(['Group', 
            'Killnumber', 
            'KillDate', 
            'WeightWarm', 
            'ClassificationTime', 
            'PartCassation'], axis=1)

# Normalizing the numerical data 
X[['Weight', 'AgeMonths']] = preprocessing.normalize(
    X[['Weight', 'AgeMonths']])
X.dtypes

AnimalType      int64
Class           int64
Fat             int64
Weight        float64
AgeMonths     float64
Remark         object
dtype: object

In [3]:
print(len(X['Remark'].unique()))
sumRemarkTop5Count = sum(X['Remark'].value_counts().head(5))
sumRemarkCount = sum(X['Remark'].value_counts())
print(sumRemarkTop5Count)
print(sumRemarkCount)
print(sumRemarkTop5Count/sumRemarkCount)
print(X['Remark'].value_counts().head(10))
print(X['Class'].value_counts().head(10))
print(X['Fat'].value_counts().head(10))

3002
18809
47212
0.3983944759806829
Remark
589         8037
590         4367
588         3830
338\n589    1328
339\n589    1247
113\n589     837
77\n589      803
340\n589     777
339\n588     733
338\n588     710
Name: count, dtype: int64
Class
7     8083
6     7465
8     6778
5     6681
9     5374
4     3941
10    3923
11    2406
3     1031
12     913
Name: count, dtype: int64
Fat
6     12538
5      9695
7      9330
4      4566
8      4119
9      2736
3      2074
10      926
2       755
11      300
Name: count, dtype: int64


In [4]:
# One Hot Encoding for categorical data

# Using One Hot Encoding to add the top ten remarks as binary features 
remark_sorted = X['Remark'].value_counts()
top_remarks = []
nr_of_remarks = 12
for key in range(nr_of_remarks):
    top_remarks.append(remark_sorted.keys()[key])
    
df_encoded = pd.get_dummies(X['Remark'], columns=['Remark'])
for col in df_encoded.columns:
    if col in top_remarks:
        X['Remark',col] = df_encoded[col]

X = X.drop(['Remark'], axis=1)

# Using One Hot Encoding to add the top ten Classes as binary features 
class_sorted = X['Class'].value_counts()
top_class = []
nr_of_classes = 8
for key in range(nr_of_classes):
    top_class.append(class_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Class'], columns=['Class'])
for col in df_encoded.columns:
    if col in top_class:
        X['Class',col] = df_encoded[col]

X = X.drop(['Class'], axis=1)

# Using One Hot Encoding to add the top ten Fats as binary features 
fat_sorted = X['Fat'].value_counts()
top_fat = []
nr_of_fats = 8
for key in range(nr_of_fats):
    top_fat.append(fat_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Fat'], columns=['Fat'])
for col in df_encoded.columns:
    if col in top_fat:
        X['Fat',col] = df_encoded[col]

X = X.drop(['Fat'], axis=1)

# Using One Hot Encoding to add AnimalType as binary features 
df_encoded = pd.get_dummies(X['AnimalType'], columns=['AnimalType'])
for col in df_encoded.columns:
    X['AnimalType',col] = df_encoded[col]

X = X.drop(['AnimalType'], axis=1)
print(X.dtypes)
print(X.dtypes.value_counts())

Weight                float64
AgeMonths             float64
(Remark, 113\n589)       bool
(Remark, 338\n588)       bool
(Remark, 338\n589)       bool
(Remark, 339\n588)       bool
(Remark, 339\n589)       bool
(Remark, 339\n590)       bool
(Remark, 340\n589)       bool
(Remark, 588)            bool
(Remark, 589)            bool
(Remark, 590)            bool
(Remark, 77\n589)        bool
(Remark, 77\n590)        bool
(Class, 4)               bool
(Class, 5)               bool
(Class, 6)               bool
(Class, 7)               bool
(Class, 8)               bool
(Class, 9)               bool
(Class, 10)              bool
(Class, 11)              bool
(Fat, 3)                 bool
(Fat, 4)                 bool
(Fat, 5)                 bool
(Fat, 6)                 bool
(Fat, 7)                 bool
(Fat, 8)                 bool
(Fat, 9)                 bool
(Fat, 10)                bool
(AnimalType, 212)        bool
(AnimalType, 216)        bool
(AnimalType, 218)        bool
(AnimalTyp

In [5]:
# Converting the dataset into matrix
X = X.to_numpy()
X

array([[0.9980564846305773, 0.06231575632899144, False, ..., False, True,
        False],
       [0.9975599617888159, 0.06981491700128503, False, ..., False,
        False, False],
       [0.9978052958341178, 0.0662162488018512, False, ..., False, True,
        False],
       ...,
       [0.9980059138553454, 0.06312048724270698, False, ..., False,
        False, True],
       [0.9987950661447739, 0.049075613545393554, False, ..., False,
        True, False],
       [0.9990400346134506, 0.04380649768648041, False, ..., False, True,
        False]], dtype=object)

In [6]:
y = carcass_df['Group'].to_numpy()
y

array([ 0,  0,  0, ..., 10, 10, 10], dtype=int64)

In [7]:
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size = 0.2)

In [8]:
model_default = MLPClassifier(max_iter = 1000)

In [9]:
model_default.fit(X_train, y_train)

In [10]:
model_default.n_layers_

3

In [11]:
predictions_default = model_default.predict(X_test)

In [12]:
accuracy_score(y_test, predictions_default)

0.9706661018744043

In [13]:
pd.crosstab(y_test, predictions_default)

col_0,0,1,2,3,4,5,6,7,8,9,10
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,813,0,0,14,0,2,5,0,8,7,9
1,0,892,7,0,0,0,0,3,4,1,17
2,0,8,817,0,0,3,0,1,7,7,7
3,2,0,1,785,6,1,12,2,0,1,1
4,0,0,0,0,833,2,5,0,1,0,0
5,0,0,0,1,8,899,2,0,0,0,0
6,2,0,0,5,6,2,807,0,0,0,1
7,1,6,1,1,1,2,1,870,1,1,9
8,1,5,3,0,2,0,0,0,804,3,10
9,14,2,4,2,0,0,0,1,4,805,12


In [14]:
print (classification_report(y_test, predictions_default))

              precision    recall  f1-score   support

           0       0.97      0.95      0.96       858
           1       0.98      0.97      0.97       924
           2       0.98      0.96      0.97       850
           3       0.97      0.97      0.97       811
           4       0.97      0.99      0.98       841
           5       0.99      0.99      0.99       910
           6       0.97      0.98      0.97       823
           7       0.99      0.97      0.98       894
           8       0.96      0.97      0.97       828
           9       0.97      0.95      0.96       844
          10       0.93      0.98      0.95       860

    accuracy                           0.97      9443
   macro avg       0.97      0.97      0.97      9443
weighted avg       0.97      0.97      0.97      9443

