In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from tqdm import tqdm

In [2]:
# Load the data set used throughout the notebook
carcass_df = pd.read_csv('UnderSampledDataSet.csv')

# P: size of the population (number of rows)
# C: number of distinct classes in the 'Group' target column
P = carcass_df.shape[0]
C = carcass_df['Group'].nunique()

print('number of clusters:', C)
print('size of pop:', P)

number of clusters: 11
size of pop: 47212


In [3]:
# Formulas from Performance metrics in report
#
# Expected one-vs-rest confusion counts for a single class when a classifier
# guesses uniformly at random among C classes in a population of size P.
# The formulas treat P/C samples as belonging to the class (i.e. they assume
# equally sized classes) and 1/C as the chance of predicting the class.
TP = P/C * 1/C                  # belongs to the class, predicted as the class
FN = P/C * (1 - 1/C)            # belongs to the class, predicted as another class
FP = (P - P/C) * 1/C            # belongs to another class, predicted as the class
TN = (P - P/C) * (1 - 1/C)      # belongs to another class, predicted as another class
print('TP are:', TP)
print('FP are:', FP)
print('FN are:', FN)
print('TN are:', TN)
# Sanity check: the four cells of the confusion matrix must add up to P
print('Together they are the population:', TP + FP + FN + TN)

TP are: 390.1818181818182
FP are: 3901.8181818181815
FN are: 3901.818181818182
TN are: 39018.181818181816
Together they are the population: 47212.0


In [4]:
# Average accuracy: correct decisions (TP and TN) as a share of all decisions
correct_decisions = TP + TN
all_decisions = TP + FN + FP + TN
average_accuracy = correct_decisions / all_decisions
print(average_accuracy)

0.8347107438016528


In [5]:
# Precision: share of the positive predictions that are correct
predicted_positive = TP + FP
Precision = TP / predicted_positive
print(Precision)

0.09090909090909091


In [6]:
# Recall: share of the actual positives that are found
actual_positive = TP + FN
Recall = TP / actual_positive
print(Recall)

0.09090909090909091


In [7]:
# F1 score: harmonic mean of precision and recall
pr_product = Recall * Precision
pr_sum = Recall + Precision
f1_score = 2 * (pr_product / pr_sum)
print(f1_score)

0.09090909090909091


In [8]:
# Removing data not used in training models.
# drop() already returns a new frame, so carcass_df itself is left untouched.
X = carcass_df.drop(columns=['Group',
                             'Killnumber',
                             'KillDate',
                             'WeightWarm',
                             'ClassificationTime',
                             'PartCassation'])

# Normalizing the numerical data: scale each feature (column) to [0, 1].
# preprocessing.normalize() is not used here because, by default, it rescales
# every *row* to unit norm, which ties Weight and AgeMonths together
# (Weight**2 + AgeMonths**2 == 1 for every sample) instead of scaling the
# features independently.
numeric_columns = ['Weight', 'AgeMonths']
X[numeric_columns] = preprocessing.MinMaxScaler().fit_transform(X[numeric_columns])
X.dtypes

AnimalType      int64
Class           int64
Fat             int64
Weight        float64
AgeMonths     float64
Remark         object
dtype: object

In [9]:
# One Hot Encoding for categorical data

def add_top_dummies(frame, column, top_n=None):
    """Replace ``column`` with binary indicator columns for its most frequent values.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame containing ``column``; it is not modified.
    column : str
        Name of the categorical column to encode.
    top_n : int or None
        Number of most frequent categories to keep. ``None`` keeps every
        category. If the column has fewer than ``top_n`` distinct values,
        all of them are kept.

    Returns
    -------
    pd.DataFrame
        New frame without ``column`` and with one indicator column per kept
        category, labelled ``(column, category)`` and appended in the order
        ``pd.get_dummies`` produces them.
    """
    counts = frame[column].value_counts()   # most frequent category first
    kept = set(counts.index[:top_n])        # slicing copes with None and short indexes
    dummies = pd.get_dummies(frame[column])
    encoded = frame.drop(columns=[column])
    for category in dummies.columns:
        if category in kept:
            encoded[column, category] = dummies[category]
    return encoded


# Number of most frequent categories kept as binary features
nr_of_remarks = 12
nr_of_classes = 8
nr_of_fats = 8

X = add_top_dummies(X, 'Remark', nr_of_remarks)
X = add_top_dummies(X, 'Class', nr_of_classes)
X = add_top_dummies(X, 'Fat', nr_of_fats)
# Every AnimalType category is added as a binary feature
X = add_top_dummies(X, 'AnimalType')
X.dtypes

Weight                float64
AgeMonths             float64
(Remark, 113\n589)       bool
(Remark, 338\n588)       bool
(Remark, 338\n589)       bool
(Remark, 339\n588)       bool
(Remark, 339\n589)       bool
(Remark, 339\n590)       bool
(Remark, 340\n589)       bool
(Remark, 588)            bool
(Remark, 589)            bool
(Remark, 590)            bool
(Remark, 77\n589)        bool
(Remark, 77\n590)        bool
(Class, 4)               bool
(Class, 5)               bool
(Class, 6)               bool
(Class, 7)               bool
(Class, 8)               bool
(Class, 9)               bool
(Class, 10)              bool
(Class, 11)              bool
(Fat, 3)                 bool
(Fat, 4)                 bool
(Fat, 5)                 bool
(Fat, 6)                 bool
(Fat, 7)                 bool
(Fat, 8)                 bool
(Fat, 9)                 bool
(Fat, 10)                bool
(AnimalType, 212)        bool
(AnimalType, 216)        bool
(AnimalType, 218)        bool
(AnimalTyp

In [10]:
# Convert features and target to NumPy arrays.
# X mixes float64 and bool columns; a plain to_numpy() on such a frame gives
# an object array, so float64 is requested explicitly (True/False -> 1.0/0.0).
# np.asarray also accepts an ndarray, so re-running this cell does not fail.
X = np.asarray(X, dtype=np.float64)
y = carcass_df['Group'].to_numpy()

In [11]:
# Baseline: a classifier that guesses uniformly at random among the classes.
# The fixed seed makes its guesses reproducible.
baseline_params = {'strategy': 'uniform', 'random_state': 42}
dummy_clf = DummyClassifier(**baseline_params)

In [12]:
# K-fold settings for the cross validation: 10 shuffled folds with a fixed seed
kf = KFold(shuffle=True, n_splits=10, random_state=42)
print(f"kfold parametrar: {kf}")

kfold parametrar: KFold(n_splits=10, random_state=42, shuffle=True)


In [None]:
# Using cross validation to train and test models on a set of performance measurements
scoring = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
nr_of_iterations = range(10)

# The cross validation uses k-fold with 10 splits and is repeated 10 times,
# resulting in 100 models being trained and evaluated.
# Every repetition gets its own shuffle seed: `kf` has a fixed integer
# random_state, so passing it unchanged would produce the same folds (and
# therefore the same scores) in all 10 repetitions.
fold_scores = []
for i in tqdm(nr_of_iterations, total=len(nr_of_iterations), ncols=100,
              desc="Cross validating the model"):
    repeat_kf = KFold(n_splits=kf.n_splits, shuffle=True,
                      random_state=kf.random_state + i)
    cross_score = cross_validate(dummy_clf,
                                 X, y,
                                 scoring=scoring,
                                 cv=repeat_kf)
    fold_scores.append(pd.DataFrame.from_dict(cross_score))

# Build the result frame once instead of re-concatenating it in every iteration
result = pd.concat(fold_scores, ignore_index=True)
display(result)

Cross validating the model:  60%|█████████████████████▌              | 6/10 [00:01<00:00,  5.23it/s]

In [None]:
# Summary statistics of the collected cross-validation scores
result.describe()

In [None]:
# Save the cross-validation scores to a CSV file, leaving out the row index
result.to_csv('DummyTestResults.csv', index=False)

My estimated average accuracy of 83.5% is not the same as the dummy classifier's 9.2%. 
I have tried to find out why, and it seems that sklearn uses a different formula for its [accuracy](https://scikit-learn.org/stable/modules/model_evaluation.html#accuracy-score). 

Calculating the accuracy using this formula gives a result similar to the one sklearn's dummy classifier gets.

Their formula can be divided into two parts: 
* The first part is 1 divided by the population: ``1/population`` 
* The second part is the number of times the classifier predicts something correctly, calculated by summing an indicator function which returns 1 if a sample is classified correctly and 0 if it is classified wrongly. Because there are 11 clusters that are evenly distributed, a random guess should on average be correct every 11th time, so over the entire population the indicator function returns 1 about ``(1/number of clusters) * population size`` times, which can be shortened to ``population/number of clusters`` 

Multiplying these two parts then gives the accuracy: 

In [None]:
# Accuracy the way sklearn computes it: (1 / population) times the expected
# number of correct predictions (population / number of clusters)
inverse_population = 1 / P
expected_correct = P / C
sklearn_accuracy = inverse_population * expected_correct
print(sklearn_accuracy)