In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.metrics import recall_score
from collections import defaultdict
from tqdm import tqdm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets
import sklearn.model_selection as model_selection
from sklearn.metrics import f1_score
import random
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix 

In [2]:
# Fetching data
carcass_df = pd.read_csv('UnderSampledDataSet.csv')

In [3]:
# Removing data not used in training models 
X = carcass_df.copy()
X = X.drop(['Group', 
            'Killnumber', 
            'KillDate', 
            'WeightWarm', 
            'ClassificationTime', 
            'PartCassation'], axis=1)

In [4]:
# Normalizing the numerical data 
X[['Weight', 'AgeMonths']] = preprocessing.normalize(
    X[['Weight', 'AgeMonths']])
X.dtypes

AnimalType      int64
Class           int64
Fat             int64
Weight        float64
AgeMonths     float64
Remark         object
dtype: object

In [5]:
# One Hot Encoding for categorical data

# Using One Hot Encoding to add the top ten remarks as binary features 
remark_sorted = X['Remark'].value_counts()
top_remarks = []
nr_of_remarks = 12
for key in range(nr_of_remarks):
    top_remarks.append(remark_sorted.keys()[key])
    
df_encoded = pd.get_dummies(X['Remark'], columns=['Remark'])
for col in df_encoded.columns:
    if col in top_remarks:
        X['Remark',col] = df_encoded[col]

X = X.drop(['Remark'], axis=1)

# Using One Hot Encoding to add the top ten Classes as binary features 
class_sorted = X['Class'].value_counts()
top_class = []
nr_of_classes = 8
for key in range(nr_of_classes):
    top_class.append(class_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Class'], columns=['Class'])
for col in df_encoded.columns:
    if col in top_class:
        X['Class',col] = df_encoded[col]

X = X.drop(['Class'], axis=1)

# Using One Hot Encoding to add the top ten Fats as binary features 
fat_sorted = X['Fat'].value_counts()
top_fat = []
nr_of_fats = 8
for key in range(nr_of_fats):
    top_fat.append(fat_sorted.keys()[key])
        
df_encoded = pd.get_dummies(X['Fat'], columns=['Fat'])
for col in df_encoded.columns:
    if col in top_fat:
        X['Fat',col] = df_encoded[col]

X = X.drop(['Fat'], axis=1)

# Using One Hot Encoding to add AnimalType as binary features 
df_encoded = pd.get_dummies(X['AnimalType'], columns=['AnimalType'])
for col in df_encoded.columns:
    X['AnimalType',col] = df_encoded[col]

X = X.drop(['AnimalType'], axis=1)
X.dtypes

Weight                float64
AgeMonths             float64
(Remark, 113\n589)       bool
(Remark, 338\n588)       bool
(Remark, 338\n589)       bool
(Remark, 339\n588)       bool
(Remark, 339\n589)       bool
(Remark, 339\n590)       bool
(Remark, 340\n589)       bool
(Remark, 588)            bool
(Remark, 589)            bool
(Remark, 590)            bool
(Remark, 77\n589)        bool
(Remark, 77\n590)        bool
(Class, 4)               bool
(Class, 5)               bool
(Class, 6)               bool
(Class, 7)               bool
(Class, 8)               bool
(Class, 9)               bool
(Class, 10)              bool
(Class, 11)              bool
(Fat, 3)                 bool
(Fat, 4)                 bool
(Fat, 5)                 bool
(Fat, 6)                 bool
(Fat, 7)                 bool
(Fat, 8)                 bool
(Fat, 9)                 bool
(Fat, 10)                bool
(AnimalType, 212)        bool
(AnimalType, 216)        bool
(AnimalType, 218)        bool
(AnimalTyp

In [6]:
X = X.to_numpy()
y = carcass_df['Group'].to_numpy()

In [7]:
# Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,random_state=101,shuffle=True)

In [8]:
# Setting the hyperparamters to be used in the search
grid = {
    'C':[0.01,0.1,1,10,100,1000],
    'kernel' : ["poly"],
    'degree' : [1,2,3,4,5,6,7,8,9,10],
    'gamma' : [1, 0.1, 0.01, 0.001, 0.0001]
}
#Setting model
svm  = SVC ()
svm_cv = GridSearchCV(svm, grid, cv = 5, verbose = 3)
#Fitting model
svm_cv.fit(X_train,y_train)
#Printing scored
print("Best Parameters:",svm_cv.best_params_)
print("Train Score:",svm_cv.best_score_)
print("Test Score:",svm_cv.score(X_test,y_test))
#Printing classification report
grid_predictions = svm_cv.predict(X_test) 
print(classification_report(y_test, grid_predictions)) 

Fitting 5 folds for each of 300 candidates, totalling 1500 fits
[CV 1/5] END C=0.01, degree=1, gamma=1, kernel=poly;, score=0.781 total time=  20.0s
[CV 2/5] END C=0.01, degree=1, gamma=1, kernel=poly;, score=0.786 total time=  23.4s
[CV 3/5] END C=0.01, degree=1, gamma=1, kernel=poly;, score=0.772 total time=  27.4s
[CV 4/5] END C=0.01, degree=1, gamma=1, kernel=poly;, score=0.783 total time=  21.4s
[CV 5/5] END C=0.01, degree=1, gamma=1, kernel=poly;, score=0.788 total time=  23.2s
[CV 1/5] END C=0.01, degree=1, gamma=0.1, kernel=poly;, score=0.714 total time=  53.4s
[CV 2/5] END C=0.01, degree=1, gamma=0.1, kernel=poly;, score=0.720 total time=  50.8s
[CV 3/5] END C=0.01, degree=1, gamma=0.1, kernel=poly;, score=0.719 total time=  50.3s
[CV 4/5] END C=0.01, degree=1, gamma=0.1, kernel=poly;, score=0.726 total time=  53.0s
[CV 5/5] END C=0.01, degree=1, gamma=0.1, kernel=poly;, score=0.722 total time=  57.8s
[CV 1/5] END C=0.01, degree=1, gamma=0.01, kernel=poly;, score=0.092 total t