In [0]:
# A machine learning model has been developed to predict if horses will live or die.r.
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
from sklearn.metrics import cohen_kappa_score

In [0]:
# Lendo a base de teste e de treino.
df_train = pd.read_csv('horse.csv')
df_test = pd.read_csv('horseTest.csv')

In [0]:
# As the model asks as output only live or die, I turned euthanasia into death.
outcome_1 = df_train['outcome'].replace(['euthanized'], 'died')
df_train['outcome'] = outcome_1
outcome_2 = df_test['outcome'].replace(['euthanized'], 'died')
df_test['outcome'] = outcome_2

In [0]:
# Doing an exploratory analysis of the data, I found that these attributes have a high number of missings. So I decided to exclude them from the model.
df_train = df_train.drop(['abdomo_protein','abdomo_appearance','nasogastric_reflux_ph'],axis=1)
df_test = df_test.drop(['abdomo_protein','abdomo_appearance','nasogastric_reflux_ph'],axis=1)

In [0]:
# Manipulating data.
h = list(df_train.columns) 

In [0]:
# Matching missing data. The strategy adopted was that of the most frequent value in the column.
imp = SimpleImputer(strategy="most_frequent")
df_train_imp = imp.fit_transform(df_train)
df_test_imp = imp.fit_transform(df_test)

In [0]:
# Manipulando dados.
df_train_imp = pd.DataFrame(df_train_imp)
df_test_imp = pd.DataFrame(df_test_imp)

In [0]:
# Manipulando dados.
df_test_imp.columns = h
df_train_imp.columns = h

In [0]:
# Separating the label attribute in the training and test base.
X_df_imp_train = df_train_imp.drop('outcome',axis=1)
y_df_imp_train = df_train_imp['outcome']
X_df_imp_test = df_test_imp.drop('outcome',axis=1)
y_df_imp_test = df_test_imp['outcome']

In [0]:
# Turning the categorical label into numeric.
y_df_imp_train = pd.get_dummies(y_df_imp_train)
y_df_imp_train = y_df_imp_train.drop('lived',axis = 1)
y_df_imp_test = pd.get_dummies(y_df_imp_test)
y_df_imp_test = y_df_imp_test.drop('lived',axis = 1)

In [0]:
# Turning categorical attributes into numeric.
X_df_imp_train_dummies = pd.get_dummies(data=X_df_imp_train, columns=['surgical_lesion','surgery', 'age','temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time','pain','peristalsis','abdominal_distention','nasogastric_tube','nasogastric_reflux','rectal_exam_feces','abdomen','cp_data'])
X_df_imp_test_dummies = pd.get_dummies(data=X_df_imp_test, columns=['surgical_lesion','surgery', 'age','temp_of_extremities','peripheral_pulse','mucous_membrane','capillary_refill_time','pain','peristalsis','abdominal_distention','nasogastric_tube','nasogastric_reflux','rectal_exam_feces','abdomen','cp_data'])

In [0]:
# Training the machine learning algorithm.
clf = RandomForestClassifier(max_depth=10, random_state=0,n_estimators=100)
clf.fit(X_df_imp_train_dummies,y_df_imp_train);

  


In [0]:
y_pred = clf.predict(X_df_imp_test_dummies)

In [0]:
confusion_matrix(y_df_imp_test,y_pred)

array([[52,  1],
       [ 0, 36]])

In [0]:
accuracy_score(y_df_imp_test,y_pred)

0.9887640449438202

In [0]:
cohen_kappa_score(y_df_imp_test,y_pred)

0.9767805896164884