In [1]:
# Initial imports
import pandas as pd
from path import Path
from sklearn import tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [10]:
# Read the prepared dataset from the Resources folder (path is relative to the
# notebook's working directory) and preview the first five rows.
file_path = Path("./Resources/data_prepared.csv")
df_milk = pd.read_csv(file_path)
df_milk.head(5)

Unnamed: 0,Daily_Production,Average_Daily_Production_Days,Average_Daily_Production_Percentage,Lactation_Number,Milk_Days,Days_After_Heat,Insemination,Days_After_Insemination,Pregnacy_Days,Number_of_Abortions,...,bulls_name_RHETORIC,bulls_name_SCORPIO,bulls_name_SCRAPPLE,bulls_name_SILVER INFOCUS,bulls_name_TORETTO,bulls_name_TORRO,bulls_name_UNCHARTED,bulls_name_VIEWPOINT,bulls_name_WALLEN,bulls_name_WILD THING
0,29.2,29.9,-2,4,222,139,2,139,139,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,34.6,34.4,1,4,210,132,2,132,132,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44.3,44.3,0,4,20,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,41.1,0.0,0,3,5,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,51.9,50.6,3,7,116,61,1,61,61,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# List every column label of the loaded DataFrame.
df_milk.columns


Index(['Daily_Production', 'Average_Daily_Production_Days',
       'Average_Daily_Production_Percentage', 'Lactation_Number', 'Milk_Days',
       'Days_After_Heat', 'Insemination', 'Days_After_Insemination',
       'Pregnacy_Days', 'Number_of_Abortions', 'Production_Peak',
       'Previous_Lactation_Production', 'Actual_Lactation_Production',
       'Daily_Production_bool', 'given_birthdate_days',
       'Gynecological_Status_Abortion', 'Gynecological_Status_Heat',
       'Gynecological_Status_Insemination', 'Gynecological_Status_Labor',
       'Gynecological_Status_No_Inseminate', 'Gynecological_Status_Pregnacy',
       'Inseminator_Name_CARLOS-JAIME', 'Inseminator_Name_CARLOS-JEHU',
       'Inseminator_Name_CARLOS-LUIS', 'Inseminator_Name_CARLOS-MANUEL',
       'Inseminator_Name_CARLOS-MAURICIO', 'Inseminator_Name_CARLOS-URIEL',
       'Inseminator_Name_DAVID-ADAN', 'Inseminator_Name_DAVID-JEHU',
       'Inseminator_Name_DAVID-MANUEL', 'Inseminator_Name_DAVID-URIEL',
       'Insemina

In [12]:
# Feature matrix: every column except the classification label
# ("Daily_Production_bool") and the raw "Daily_Production" column.
# NOTE(review): "Daily_Production" is presumably excluded because the boolean
# label is derived from it — confirm against the data-preparation step.
# DataFrame.drop returns a new frame, so df_milk itself is left untouched.
X = df_milk.drop(columns=["Daily_Production", "Daily_Production_bool"])
X.head()

Unnamed: 0,Average_Daily_Production_Days,Average_Daily_Production_Percentage,Lactation_Number,Milk_Days,Days_After_Heat,Insemination,Days_After_Insemination,Pregnacy_Days,Number_of_Abortions,Production_Peak,...,bulls_name_RHETORIC,bulls_name_SCORPIO,bulls_name_SCRAPPLE,bulls_name_SILVER INFOCUS,bulls_name_TORETTO,bulls_name_TORRO,bulls_name_UNCHARTED,bulls_name_VIEWPOINT,bulls_name_WALLEN,bulls_name_WILD THING
0,29.9,-2,4,222,139,2,139,139,0,50.3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,34.4,1,4,210,132,2,132,132,0,39.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,44.3,0,4,20,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0,3,5,0,0,0,0,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.6,3,7,116,61,1,61,61,0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Target vector: the "Daily_Production_bool" column as a NumPy array,
# followed by a peek at its first five entries.
y = df_milk["Daily_Production_bool"].to_numpy()
y[:5]

array([1., 1., 1., 1., 1.])

In [14]:
# Hold out 25% of the rows for testing (scikit-learn's default, written out
# here for clarity); the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=78,
)

In [15]:
# Report the dimensions of each split, one per line, in the order:
# training features, testing features, training labels, testing labels.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(3094, 97)
(1032, 97)
(3094,)
(1032,)


In [16]:
# Standardise the features. The scaler is fitted on the training split only,
# so the test split does not influence the learned mean and variance.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)  # fit() returns the fitted scaler itself

# Apply the same fitted transformation to both splits.
X_train_scaled, X_test_scaled = (
    X_scaler.transform(X_train),
    X_scaler.transform(X_test),
)

In [17]:
# Create the decision tree classifier.
# random_state is fixed because scikit-learn permutes the candidate features
# at every split; without a seed, ties between equally good splits can be
# broken differently on each run, so the fitted tree (and every metric below)
# would not be reproducible. The value matches the seed used for the
# train/test split.
model = tree.DecisionTreeClassifier(random_state=78)
# Fit the tree on the scaled training features and their labels.
model = model.fit(X_train_scaled, y_train)

In [18]:
# Predict a class label for every row of the scaled test features using the
# fitted decision tree.
predictions = model.predict(X_test_scaled)

In [19]:
# Inspect the array of predicted class labels.
predictions

array([1., 1., 1., ..., 1., 1., 1.])

In [20]:
# Confusion matrix of the true test labels against the model's predictions.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

# Wrap the matrix in a labelled DataFrame: rows are the actual classes,
# columns are the predicted classes.
cm_df = pd.DataFrame(
    data=cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,216,10
Actual 1,16,790


In [21]:
# Fraction of test rows whose predicted label equals the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=predictions)

In [22]:
# Show the accuracy score computed in the previous cell.
acc_score


0.9748062015503876

In [23]:
# Evaluation summary: the labelled confusion matrix first, then the overall
# accuracy, then the per-class precision / recall / F1 report.
print("Confusion Matrix")
display(cm_df)  # rich (table) rendering rather than plain text
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_true=y_test, y_pred=predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,216,10
Actual 1,16,790


Accuracy Score : 0.9748062015503876
Classification Report
              precision    recall  f1-score   support

         0.0       0.93      0.96      0.94       226
         1.0       0.99      0.98      0.98       806

    accuracy                           0.97      1032
   macro avg       0.96      0.97      0.96      1032
weighted avg       0.98      0.97      0.97      1032

