#### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

#### Read in Dataset

In [2]:
# Load the dogs dataset, parsing the intake timestamp column as datetimes.
dogs = pd.read_csv(
    '../data/dogs.csv',
    parse_dates=['intake_datetime'],
)

In [3]:
# Remove rows whose outcome belongs to one of the low-frequency categories.
# Rows with a missing outcome are not matched by isin() and are kept here.
dogs = dogs.loc[
    ~dogs['outcome_type'].isin(['Died', 'Disposal', 'Missing', 'Rto-Adopt'])
]

#### Define Features

This notebook tries an even simpler model that uses only `intake_type` and `intake_age_in_years` as features.

In [4]:
# Predictor columns used by this simplified model.
features = ['intake_type', 'intake_age_in_years']

# Name of the label column, kept as a single-item list.
target = ['outcome_type']

In [5]:
# Keep only rows that have a recorded outcome; rows where outcome_type is NaN
# are discarded (the author counted 14 of them).
dogs = dogs[dogs['outcome_type'].notna()]

In [6]:
# Feature matrix: only the columns named in `features`.
X = dogs.loc[:, features]
# Label vector: the outcome_type column.
y = dogs.loc[:, 'outcome_type']

In [7]:
# One-hot encode intake_type; drop_first=True omits the first category's
# indicator column so it acts as the baseline level.
X = pd.get_dummies(X, columns=['intake_type'], drop_first=True)

#### Train/Test Split

In [8]:
# Hold out a test set; the fixed random_state keeps the split reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=63,
)

#### Fit RandomForest Model

In [9]:
# Seed the forest so that Restart & Run All reproduces the same model and the
# same metrics; 63 matches the seed already used for the train/test split.
clf = RandomForestClassifier(random_state=63)

In [10]:
# Train the forest on the training split. fit() returns the estimator itself,
# which is what this cell displays.
clf.fit(X=X_train, y=y_train)

RandomForestClassifier()

#### Results

In [11]:
# Predicted class probabilities for each test row; the columns follow the
# order of clf.classes_.
clf.predict_proba(X=X_test)

array([[0.46787758, 0.02086194, 0.30149527, 0.20976521],
       [0.63728159, 0.01100954, 0.11843539, 0.23327348],
       [0.07333518, 0.01342796, 0.83596041, 0.07727644],
       ...,
       [0.36931738, 0.02018527, 0.40925405, 0.2012433 ],
       [0.64007771, 0.00859214, 0.10431391, 0.24701624],
       [0.64007771, 0.00859214, 0.10431391, 0.24701624]])

In [12]:
# Fraction of test rows whose predicted outcome equals the recorded outcome.
accuracy_score(y_true=y_test, y_pred=clf.predict(X_test))

0.574668647166362

In [13]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=clf.predict(X_test)))

                 precision    recall  f1-score   support

       Adoption       0.58      0.85      0.69      8332
     Euthanasia       0.80      0.07      0.12       473
Return to Owner       0.57      0.59      0.58      4789
       Transfer       0.57      0.03      0.06      3910

       accuracy                           0.57     17504
      macro avg       0.63      0.38      0.36     17504
   weighted avg       0.58      0.57      0.50     17504

