<a href="https://colab.research.google.com/github/mikalajjjj/Data-Mining-Animal_Shelter/blob/main/AACOutcomes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Classification: Austin Animal Center Outcomes
- Intro to Data Mining CS 363D
- Project Group 46

Can we predict whether an animal in the Austin Animal Center will be adopted, transfered, or euthanized? Here, we perform multi-class classification on an animal's `Outcome Type`, using features from the [Austin Animal Center Intakes dataset](https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Intakes/wter-evkm). The class label (`Outcome Type`) is from the [Animal Center Outcomes dataset](https://data.austintexas.gov/Health-and-Community-Services/Austin-Animal-Center-Outcomes/9t4d-g238); these data sets are joined on `Animal ID`.


In [1]:
%pip install pandas
%pip install scikit-learn



In [2]:
!curl -o intakes.csv "https://data.austintexas.gov/api/views/wter-evkm/rows.csv?accessType=DOWNLOAD"
!curl -o outcomes.csv "https://data.austintexas.gov/api/views/9t4d-g238/rows.csv?accessType=DOWNLOAD"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 24.7M    0 24.7M    0     0  2172k      0 --:--:--  0:00:11 --:--:-- 1921k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 20.3M    0 20.3M    0     0   884k      0 --:--:--  0:00:23 --:--:-- 1004k


In [3]:
import pandas as pd
import sklearn as sk

## Data Prep

- We join the 2 datasets: Intakes and Outcomes on their common column `Animal ID`.
- Some Animal IDs have multiple intakes/outcomes.
    - Number the duplicates (based on DateTime).
    - Pair intakes and outcomes based on Animal ID and Duplicate Number.

In [4]:
intakes = pd.read_csv("intakes.csv")
intakes["DateTime"] = pd.to_datetime(intakes["DateTime"])
intakes.sort_values(by="DateTime", inplace=True) # sort by date
intakes['Duplicate Number'] = intakes.groupby(['Animal ID']).cumcount() # Mark duplicate entries

# Age upon Intake: translate into Age upon Intake_NumYearsOld
def getNumYearsOld(age_str):
    months_in_a_year = 12
    weeks_in_a_year = 52.143
    days_in_a_year = 365.25
    year_idx = age_str.find("year")-1
    if(year_idx >= 0):
        return float(age_str[:year_idx])
    month_idx = age_str.find("month")-1
    if(month_idx >= 0):
        return float(age_str[:month_idx])/months_in_a_year
    weeks_idx = age_str.find("week")-1
    if(weeks_idx >= 0):
        return float(age_str[:weeks_idx])/weeks_in_a_year
    days_idx = age_str.find("day")-1
    if(days_idx >= 0):
        return float(age_str[:days_idx])/days_in_a_year
    return None

intakes["Age upon Intake"] = intakes["Age upon Intake"].apply(getNumYearsOld) # make age in terms of yearas

print(intakes.shape)
intakes.head()

(173812, 13)


Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Duplicate Number
0,A521520,Nina,2013-10-01 07:51:00,October 2013,Norht Ec in Austin (TX),Stray,Normal,Dog,Spayed Female,7.0,Border Terrier/Border Collie,White/Tan,0
1,A664235,,2013-10-01 08:33:00,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.019178,Domestic Shorthair Mix,Orange/White,0
2,A664236,,2013-10-01 08:33:00,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.019178,Domestic Shorthair Mix,Orange/White,0
3,A664237,,2013-10-01 08:33:00,October 2013,Abia in Austin (TX),Stray,Normal,Cat,Unknown,0.019178,Domestic Shorthair Mix,Orange/White,0
4,A664233,Stevie,2013-10-01 08:53:00,October 2013,7405 Springtime in Austin (TX),Stray,Injured,Dog,Intact Female,3.0,Pit Bull Mix,Blue/White,0


In [5]:
outcomes = pd.read_csv("outcomes.csv")
outcomes["DateTime"] = pd.to_datetime(outcomes["DateTime"])
outcomes.sort_values(by="DateTime", inplace=True)
outcomes['Duplicate Number'] = outcomes.groupby(['Animal ID']).cumcount() # Mark duplicate entries
print(outcomes.shape)
outcomes.head()


ValueError: time data "2013-10-01T09:31:00" doesn't match format "%Y-%m-%dT%H:%M:%S%z", at position 746. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [None]:
outcomes_label = outcomes[["Animal ID", "Duplicate Number", "DateTime", "Outcome Type"]]
outcomes_label = outcomes_label.rename(columns={"DateTime": "DateTime_Outcome"})
print(outcomes_label.shape)
outcomes_label.head()

In [None]:
raw_df = intakes.merge(outcomes_label, on=["Animal ID", "Duplicate Number"], how="inner").set_index("Animal ID")
raw_df.sort_values(by=["Animal ID", "Duplicate Number"], inplace=True)
raw_df.head()

# "Scamp" has 3 intakes/outcomes (all resulting in "Return to Owner"). They are paired together correctly!

In [None]:
raw_df.index = raw_df.index.map(str) + "_" + raw_df['Duplicate Number'].map(str)
raw_df = raw_df.drop(columns=["DateTime_Outcome", "Duplicate Number"])
raw_df.index.name = 'Animal ID'



raw_df.to_csv("raw_full_dataset.csv")
print(raw_df.shape)
raw_df.head()

## Data Exploration

In [None]:
raw_df.describe(include="all", datetime_is_numeric=True)
# don't have a name?
# datetime as timedelta - visualize datetime

In [None]:
raw_df["Outcome Type"].value_counts()

In [None]:
#plot = raw_df.loc[raw_df['Outcome Type'] == "Euthanasia"].plot.pie(y='', figsize=(5, 5))

ax = raw_df.plot.box(column="Age upon Intake", by="Outcome Type", figsize=(10, 8))


In [None]:
from matplotlib import pyplot as plt

def cumulatively_categorize(df, columns, threshold_freq = .9, num_categories = None):
    '''
    Only keep categories needed sum to threshold_freq, move rest into "Other".
    Or, keep top threshold_num categories, and move rest into "Other".
    '''
    df = df.copy()
    threshold = threshold_freq*len(df)
    for column_name in columns:

        counts = df[column_name].value_counts()

        s = 0
        to_keep = []
        for value, count in counts.iteritems():
            s += count
            to_keep.append(value)

            if(num_categories is not None):
                # based on num of categories
                if(len(to_keep) > num_categories):
                    break
            else:
                # based on frequency
                if(s > threshold):
                    break

        print(to_keep)
        df[column_name]=df[column_name].apply(lambda x: x if x in to_keep else 'Other')
    return df

def show_pie_chart(df, categorical_var, label = "Outcome Type"):
    gb = df.groupby([label])
    outcome_groups = [gb.get_group(x) for x in gb.groups]
    outcome_labels = [x for x in gb.groups]

    pie_chart_df = pd.DataFrame()

    for i, outcome_df in enumerate(outcome_groups):
        val_counts = outcome_df[categorical_var].value_counts()
        print(outcome_labels[i])
        print(val_counts)
        pie_chart_df[outcome_labels[i]] = val_counts # this logic is wrong: where does "Bat" go?
        # Idea: insert empty column with every Breed category, then drop it at the end.

    pie_chart_df

    print(pie_chart_df)

    pie_chart_df.plot.pie(subplots=True, layout=(3,3), figsize=(20, 20)) # TODO: fix the size, make pretty

df = raw_df.copy()
df['Breed'] = df['Breed'].apply(lambda str: str.replace(" Mix", "")) # remove Mix

show_pie_chart(cumulatively_categorize(df, columns=["Breed"], num_categories=10), "Breed")

In [None]:
df = cumulatively_categorize(df, columns=["Breed"], num_categories = 10)
show_pie_chart(raw_df, "Intake Condition")

## Feature Engineering

In [None]:
df = raw_df.copy()

def one_hot_encode(df, columns):
    '''
    Take a categorical variable, one-hot encode, and join back into dataframe.
    '''
    for column_name in columns:
        one_hot_encoded = pd.get_dummies(df[column_name], prefix=column_name)
        df = df.drop(column_name, axis='columns')
        df = df.join(one_hot_encoded, on="Animal ID")
    return df


# Breed: bin less common breeds into "Other", one-hot encode
df['Breed'] = df['Breed'].apply(lambda str: str.replace(" Mix", "")) # remove Mix
df = cumulatively_categorize(df, columns=["Breed"], num_categories = 10)
df = one_hot_encode(df, columns=["Breed"])

# Outcome Type: collect Adoption, Return to Owner, Rto-Adopt
def collectAdoption(outcome):
    if(outcome in ["Adoption", "Return to Owner", "Rto-Adopt"]):
        return ("Adoption/RTO")
    else:
        return outcome
df["Outcome Type"] = df["Outcome Type"].apply(collectAdoption)
df = cumulatively_categorize(df, columns=["Outcome Type"],num_categories=4)
df = one_hot_encode(df, columns=["Outcome Type"])

# To drop
to_drop = ["Name", "MonthYear", "Color", "Found Location", "DateTime"]
df = df.drop(columns=to_drop)

# To one-hot encode
to_one_hot_encode = ["Intake Type", "Intake Condition", "Animal Type", "Sex upon Intake"] # experiment
df = cumulatively_categorize(df, columns=to_one_hot_encode)
df = one_hot_encode(df, columns=to_one_hot_encode)


In [None]:
df.head()

In [None]:
print(df.columns)

In [None]:
# labels_names = ['Outcome Type_Adoption', 'Outcome Type_Died',
#        'Outcome Type_Disposal', 'Outcome Type_Euthanasia',
#        'Outcome Type_Missing', 'Outcome Type_Relocate',
#        'Outcome Type_Return to Owner', 'Outcome Type_Rto-Adopt',
#        'Outcome Type_Transfer',]

labels_names = ['Outcome Type_Died',
       'Outcome Type_Disposal', 'Outcome Type_Euthanasia',
       'Outcome Type_Other', 'Outcome Type_Transfer', 'Outcome Type_Adoption/RTO']
labels = df[labels_names]
features = df.drop(columns=labels_names)
print("labels:",labels.shape)
print("features:",features.shape)
features.head()

## Data Analysis

### Test/Train Split

In [None]:
from sklearn import *
from sklearn.model_selection import train_test_split
train_size = int(.8*len(features))
test_size = len(features) - train_size
X_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=test_size, train_size=train_size, shuffle=False)
print("length of train set: ")
print(len(y_train))
print("length of test set: ")
print(len(y_test))

In [None]:
from sklearn import *
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
clfO = DecisionTreeClassifier(criterion='entropy', max_depth=4)
clfO = clfO.fit(X_train, y_train)
y_predict = clfO.predict(x_test)
acc = accuracy_score(y_test, y_predict)
print(acc) # need to see precision

In [None]:
print(tree.export_text(clfO, feature_names=list(features.columns)))

In [None]:
# Use `sklearn.tree.DecisionTreeClassifier` to fit a decision tree classifier on the training set. Use entropy as the split criterion.
from sklearn import *

clf = tree.DecisionTreeClassifier(criterion="entropy")
param_grid={
    "max_depth": [5,10],
    "min_samples_leaf": [5,10],
    "max_features": [5,10],
}
clf = model_selection.GridSearchCV(clf,param_grid=param_grid,cv=5,scoring="accuracy")
clf.fit(X_train,y_train)
print(clf.best_params_)
print(clf.best_score_)

In [None]:
# Naive Bayes
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score
labels = df.iloc[:,13]
clf = GaussianNB()
accs=np.mean(model_selection.cross_val_score(clf, X=features, y=labels,cv=10))
print("Accuracy NB: ")
print(accs.mean())
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
y_pred = cross_val_predict(clf, features, labels, cv=10)
conf_mat = confusion_matrix(labels, y_pred)
print("Confusion Matrix: ")
print(conf_mat)
print("Classification Report: ")
print(classification_report(labels, y_pred))


In [None]:
# knn
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
from sklearn.decomposition import PCA
pca = PCA()
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
from sklearn.pipeline import Pipeline
pipe = Pipeline([('scaler', scaler), ('pca', pca), ('knn', knn)])
res = cross_val_score(pipe, features, labels, cv=5)
print(res.mean())

## Results