In [1]:
import numpy as np
import pandas as pd
import matplotlib

In [2]:
# Import dataframe from aac_intakes_outcomes.csv
csv_file_path = "/home/john/code/joagap/animals/clean_dataset.csv"
df = pd.read_csv(csv_file_path)
print(df.head())

  age_upon_outcome animal_id_outcome        date_of_birth     outcome_type  \
0         10 years           A006100  2007-07-09 00:00:00  Return to Owner   
1          7 years           A006100  2007-07-09 00:00:00  Return to Owner   
2          6 years           A006100  2007-07-09 00:00:00  Return to Owner   
3         10 years           A047759  2004-04-02 00:00:00         Transfer   
4         16 years           A134067  1997-10-16 00:00:00  Return to Owner   

  sex_upon_outcome  age_upon_outcome_(days)  age_upon_outcome_(years)  \
0    Neutered Male                     3650                      10.0   
1    Neutered Male                     2555                       7.0   
2    Neutered Male                     2190                       6.0   
3    Neutered Male                     3650                      10.0   
4    Neutered Male                     5840                      16.0   

      outcome_datetime  outcome_month  outcome_year  ...  \
0  2017-12-07 14:07:00          

In [3]:
# Check unique ids
unique_ids = df["animal_id_outcome"].nunique()
unique_ids

67212

In [4]:
duplicate_ids = df[df["animal_id_outcome"].duplicated()]["animal_id_outcome"].unique()
duplicate_ids

array(['A006100', 'A245945', 'A282897', ..., 'A768473', 'A768566',
       'A768895'], dtype=object)

In [5]:
len(duplicate_ids)

6038

In [6]:
len(duplicate_ids)/unique_ids * 100

8.98351484853895

In [7]:
# How to remove unique ids (df clean)
df.drop_duplicates(subset=["animal_id_outcome"], keep="first", inplace=True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67212 entries, 0 to 74875
Data columns (total 31 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age_upon_outcome           67212 non-null  object 
 1   animal_id_outcome          67212 non-null  object 
 2   date_of_birth              67212 non-null  object 
 3   outcome_type               67211 non-null  object 
 4   sex_upon_outcome           67211 non-null  object 
 5   age_upon_outcome_(days)    67212 non-null  int64  
 6   age_upon_outcome_(years)   67212 non-null  float64
 7   outcome_datetime           67212 non-null  object 
 8   outcome_month              67212 non-null  int64  
 9   outcome_year               67212 non-null  int64  
 10  outcome_monthyear          67212 non-null  object 
 11  outcome_weekday            67212 non-null  object 
 12  outcome_hour               67212 non-null  int64  
 13  age_upon_intake            67212 non-null  obj

In [9]:
# Calculate the time_in_shelter_days and create a new column
import pandas as pd

# Convert the 'intake_datetime' and 'outcome_datetime' columns to datetime format
df['intake_datetime'] = pd.to_datetime(df['intake_datetime'])
df['outcome_datetime'] = pd.to_datetime(df['outcome_datetime'])

# Calculate the difference between 'outcome_datetime' and 'intake_datetime' and convert it to days
df['time_in_shelter_days'] = (df['outcome_datetime'] - df['intake_datetime']).dt.days

In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [11]:
# Define the features and target variable
features = ['age_upon_outcome_(days)', 'animal_type', 'color', 'intake_condition', 
            'sex_upon_intake','age_upon_intake_age_group']

target = 'time_in_shelter_days'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

In [12]:
# create a list of numerical columns and categorical columns
numeric_features = ['age_upon_outcome_(days)']

categorical_features = ['age_upon_intake_age_group', 'animal_type', 'color', 'intake_condition', 
            'sex_upon_intake']

# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [13]:
# create the pipeline for linear regression with preprocessor
pipeline = make_pipeline(preprocessor)

In [14]:
pipeline

In [15]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Transform
X_train_transformed = pd.DataFrame(pipeline.transform(X_train))
X_train_transformed



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,478,479,480,481,482,483,484,485,486,487
0,-0.390393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
1,-0.390393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,-0.390393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.274405,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-0.390393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53764,-0.057994,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
53765,2.601196,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
53766,-0.390393,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
53767,2.268797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [17]:
from sklearn.pipeline import Pipeline

In [18]:
# Add DecisionTreeRegressor to the pipeline
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DecisionTreeRegressor())
])

In [19]:
model_pipeline

In [20]:
model_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler())]),
                                    ['age_upon_outcome_(days)']),
                                   ('cat',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    ['age_upon_intake_age_group', 'animal_type',
                                     'color', 'intake_condition',
                                     'sex_upon_intake'])])),
  ('regressor', DecisionTreeRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('standardscaler',
               

In [21]:
# Fit the pipeline on the training data
model_pipeline.fit(X_train, y_train)



In [22]:
# make predictions on the test set
y_pred = model_pipeline.predict(X_test)

In [23]:
y_pred

array([25.06557377,  5.        , 24.18269231, ...,  8.625     ,
        7.89473684, 62.        ])

In [24]:
# Evaluate the model using mean squared error
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1772.5117166047196


In [25]:
# Calculate evaluation metrics r2
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)

R-squared: -0.23317708616378452


OPTIONAL: Gradient Boosting approach

In [26]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Define features and target
features = ['animal_type', 'breed', 'color', 'intake_condition', 'sex_upon_intake', 'age_upon_intake_(days)']
target = ['time_in_shelter_days']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

# Define the preprocessing steps for numerical and categorical features
numeric_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = make_column_transformer(
    (numeric_transformer, ['age_upon_intake_(days)']),
    (categorical_transformer, ['animal_type', 'breed', 'color', 'intake_condition', 'sex_upon_intake'])
)

# Define the Gradient Boosting pipeline
pipeline = make_pipeline(preprocessor, GradientBoostingRegressor())

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the test data
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("MSE:", mse)

  y = column_or_1d(y, warn=True)


ValueError: Found unknown categories ['Cardigan Welsh Corgi/Miniature Schnauzer', 'Australian Kelpie/Whippet', 'Pointer/Australian Kelpie', 'Great Pyrenees/Dalmatian', 'Great Pyrenees/Chow Chow', 'Manchester Terrier/Basenji', 'Dachshund/German Shepherd', 'Miniature Poodle/Chihuahua Longhair', 'Rhod Ridgeback/Pointer', 'Old English Sheepdog Mix', 'Staffordshire/Bulldog', 'Yorkshire Terrier/Parson Russell Terrier', 'Border Collie/Siberian Husky', 'Toy Fox Terrier/Yorkshire Terrier', 'Border Terrier/Rat Terrier', 'Border Collie/Alaskan Husky', 'Papillon/Skye Terrier', 'Pointer/Rhod Ridgeback', 'Cane Corso/Mastiff', 'Vizsla/Rhod Ridgeback', 'Papillon/Miniature Poodle', 'Jack Russell Terrier/Papillon', 'Yorkshire Terrier/Soft Coated Wheaten Terrier', 'Siberian Husky/Catahoula', 'Pit Bull/American Foxhound', 'Pekingese/Lhasa Apso', 'Collie Rough/Chinese Sharpei', 'Domestic Shorthair/British Shorthair', 'Beagle/Australian Shepherd', 'Vizsla/Beagle', 'Cardigan Welsh Corgi/Cardigan Welsh Corgi', 'Miniature Pinscher/Yorkshire Terrier', 'American Staffordshire Terrier/Rottweiler', 'Whippet/Plott Hound', 'Collie Smooth/Golden Retriever', 'Beagle/Manchester Terrier', 'Whippet/Beagle', 'Whippet/Borzoi', 'Whippet/Catahoula', 'Great Pyrenees/Queensland Heeler', 'English Coonhound/German Shepherd', 'Airedale Terrier/Labrador Retriever', 'Pekingese/Pug', 'Entlebucher Mix', 'Toy Poodle/Bichon Frise', 'Great Pyrenees/English Pointer', 'Dutch Shepherd/Anatol Shepherd', 'German Shepherd/Whippet', 'Skye Terrier/Miniature Poodle', 'Border Collie/Australian Kelpie', 'Pembroke Welsh Corgi/Pit Bull', 'Black/Tan Hound/Beagle', 'Miniature Poodle/Golden Retriever', 'Afghan Hound Mix', 'Chihuahua Longhair/Pug', 'Bluetick Hound/Great Pyrenees', 'English Coonhound/Border Collie', 'Norfolk Terrier/Bruss Griffon', 'American Bulldog/Basset Hound', 'Dogue De Bordeaux/American Bulldog', 'Shih Tzu/Bruss Griffon', 'Plott Hound/Border Collie', 'Border Collie/Staffordshire', 'Great Pyrenees/Siberian Husky', 'Greyhound/Dalmatian', 'Pit Bull/Pharaoh Hound', 'Shiba Inu/Australian Kelpie', 'Black Mouth Cur/Rhod Ridgeback', 'Manchester Terrier/Chihuahua Longhair', 'Rat Terrier/Jack Russell Terrier', 'Whippet/Australian Cattle Dog', 'Miniature Schnauzer/Cocker Spaniel', 'Chihuahua Shorthair/Cocker Spaniel', 'Staffordshire/French Bulldog', 'Standard Schnauzer/Labrador Retriever', 'Dachshund Wirehair/West Highland', 'Boxer/Plott Hound', 'Australian Cattle Dog/Australian Kelpie', 'Norwich Terrier/Cairn Terrier', 'Shiba Inu/Chinese Sharpei', 'Beauceron', 'Pit Bull/Border Collie', 'Pointer/German Shorthair Pointer', 'Cocker Spaniel/Toy Poodle', 'Queensland Heeler/Labrador Retriever', 'Chihuahua Shorthair/Cavalier Span', 'American Pit Bull Terrier/American Bulldog', 'Australian Shepherd/Field Spaniel', 'Dutch Shepherd/Plott Hound', 'Lhasa Apso/Pekingese', 'Rat Terrier/Pit Bull', 'Black Mouth Cur/Australian Cattle Dog', 'Australian Shepherd/Pit Bull', 'Unknown Mix', 'Mastiff/Plott Hound', 'Miniature Schnauzer/Soft Coated Wheaten Terrier', 'Australian Shepherd/Collie Smooth', 'Miniature Pinscher/Maltese', 'Shih Tzu/Affenpinscher', 'Australian Kelpie/Border Collie', 'Cardigan Welsh Corgi/Rat Terrier', 'Bichon Frise/Pekingese', 'Whippet/Rat Terrier', 'Golden Retriever/Pembroke Welsh Corgi', 'Field Spaniel/Cocker Spaniel', 'Catahoula/Black Mouth Cur', 'American Eskimo/Australian Shepherd', 'Dachshund Wirehair/Cairn Terrier', 'German Shepherd/German Shepherd', 'Great Dane/Staffordshire', 'Papillon/Pomeranian', 'Collie Smooth/Australian Kelpie', 'Miniature Schnauzer/Dachshund', 'Dachshund Longhair/Golden Retriever', 'Maltese/Standard Poodle', 'Maltese/Cocker Spaniel', 'Akita/Labrador Retriever', 'Alaskan Malamute/Labrador Retriever', 'Beagle/Treeing Walker Coonhound', 'Miniature Pinscher/Pomeranian', 'Rhod Ridgeback/Mastiff', 'Papillon/Australian Cattle Dog', 'Pembroke Welsh Corgi/Brittany', 'Boerboel', 'American Staffordshire Terrier/Boxer', 'Dachshund/Cavalier Span', 'Bull Terrier/Australian Kelpie', 'Pomeranian/American Eskimo', 'Great Pyrenees/Alaskan Husky', 'Rottweiler/Boxer', 'Shetland Sheepdog/Basenji', 'Irish Wolfhound/American Pit Bull Terrier', 'Dogo Argentino/Pit Bull', 'Miniature Schnauzer/Border Terrier', 'Maltese/Papillon', 'German Shepherd/Swedish Vallhund', 'Whippet', 'Pharaoh Hound/Australian Cattle Dog', 'Rhod Ridgeback/Australian Cattle Dog', 'Nova Scotia Duck Tolling Retriever/Golden Retriever', 'Kuvasz/Labrador Retriever', 'Australian Cattle Dog/English Coonhound', 'Chihuahua Shorthair/Catahoula', 'Pointer/Blue Lacy', 'Mexican Hairless', 'Boston Terrier/Australian Kelpie', 'Beagle/Miniature Pinscher', 'Border Terrier/Dachshund Wirehair', 'Chihuahua Shorthair/Affenpinscher', 'Chinese Sharpei/Boxer', 'Saluki/Labrador Retriever', 'Collie Smooth/Chow Chow', 'Cardigan Welsh Corgi/Cairn Terrier', 'Siberian Husky/Cardigan Welsh Corgi', 'Plott Hound/Bull Terrier', 'Siberian Husky/Great Pyrenees', 'Boxer/American Bulldog', 'German Shepherd/Ibizan Hound', 'Basset Hound/Boxer', 'Great Dane/German Shepherd', 'Lhasa Apso/Jack Russell Terrier', 'Australian Shepherd/Jack Russell Terrier', 'Treeing Walker Coonhound/Dachshund', 'Dachshund Wirehair/Standard Poodle', 'Great Pyrenees/Collie Rough', 'Border Collie/Anatol Shepherd', 'Siberian Husky/American Pit Bull Terrier', 'Dandie Dinmont', 'Boston Terrier/Miniature Schnauzer', 'Redbone Hound/Anatol Shepherd', 'Domestic Longhair/Russian Blue', 'Soft Coated Wheaten Terrier/Standard Poodle', 'Great Dane/Australian Cattle Dog', 'Jack Russell Terrier/Italian Greyhound', 'Airedale Terrier/Irish Terrier', 'Chihuahua Shorthair/Queensland Heeler', 'Border Collie/German Shorthair Pointer', 'Dachshund/Border Collie', 'Cardigan Welsh Corgi/Australian Kelpie', 'German Shepherd/Nova Scotia Duck Tolling Retriever', 'Rottweiler/Rhod Ridgeback', 'Whippet/Anatol Shepherd', 'Pembroke Welsh Corgi/Australian Shepherd', 'German Shepherd/Basenji', 'Labrador Retriever/St. Bernard Smooth Coat', 'Plott Hound/German Shepherd', 'Collie Smooth/Saluki', 'Australian Cattle Dog/Parson Russell Terrier', 'Bulldog/Australian Cattle Dog', 'Boxer/American Pit Bull Terrier', 'Catahoula/English Coonhound', 'Belgian Sheepdog', 'Miniature Poodle/West Highland', 'Boykin Span/Dachshund', 'Lowchen/Yorkshire Terrier'] in column 1 during transform