In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time

In [None]:
# Data cleaning starts here: read the raw training split, discard the columns
# that are not used, and keep only rows that have an intake age.
# 'color' is deliberately kept for now; it is dropped further down, after a
# primary-color column has been derived from it.
df_train = (
    pd.read_csv('train.csv')
    .drop(columns=['id', 'name', 'date_of_birth', 'outcome_time', 'found_location'])
    .dropna(subset=['age_upon_intake'])
)
df_test = pd.read_csv('test.csv')

In [None]:
# Intake Time
# Parse the timestamp strings, then store them as integers: the int64 form of
# the parsed datetimes floor-divided by 10**9 (UNIX timestamp).
dt_series = pd.to_datetime(df_train['intake_time'])
df_train['intake_time'] = dt_series.astype('int64').floordiv(1_000_000_000)
print(df_train['intake_time'])

# Sex Upon Intake
# Missing entries become the explicit category 'Unknown'.
df_train['sex_upon_intake'] = df_train['sex_upon_intake'].fillna('Unknown')

# Age Upon Intake
def convert_age(age_str):
    """
    Turn a textual age into a number of years.

    The text is expected to look like "<number> <unit>", for example
    "2 years", "8 months", "3 weeks" or "15 days". The unit is matched
    case-insensitively by substring, so singular and plural forms both work.

    Returns a float number of years, or None when the value is not a string,
    does not split into exactly two whitespace-separated tokens, has a
    non-numeric first token, or names an unrecognised unit.
    """
    # (keyword looked for in the unit, how many of that unit make one year),
    # probed in this order.
    units_per_year = (("year", 1), ("month", 12), ("week", 52), ("day", 365))

    tokens = age_str.split() if isinstance(age_str, str) else ()
    if len(tokens) != 2:
        return None

    quantity_text, unit_text = tokens
    try:
        quantity = float(quantity_text)
    except ValueError:
        return None

    unit_text = unit_text.lower()
    for keyword, per_year in units_per_year:
        if keyword in unit_text:
            return quantity / per_year
    return None

# Age Upon Intake (continued)
# Parse every age string into years, then raise any negative result to zero.
df_train['age_upon_intake'] = (
    df_train['age_upon_intake']
    .apply(convert_age)
    .pipe(lambda ages: ages.mask(ages < 0, 0))
)

# Breed
# is_mix is 1 when the breed text contains "mix" (any casing) and 0 otherwise;
# missing breeds count as 0.
df_train['is_mix'] = (
    df_train['breed'].str.contains('mix', case=False, na=False).astype(int)
)
# Strip the " mix" marker (any casing) from the breed text itself.
df_train['breed'] = df_train['breed'].str.replace(' mix', '', case=False)

In [None]:
# Synonymous colour names are folded into one canonical name. The table is
# written canonical -> synonyms and flattened into a synonym -> canonical dict.
color_group_map = {
    synonym: canonical
    for canonical, synonyms in {
        'gray tabby': ('blue tabby', 'silver tabby'),
        'gray': ('silver', 'blue'),
        'orange': ('orange tabby', 'orange tiger', 'red', 'red tabby', 'red tick', 'yellow'),
        'cream': ('tan',),
        'calico': ('tricolor',),
    }.items()
    for synonym in synonyms
}

# Normalise the raw colour text: lower-case, no surrounding whitespace.
df_train['color'] = df_train['color'].str.lower().str.strip()

# Feature engineering -> primary colour: the text before the first '/'
# (the whole text when there is no '/'), trimmed.
df_train['primary_color'] = df_train['color'].astype(str).apply(
    lambda shade: shade.split('/', 1)[0].strip()
)

# Swap synonyms for their canonical name; anything not in the map is kept.
df_train['primary_color'] = df_train['primary_color'].map(
    lambda shade: color_group_map.get(shade, shade)
)

# The raw colour column is no longer needed once primary_color exists.
df_train = df_train.drop('color', axis=1)


def freq_encode(df, col):
    """
    Frequency-encode one column: each value is replaced by the number of
    times it occurs in that column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``col``. It is NOT modified; the encoding is applied
        to a copy. (Mutating the input let a fitted pipeline silently rewrite
        the caller's data, so a second pass over the same frame would encode
        the counts instead of the original categories.)
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``col`` holds occurrence counts. Missing
        values are not counted by ``value_counts`` and therefore map to NaN.

    NOTE(review): counts come from whatever frame is passed in, so two frames
    encoded separately (e.g. a train fold and a test fold) can assign
    different codes to the same category -- confirm this is intended.
    """
    # occurrences of each distinct value in the column
    counts = df[col].value_counts()

    # write the counts into a copy so the caller's frame stays untouched
    encoded = df.copy()
    encoded[col] = encoded[col].map(counts)

    return encoded

In [None]:
# cleaning intake type + condition
# Wildlife intakes are excluded. The filtered result is copied so the column
# assignment below operates on an independent frame rather than on a slice.
df_train = df_train[df_train['intake_type'] != 'Wildlife'].copy()

# Raw intake conditions are merged into coarser groups; values not listed here
# are left as they are. 'Agonal' is grouped under 'Med Urgent' (a later,
# unreachable remapping of 'Agonal' to a different label has been removed).
intake_condition_groups = {
    'Unknown': 'Unknown Condition / Other',
    'Other': 'Unknown Condition / Other',
    'Space': 'Unknown Condition / Other',
    'Behavior': 'Normal / Behavior',
    'Normal': 'Normal / Behavior',
    'Neonatal': 'Nursing / Neonatal',
    'Nursing': 'Nursing / Neonatal',
    'Neurologic': 'Med Urgent',
    'Agonal': 'Med Urgent',
    'Parvo': 'Med Urgent',
    'Congenital': 'Sick',
}
df_train['intake_condition'] = df_train['intake_condition'].replace(intake_condition_groups)



In [None]:
# One-hot encode intake type: build the indicator columns, then swap them in
# for the original 'intake_type' column.
df = pd.get_dummies(df_train['intake_type'])
df_train = pd.concat([df_train.drop(columns=['intake_type']), df], axis=1)
df_train.head()

In [None]:
# One-hot encode intake condition: build the indicator columns, then swap them
# in for the original 'intake_condition' column.
df = pd.get_dummies(df_train['intake_condition'])
df_train = pd.concat([df_train.drop(columns=['intake_condition']), df], axis=1)
df_train.head()

In [None]:
# One-hot encode sex upon intake: build the indicator columns, then swap them
# in for the original 'sex_upon_intake' column.
df = pd.get_dummies(df_train['sex_upon_intake'])
df_train = pd.concat([df_train.drop(columns=['sex_upon_intake']), df], axis=1)
df_train.head()

In [None]:
# Sanity check after encoding: frame dimensions, then the first rows.
print(df_train.shape, df_train.head(), sep='\n')

In [None]:
# Move the label to the last column so features and label can later be split
# by position, then separate the data by species.
label_column = df_train.pop('outcome_type')
df_train.insert(len(df_train.columns), 'outcome_type', label_column)
cats = df_train.loc[df_train['animal_type'].eq('Cat')]
dogs = df_train.loc[df_train['animal_type'].eq('Dog')]

In [None]:
# Each subset holds a single species, so 'animal_type' is dropped.
cats = cats.drop(columns=['animal_type'])
dogs = dogs.drop(columns=['animal_type'])

# Features are every column but the last; the label is the last column.
# The label is selected as a 1-D Series (iloc[:, -1]) rather than a one-column
# DataFrame (iloc[:, -1:]): scikit-learn expects a 1-D target and emits a
# column-vector conversion warning on every fit otherwise.
cats_data = cats.iloc[:, :-1]
cats_labels = cats.iloc[:, -1]

dogs_data = dogs.iloc[:, :-1]
dogs_labels = dogs.iloc[:, -1]

In [None]:
# Preview the first rows of the cat subset.
cats.head()

In [None]:
# Working with Decision Trees
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer

In [32]:
def print_runtime(start, end):
    """
    Print the time elapsed between two timestamps as "minutes:seconds".

    start, end: numeric timestamps in seconds (the callers in this file pass
    time.time() readings). Fractional seconds are truncated and the seconds
    field is zero-padded to two digits.
    """
    minutes, seconds = divmod(end - start, 60)
    print(f"Time taken: {int(minutes)}:{int(seconds):02d}")

In [33]:
# Transformer code for pipelines

# Use FunctionTransformer to wrap the freq_encode function
def apply_freq_encode(df):
    """
    Frequency-encode the 'primary_color' and 'breed' columns, in that order,
    by passing the frame through freq_encode once per column.

    Returns the frame produced by the final freq_encode call.
    """
    for column in ('primary_color', 'breed'):
        df = freq_encode(df, column)
    return df

In [None]:
# THIS IS A TEST
# Trial run of the tuning workflow on the first 1000 cat rows.

df_data = cats.head(n=1000).iloc[:, :-1]
df_labels = cats.head(n=1000).iloc[:, -1:]
# scikit-learn expects a 1-D target; df_labels itself stays a one-column frame.
labels_1d = df_labels.values.ravel()

tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')  # balanced weights to handle class imbalance
pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', tree)
])

# make a matrix of HP values to tune
HP = {
    "DT__max_depth": [10, 25],
    "DT__max_features": [None, 28],
    "DT__min_samples_leaf": [5, 10]
}

start = time.time()

griddy = GridSearchCV(estimator=pipe, param_grid=HP, cv=10, scoring='accuracy')

# Nested CV: cross_val_score tunes a *clone* of griddy inside every outer fold,
# so this gives the generalization estimate but leaves griddy itself unfitted.
accs = cross_val_score(griddy, X=df_data, y=labels_1d, cv=10)

# Fit griddy itself so that best_params_ / best_score_ exist. A copy of the
# features is passed so the frequency-encoding step cannot alter df_data.
griddy.fit(df_data.copy(), labels_1d)

end = time.time()

print_runtime(start, end)

print('The best parameters for our model are: ', griddy.best_params_)
print('The best accuracies we obtained using the best hyperparameter values are: ', griddy.best_score_)
# The per-fold scores live in accs (GridSearchCV has no .mean()).
print('The Generalization accuracy of tuned, CV model is ', accs.mean())

KeyboardInterrupt: 

In [None]:
# chat 

from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import GridSearchCV, cross_val_score
import pandas as pd

# # Assume apply_freq_encode and 'cats' are defined

# df_data   = cats.head(n=1000).iloc[:, :-1]
# df_labels = cats.head(n=1000).iloc[:, -1:]

tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')

pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', tree)
])

# Hyperparameter grid
HP = {
    "DT__max_depth": [10, 12, 15, 17, 20, 25],
    "DT__max_features": [None, 5, 15, 20, 28],
    "DT__min_samples_leaf": [5, 10, 20, 30, 40, 50]
}

# Grid search with cross-validation
griddy = GridSearchCV(estimator=pipe, param_grid=HP, cv=10, scoring='accuracy')

# 1-D target, computed once and reused below.
df_labels_1d = df_labels.values.ravel()

# Fit on a copy of the features: the final refit hands the whole frame to the
# encoder step, and fitting on df_data itself would let that step overwrite
# the very columns the evaluation below needs in their original form.
griddy.fit(df_data.copy(), df_labels_1d)

# Output results
print('The best parameters for our model are:', griddy.best_params_)
print('The best accuracy we obtained using the best hyperparameter values is:', griddy.best_score_)

# Cross-validated accuracy of the selected pipeline.
# NOTE(review): the hyperparameters were chosen on these same rows, so this
# figure is optimistic; passing `griddy` itself to cross_val_score (nested CV)
# gives an unbiased estimate at a much higher runtime.
accs = cross_val_score(griddy.best_estimator_, X=df_data, y=df_labels_1d, cv=10)
print('The generalization accuracy of the tuned CV model is:', accs.mean())


The best parameters for our model are: {'DT__max_depth': 10, 'DT__max_features': None, 'DT__min_samples_leaf': 5}
The best accuracy we obtained using the best hyperparameter values is: 0.562
The generalization accuracy of the tuned CV model is: 0.56


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline

# Seeded so the same 1000 rows are drawn on every run (same seed as the MLP).
sample = cats.sample(n=1000, random_state=42)
X = sample.iloc[:, :-1]
y = sample.iloc[:, -1:].values.ravel()  # flattened to the 1-D target scikit-learn expects

# Define pipeline: frequency-encode the categorical columns, standardise,
# then fit the neural network.
pipe = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=1000, random_state=42))
])

# Define hyperparameter grid
param_grid = {
    'mlp__hidden_layer_sizes': [(30,), (40,), (50,), (60,)],
    'mlp__activation': ['logistic', 'tanh', 'relu']
}

# Grid search with cross-validation (inner loop)
gs = GridSearchCV(pipe, param_grid=param_grid, cv=5)

# Nested CV: the outer cross_val_score re-runs the grid search in every fold
# to estimate the performance of the tuned model.
accs = cross_val_score(gs, X, y, cv=5)
print(f"Mean cross-validated accuracy: {accs.mean():.4f}")


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [None]:
# # CAT MODEL

cat_tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')  # balanced weights to handle class imbalance
pipeline_cat = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', cat_tree)
])

# make a matrix of HP values to tune
# max_features must be None or at least 1; the former 0 entry is not a valid
# value for DecisionTreeClassifier and is replaced by None (use all features).
HP_values_cat = {
    "DT__max_depth": [100, 125, 150, 175, 200, 250],
    "DT__max_features": [None, 5, 15, 20, 28],
    "DT__min_samples_leaf": [5, 10, 20, 30, 40, 50]
}

# scikit-learn expects a 1-D target; np.ravel accepts a Series or a
# one-column frame alike.
cats_target = np.ravel(cats_labels)

nested_grid_search_cat = GridSearchCV(estimator=pipeline_cat, param_grid=HP_values_cat, cv=10, scoring='accuracy')

# Nested CV: every outer fold tunes a *clone* of the grid search, so this
# yields the generalization estimate but leaves nested_grid_search_cat unfitted.
nested_accs_cat = cross_val_score(nested_grid_search_cat, X=cats_data, y=cats_target, cv=10)

# Fit the search itself so best_params_ / best_score_ exist. A copy is passed
# so the frequency-encoding step cannot alter cats_data.
nested_grid_search_cat.fit(cats_data.copy(), cats_target)

print('The best parameters for our model are: ', nested_grid_search_cat.best_params_)
print('The best accuracies we obtained using the best hyperparameter values are: ', nested_grid_search_cat.best_score_)
print('The Generalization accuracy of tuned, CV model is ', nested_accs_cat.mean())

In [None]:
# DOG MODEL
# Hold out 20% of the dog rows; seeded so the split is the same on every run.
dogs_train, dogs_test, dogs_outcome_train, dogs_outcome_test = train_test_split(
    dogs_data, dogs_labels, test_size=0.20, random_state=42
)

dog_tree = DecisionTreeClassifier(criterion='entropy', class_weight='balanced')  # balanced weights to handle class imbalance

pipeline_dog = Pipeline([
    ('encoder', FunctionTransformer(apply_freq_encode, validate=False)),
    ('DT', dog_tree)
])

# make a matrix of HP values to tune
# max_features must be None or at least 1; the former 0 entry is not a valid
# value for DecisionTreeClassifier and is replaced by None (use all features).
HP_values_dogs = {
    "DT__max_depth": [100, 125, 150, 175, 200, 225, 250, 275, 300, 325, 350, 375, 400, 425, 450, 475, 500],
    "DT__max_features": [None, 5, 10, 15, 20, 25, 28],
    "DT__min_samples_leaf": [5, 10, 20, 30, 40, 50]
}

# scikit-learn expects a 1-D target; np.ravel accepts a Series or a
# one-column frame alike.
dogs_target = np.ravel(dogs_labels)

nested_grid_search_dog = GridSearchCV(estimator=pipeline_dog, param_grid=HP_values_dogs, cv=10, scoring='accuracy')

# Nested CV: every outer fold tunes a *clone* of the grid search, so this
# yields the generalization estimate but leaves nested_grid_search_dog unfitted.
nested_accs_dog = cross_val_score(nested_grid_search_dog, X=dogs_data, y=dogs_target, cv=10)

# Fit the search itself so best_params_ / best_score_ exist. A copy is passed
# so the frequency-encoding step cannot alter dogs_data.
nested_grid_search_dog.fit(dogs_data.copy(), dogs_target)

print('The best parameters for our model are: ', nested_grid_search_dog.best_params_)
print('The best accuracies we obtained using the best hyperparameter values are: ', nested_grid_search_dog.best_score_)
print('The Generalization accuracy of tuned, CV model is ', nested_accs_dog.mean())




# Disabled legacy experiment, kept only as a string literal so it never runs:
# a manual train/test split with freq_encode applied to the train and test
# frames separately, followed by fitting a plain entropy decision tree.
'''
dogs_train, dogs_test, dogs_outcome_train, dogs_outcome_test = train_test_split(dogs_data, dogs_labels, test_size=0.20)

# frequency encode primary color and breed
dogs_train = freq_encode(dogs_train, 'primary_color')
dogs_train = freq_encode(dogs_train, 'breed')

dogs_test = freq_encode(dogs_test, 'primary_color')
dogs_test = freq_encode(dogs_test, 'breed')


dog_tree = DecisionTreeClassifier(criterion='entropy')
dog_tree = dog_tree.fit(dogs_train, dogs_outcome_train) 

'''
# 10-fold CV accuracy on the dog training split.
# The whole pipeline is evaluated rather than the bare tree: dogs_train still
# holds the text columns 'primary_color' and 'breed', which the tree can only
# consume after the pipeline's frequency-encoding step.
accs = cross_val_score(pipeline_dog, X=dogs_train, y=np.ravel(dogs_outcome_train), cv=10)  # np.ravel: 1-D target
print('The Generalization accuracy is ', accs.mean())
# Disabled legacy evaluation and grid-search notes, kept only as a string
# literal so none of it runs.
# NOTE(review): if this is ever revived, it calls accuracy_score, which no
# import visible in this file provides, and its second accuracy computation
# compares predictions made on dogs_test against dogs_outcome_train.
'''
predictions = dog_tree.predict(dogs_test)# predicting on test data

print('Accuracy Score: ', accuracy_score(y_true=dogs_outcome_test, y_pred=predictions))


dogs_test_pred = dog_tree.predict(dogs_test)
accuracy = accuracy_score(y_true=dogs_outcome_train, y_pred=dogs_test_pred)
print("Accuracy:", accuracy)


# #gddy code is before, i have to edit it  - normally we would mess around and look at the best paramets and then do griddy

# inner_tree = DecisionTreeClassifier()

# # make a matrix of HP values to tune
# HP_values = {"max_depth": [10, 12, 15, 17, 20, 22, 25, 27, 30, 32, 35, 37, 40], "max_features" : [0, 5, 10, 15],  "min_samples_leaf": [10, 20, 30]}

# grid_search = GridSearchCV(estimator=inner_tree, param_grid=HP_values, cv=5, scoring='accuracy')
# grid_search.fit(data_feats, data_lbls)


# print('The best parameters for our model are: ', grid_search.best_params_)
# print('The best accuracies we obtained using the best hyperparameter values are: ', grid_search.best_score_)

# nested_tree = DecisionTreeClassifier()

# # we use the same HP_values
# nested_grid_search = GridSearchCV(estimator=nested_tree, param_grid=HP_values, cv=5, scoring='accuracy')
# nested_accs = cross_val_score(nested_grid_search, X=data_feats, y=data_lbls, cv=5) 
# print('The Generalization accuracy of tuned, CV model is ', nested_accs.mean())
'''


In [None]:
# Preview the first rows of the cat subset.
cats.head()