## Random_Forest_Classifier DOG

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Data Processing
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [17]:
from Animal_Adoption.preproc.data_dog import get_data_dogs
df = get_data_dogs()

In [None]:
df.info()

In [18]:
df.isnull().sum()

animal_id_outcome                      0
date_of_birth                          0
outcome_type                           0
sex_upon_outcome                       0
age_upon_intake                        0
animal_id_intake                       0
animal_type                            0
breed                                  0
color                                  0
intake_condition                       0
intake_type                            0
sex_upon_intake                        0
age_upon_intake_(years)                0
intake_datetime                        0
intake_number                          0
time_in_shelter_days                   0
sex_type                               0
sex                                    0
breed_new                              0
breed_2classes                         0
outcome_type_2classes                  0
time_in_shelter_days_round             0
time_in_shelter_class                  0
time_in_shelter_days_round_8classes    0
time_in_shelter_

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38882 entries, 0 to 38881
Data columns (total 52 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   animal_id_outcome                    38880 non-null  object        
 1   date_of_birth                        38880 non-null  datetime64[ns]
 2   outcome_type                         38880 non-null  object        
 3   sex_upon_outcome                     38880 non-null  object        
 4   age_upon_intake                      38880 non-null  object        
 5   animal_id_intake                     38880 non-null  object        
 6   animal_type                          38880 non-null  object        
 7   breed                                38882 non-null  object        
 8   color                                38880 non-null  object        
 9   intake_condition                     38880 non-null  object        
 10  intake_typ

## TRAIN_TEST_DATASET- TARGET 8 classes

In [80]:
# Split the data into features (X) and target (y)
# 'intake_condition_2classes, sex, sex_type, animal_type, breed_2classes,\
# color_3classes, outcome_type_2classes, age_upon_intake_(years)

# add 	- 'good_with_children', 'good_with_other_dogs', 'shedding', 'grooming', \
# 	- 'drooling', 'good_with_strangers', 'playfulness', 'protectiveness', 'trainability',\
# 'energy', 'barking''coat_length', 'min_life_expectancy', 'max_life_expectancy',\

# 		○ 'max_height_male', 'max_weight_male', 'min_height_male', 'min_weight_male'
# 'max_height_female', 'max_weight_female', 'min_height_female', 'min_weight_female'

X = df[['age_upon_intake_(years)', 'intake_condition_2classes', 'sex', 'sex_type', 'breed', 'color_3classes',\
    'outcome_type_2classes','good_with_children', 'good_with_other_dogs', 'shedding', 'grooming', \
        'drooling', 'good_with_strangers', 'playfulness', 'protectiveness', 'trainability',\
        'energy', 'barking','coat_length', 'min_life_expectancy', 'max_life_expectancy' ]]
y = df['time_in_shelter_days_round_8classes'] # next one with 'time_in_shelter_days_round_5classes', 'time_in_shelter_days_round_2classes'


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [81]:
# create a list of numerical columns and categorical columns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['age_upon_intake_(years)', 'good_with_children', 'good_with_other_dogs', 'shedding', 'grooming', \
        'drooling', 'good_with_strangers', 'playfulness', 'protectiveness', 'trainability',\
        'energy', 'barking','coat_length', 'min_life_expectancy', 'max_life_expectancy']

categorical_features = ['intake_condition_2classes', 'sex', 'sex_type', 'breed', 'color_3classes',\
    'outcome_type_2classes']

# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = make_pipeline(preprocessor, RandomForestClassifier())
pipeline.fit(X_train,y_train)
pipeline.predict(X_test)



array(['between 6 and 10 days', 'between 1 and 5 days', 'several hours',
       ..., 'between 1 and 5 days', 'between 1 and 5 days',
       'between 1 and 5 days'], dtype=object)

In [82]:
pipeline.score(X_test,y_test)

0.43020667726550077

## TRAIN_TEST_DATASET- TARGET 2 classes

In [9]:
# Split the data into features (X) and target (y)

X = df[['intake_condition_2classes', 'sex', 'sex_type',\
    'animal_type', 'breed_2classes', 'age_upon_intake_(years)', 'color_3classes',\
    'outcome_type_2classes']]
y = df['time_in_shelter_days_round_2classes'] # next one with 'time_in_shelter_days_round_5classes', 'time_in_shelter_days_round_2classes'


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [10]:
# create a list of numerical columns and categorical columns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['age_upon_intake_(years)']

categorical_features = ['intake_condition_2classes', 'sex', 'sex_type',\
    'animal_type', 'breed_2classes', 'color_3classes',\
    'outcome_type_2classes']

# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = make_pipeline(preprocessor, RandomForestClassifier())
pipeline.fit(X_train,y_train)
pipeline.predict(X_test)



array(['one week', 'one week', 'one week', ..., 'more than one week',
       'one week', 'one week'], dtype=object)

In [11]:
pipeline.score(X_test,y_test)

0.6952590959206174

## TRAIN_TEST_DATASET- TARGET numerical

In [12]:
# Split the data into features (X) and target (y)

X = df[['intake_condition_2classes', 'sex', 'sex_type',\
    'animal_type', 'breed_2classes', 'age_upon_intake_(years)', 'color_3classes',\
    'outcome_type_2classes']]
y = df['time_in_shelter_days_round'] # next one with 'time_in_shelter_days_round_5classes', 'time_in_shelter_days_round_2classes'


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [13]:
# create a list of numerical columns and categorical columns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['age_upon_intake_(years)']

categorical_features = ['intake_condition_2classes', 'sex', 'sex_type',\
    'animal_type', 'breed_2classes', 'color_3classes',\
    'outcome_type_2classes']

# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = make_pipeline(preprocessor, RandomForestClassifier())
pipeline.fit(X_train,y_train)
pipeline.predict(X_test)



array([1, 5, 4, ..., 0, 4, 4])

In [14]:
pipeline.score(X_test,y_test)

0.20606394707828005

## AUROC- for binary target

In [50]:
# Import the necessary libraries
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [57]:
# Create a datatset using the 'make_classification' function
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=0, random_state=42)
X = df[['intake_condition_2classes', 'sex', 'sex_type', 'breed_2classes', 'age_upon_intake_(years)', 'color_3classes',\
    'outcome_type_2classes']]
y = df['time_in_shelter_days_round_2classes'] # next one with 'time_in_shelter_days_round_5classes', 'time_in_shelter_days_round_2classes'


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [58]:
# create a list of numerical columns and categorical columns
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

numeric_features = ['age_upon_intake_(years)']

categorical_features = ['intake_condition_2classes', 'sex', 'sex_type', 'breed_2classes', 'color_3classes',\
    'outcome_type_2classes']

# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

pipeline = make_pipeline(preprocessor, RandomForestClassifier())
pipeline.fit(X_train,y_train)
pipeline.predict(X_test)



array(['one week', 'one week', 'one week', ..., 'more than one week',
       'one week', 'one week'], dtype=object)

In [59]:
pipeline.fit(X_train, y_train)



In [60]:
# predict the probabilities of the positive class (i.e., class 1) on the testing set
y_pred_proba = pipeline.predict_proba(X_test)[:, 1]

In [61]:
# compute the AUROC using the 'roc_auc_score' function
auroc = roc_auc_score(y_test, y_pred_proba)
print("AUROC score: {:.3f}".format(auroc))

AUROC score: 0.732


## Crossvalidate the pipeline

In [74]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('randomforestclassifier', RandomForestClassifier())
])

# Example feature matrix X and target vector y
# X = [[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]]
# y = [0, 1, 0, 1]

# Compute cross-validation scores
scores = cross_val_score(pipeline, X, y, cv=5)

# Print mean and standard deviation of scores
print("Accuracy: {:.2f} (+/- {:.2f})".format(scores.mean(), scores.std() * 2))

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/base.py", line 862, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 824, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 861, in partial_fit
    X = self._validate_data(
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/base.py", line 546, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'not normal'

--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 401, in fit
    Xt = self._fit(X, y, **fit_params_steps)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 359, in _fit
    X, fitted_transformer = fit_transform_one_cached(
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/joblib/memory.py", line 349, in __call__
    return self.func(*args, **kwargs)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/pipeline.py", line 893, in _fit_transform_one
    res = transformer.fit_transform(X, y, **fit_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/_set_output.py", line 142, in wrapped
    data_to_wrap = f(self, X, *args, **kwargs)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/base.py", line 862, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 824, in fit
    return self.partial_fit(X, y, sample_weight)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/preprocessing/_data.py", line 861, in partial_fit
    X = self._validate_data(
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/base.py", line 546, in _validate_data
    X = check_array(X, input_name="X", **check_params)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/validation.py", line 879, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/sklearn/utils/_array_api.py", line 185, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
  File "/home/wiesemann/.pyenv/versions/3.10.6/envs/animals/lib/python3.10/site-packages/pandas/core/generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
ValueError: could not convert string to float: 'normal'


## GridSearch- hyperparameter tuning

In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [72]:
# Create a datatset using the 'make_classification' function
#X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=0, random_state=42)
X = df[['intake_condition_2classes', 'sex', 'sex_type', 'breed_2classes', 'age_upon_intake_(years)', 'color_3classes',\
    'outcome_type_2classes']]
y = df['time_in_shelter_days_round_2classes'] # next one with 'time_in_shelter_days_round_5classes', 'time_in_shelter_days_round_2classes'


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [73]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('randomforestclassifier', RandomForestClassifier())
])

param_grid = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__max_depth': [3, 5, 7],
    'randomforestclassifier__max_features': [3, 5, 7, 10]
}

grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

print("Best hyperparameters: ", grid_search.best_params_)
print("Best accuracy score: {:.3f}".format(grid_search.best_score_))



Best hyperparameters:  {'randomforestclassifier__max_depth': 7, 'randomforestclassifier__max_features': 7, 'randomforestclassifier__n_estimators': 50}
Best accuracy score: 0.700


In [None]:
# # target: time_in_shelter_days_round
# Best hyperparameters:  {'randomforestclassifier__max_depth': 7, 'randomforestclassifier__max_features': 10, 'randomforestclassifier__n_estimators': 50}
# Best accuracy score: 0.216

## target: time_in_shelter_days_round_2classes
# Best hyperparameters:  {'randomforestclassifier__max_depth': 7, 'randomforestclassifier__max_features': 7, 'randomforestclassifier__n_estimators': 50}
# Best accuracy score: 0.700
