In [2]:
import numpy as np
import pandas as pd
import matplotlib

In [3]:
from Animal_Adoption.preproc.data import get_data

In [4]:
df = get_data()

TypeError: get_data() missing 1 required positional argument: 'animal_type'

In [4]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [5]:
# Colors

colors = df['color'].map(lambda x: " ".join(x.split("/")))
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(colors)
arr = X.toarray()

In [6]:
vectorizer.get_feature_names_out()

array(['beige', 'black', 'brown', 'gray', 'orange', 'point', 'smoke',
       'spotted', 'striped', 'tricolor', 'white'], dtype=object)

In [7]:
df[vectorizer.get_feature_names_out()] = arr

In [8]:
# Define the features and target variable
features = ['age_upon_intake_(years)', 'animal_type', 'breed',
            'intake_condition', 'sex', 'sex_type'] + list(vectorizer.get_feature_names_out())

target = 'time_in_shelter_days'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

In [9]:
# Ensure that the shapes are correct
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (53768, 17)
Shape of y_train: (53768,)
Shape of X_test: (13443, 17)
Shape of y_test: (13443,)


In [10]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [11]:
# create a list of numerical columns and categorical columns
numeric_features = ['age_upon_intake_(years)']

categorical_features = ['animal_type', 'breed',
            'intake_condition', 'sex', 'sex_type']


# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

In [12]:
# create the pipeline with the preprocessor
pipeline = make_pipeline(preprocessor)

In [13]:
pipeline

In [14]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Transform
X_train_transformed = pd.DataFrame(pipeline.transform(X_train))
X_train_transformed



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,0.623992,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
1,-0.630677,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
2,-0.378827,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-0.378827,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,1.292538,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53763,4.635267,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
53764,-0.378827,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
53765,-0.706689,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
53766,2.295357,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [16]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

# Add RandomForestRegressor to the pipeline

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [17]:
model_pipeline

In [18]:
model_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler())]),
                                    ['age_upon_intake_(years)']),
                                   ('cat',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    ['animal_type', 'breed', 'intake_condition',
                                     'sex', 'sex_type'])])),
  ('regressor', RandomForestRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('standardscaler',
                                                   StandardScaler())]),
         

In [19]:
# Fit the pipeline on the training data
model_pipeline.fit(X_train, y_train)



In [20]:
# make predictions on the test set
y_pred = model_pipeline.predict(X_test)

In [21]:
y_pred

array([ 6.29527786,  2.4273445 ,  4.27644363, ..., 55.05249224,
       16.44681427, 35.80222031])

In [22]:
# Evaluate the model using mean squared error
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1703.9822156303285


In [23]:
# Calculate evaluation metrics r2
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)

R-squared: 0.07506856005413798


In [55]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to search over
param_grid = {
    'regressor__n_estimators': [150, 200, 250, 300],
    'regressor__max_depth': [8, 12],
    'regressor__min_samples_split': [6, 12],
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model_pipeline, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)



In [56]:
model_grid = grid_search.best_estimator_

In [57]:
model_grid

In [58]:
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.1053771320460225
{'regressor__max_depth': 8, 'regressor__min_samples_split': 12, 'regressor__n_estimators': 300}
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  ['age_upon_intake_(years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['animal_type', 'breed',
                                                   'intake_condition', 'sex',
                                           