In [1]:
import numpy as np
import pandas as pd
import matplotlib

In [2]:
from Animal_Adoption.preproc.data import get_data

In [3]:
df = get_data()

In [4]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [129]:
from sklearn.base import TransformerMixin, BaseEstimator

class ColorTransformer(TransformerMixin, BaseEstimator):
    
    def __init__(self):
        super()
        self.vectorizer = CountVectorizer()
        
    def fit(self, X, y=None):
        X = X['color'].map(lambda x: " ".join(x.split("/")))
        self.vectorizer.fit(X)
        return self
    
    def transform(self, X, y=None):
        X = X['color'].map(lambda x: " ".join(x.split("/")))
        transformed = self.vectorizer.transform(X)
        columns = self.vectorizer.get_feature_names_out()
        return pd.DataFrame(transformed.toarray(), columns=columns)

In [124]:
color_transformer = ColorTransformer()

In [125]:
df.columns

Index(['animal_id_outcome', 'date_of_birth', 'outcome_type',
       'sex_upon_outcome', 'age_upon_intake', 'animal_type', 'breed', 'color',
       'intake_condition', 'intake_type', 'sex_upon_intake',
       'age_upon_intake_(years)', 'intake_datetime', 'intake_month',
       'intake_year', 'intake_weekday', 'intake_hour', 'intake_number',
       'time_in_shelter_days', 'sex_type', 'sex', 'beige', 'black', 'brown',
       'gray', 'orange', 'point', 'smoke', 'spotted', 'striped', 'tricolor',
       'white'],
      dtype='object')

In [126]:
df['color'].shape

(67211,)

In [127]:
df[['color']]

Unnamed: 0,color
0,Beige/White
3,Tricolor
4,Brown/White
5,Black/White
6,Black/Gray
...,...
79665,Gray
79666,Gray
79667,Striped
79670,Black/White


In [128]:
color_transformer.fit_transform(df[['color']])

         color
0  Beige/White
3     Tricolor
4  Brown/White
5  Black/White
6   Black/Gray
(67211, 11)
['beige' 'black' 'brown' 'gray' 'orange' 'point' 'smoke' 'spotted'
 'striped' 'tricolor' 'white']


Unnamed: 0,beige,black,brown,gray,orange,point,smoke,spotted,striped,tricolor,white
0,1,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,0,1
3,0,1,0,0,0,0,0,0,0,0,1
4,0,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
67206,0,0,0,1,0,0,0,0,0,0,0
67207,0,0,0,1,0,0,0,0,0,0,0
67208,0,0,0,0,0,0,0,0,1,0,0
67209,0,1,0,0,0,0,0,0,0,0,1


In [None]:
#def color_to_array(series):
 #   vectorizer = CountVectorizer()
  #  X = vectorizer.fit_transform(colors)
   # arr = X.toarray()

In [5]:
# Colors
#colors = df['color'].map(lambda x: " ".join(x.split("/")))
#from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(colors)
#arr = X.toarray()

In [None]:
#vectorizer.get_feature_names_out()

In [7]:
#df[vectorizer.get_feature_names_out()] = arr

In [47]:
# Define the features and target variable
features = ['age_upon_intake_(years)', 'animal_type', 'breed',
            'intake_condition', 'sex', 'sex_type', 'color']

target = 'time_in_shelter_days'

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(df[features], df[target], test_size=0.2, random_state=42)

In [48]:
# Ensure that the shapes are correct
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (53768, 7)
Shape of y_train: (53768,)
Shape of X_test: (13443, 7)
Shape of y_test: (13443,)


In [49]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split

In [130]:
# create a list of numerical columns and categorical columns
numeric_features = ['age_upon_intake_(years)']

categorical_features = ['animal_type', 'breed',
            'intake_condition', 'sex', 'sex_type']


# create the transformer for numerical columns
numeric_transformer = make_pipeline(StandardScaler())

# create the transformer for categorical columns
categorical_transformer = make_pipeline(OneHotEncoder(sparse=False, handle_unknown='ignore'))


# use ColumnTransformer to specify which columns need to be preprocessed in what way
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
        ('color', ColorTransformer(), ['color'])
    ])

In [137]:
df[categorical_features].nunique().sum()

22

In [131]:
# create the pipeline for linear regression with preprocessor
pipeline = make_pipeline(preprocessor)

In [132]:
pipeline

In [133]:
# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Transform
X_train_transformed = pd.DataFrame(pipeline.transform(X_train))
X_train_transformed



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24,25,26,27,28,29,30,31,32,33
0,0.623992,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.630677,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.378827,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,-0.378827,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.292538,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53763,4.635267,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53764,-0.378827,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53765,-0.706689,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
53766,2.295357,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [138]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

# Add RandomForestRegressor to the pipeline

model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GradientBoostingRegressor())
])

In [139]:
model_pipeline

In [140]:
model_pipeline.get_params()

{'memory': None,
 'steps': [('preprocessor',
   ColumnTransformer(transformers=[('num',
                                    Pipeline(steps=[('standardscaler',
                                                     StandardScaler())]),
                                    ['age_upon_intake_(years)']),
                                   ('cat',
                                    Pipeline(steps=[('onehotencoder',
                                                     OneHotEncoder(handle_unknown='ignore',
                                                                   sparse=False))]),
                                    ['animal_type', 'breed', 'intake_condition',
                                     'sex', 'sex_type']),
                                   ('color', ColorTransformer(), ['color'])])),
  ('regressor', GradientBoostingRegressor())],
 'verbose': False,
 'preprocessor': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('standardscaler',


In [141]:
# Fit the pipeline on the training data
model_pipeline.fit(X_train, y_train)



In [142]:
# make predictions on the test set
y_pred = model_pipeline.predict(X_test)

In [143]:
y_pred

array([ 8.4142148 ,  6.69002992,  3.81679409, ..., 55.55383568,
       17.44675289, 35.81514997])

In [144]:
# Evaluate the model using mean squared error
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 1651.199837748528


In [145]:
# Calculate evaluation metrics r2
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_pred)
print('R-squared:', r2)

R-squared: 0.10371914122227599


In [146]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameters to search over
param_grid = {
    'regressor__n_estimators': [150, 200, 250, 300],
    'regressor__max_depth': [8, 12],
    'regressor__min_samples_split': [6, 12],
}

# Create the GridSearchCV object
grid_search = GridSearchCV(model_pipeline, param_grid=param_grid, cv=5)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)







In [147]:
pipeline_best_model = grid_search.best_estimator_

In [29]:
pipeline_best_model

In [30]:
print(grid_search.best_score_)
print(grid_search.best_params_)
print(grid_search.best_estimator_)

0.08539510464204525
{'regressor__max_depth': 8, 'regressor__min_samples_split': 12, 'regressor__n_estimators': 150}
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('standardscaler',
                                                                   StandardScaler())]),
                                                  ['age_upon_intake_(years)']),
                                                 ('cat',
                                                  Pipeline(steps=[('onehotencoder',
                                                                   OneHotEncoder(handle_unknown='ignore',
                                                                                 sparse=False))]),
                                                  ['animal_type', 'breed',
                                                   'intake_condition', 'sex',
                                          

In [32]:
# Export Pipeline as picke file
import pickle
with open("pipeline_best_model.pkl", "wb") as file:
    pickle.dump(pipeline_best_model, file)    

In [None]:
with open():
    pipeline = pickle.load()

In [None]:
pipeline.predict()