In [None]:
# data analysis and wrangling
import numpy as np
import pandas as pd
from scipy.stats import uniform
from scipy.stats import randint

# visualization
%matplotlib notebook
import seaborn as sns
import matplotlib.pyplot as plt

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

In [None]:
# Read the training split from the mounted Drive folder and preview it.
train_csv_path = '/content/drive/My Drive/Pet Adoption Dataset/train.csv'
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [None]:
# Read the test split from the mounted Drive folder and preview it.
test_csv_path = '/content/drive/My Drive/Pet Adoption Dataset/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,Black,0.87,42.73,0,7
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,Orange Tabby,0.06,6.71,0,1
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,Black,0.24,41.21,0,7
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,Black,0.29,8.46,7,1
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,Brown,0.71,30.92,0,7


In [None]:
# Column dtypes and non-null counts of the training frame (used to spot missing values).
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric training columns.
train_df.describe()

Unnamed: 0,condition,length(m),height(cm),X1,X2,breed_category,pet_category
count,17357.0,18834.0,18834.0,18834.0,18834.0,18834.0,18834.0
mean,0.88339,0.502636,27.448832,5.369598,4.577307,0.600563,1.709143
std,0.770434,0.288705,13.019781,6.572366,3.517763,0.629883,0.717919
min,0.0,0.0,5.0,0.0,0.0,0.0,0.0
25%,0.0,0.25,16.1725,0.0,1.0,0.0,1.0
50%,1.0,0.5,27.34,0.0,4.0,1.0,2.0
75%,1.0,0.76,38.89,13.0,9.0,1.0,2.0
max,2.0,1.0,50.0,19.0,9.0,2.0,4.0


In [None]:
# Feature engineering, applied identically to the training and test frames.
# The replacement value for zero lengths is computed from the training split
# only, so no statistic is derived from the test data.
mean_length = round(train_df['length(m)'].mean(), 2)
data = [train_df, test_df]
for dataset in data:
    # Parse each timestamp column once and reuse the parsed series below.
    issue_ts = pd.to_datetime(dataset['issue_date'])
    listing_ts = pd.to_datetime(dataset['listing_date'])

    # Whole days between issue and listing, plus the calendar parts of each.
    dataset['date_diff'] = (listing_ts - issue_ts).dt.days
    dataset['issue_day'] = issue_ts.dt.day
    dataset['issue_month'] = issue_ts.dt.month
    dataset['issue_year'] = issue_ts.dt.year
    dataset['listing_day'] = listing_ts.dt.day
    dataset['listing_month'] = listing_ts.dt.month
    dataset['listing_year'] = listing_ts.dt.year
    dataset['listing_hour'] = listing_ts.dt.hour

    # Missing 'condition' becomes its own category (3). The result is assigned
    # back instead of using a chained `inplace=True` on the selected column:
    # that form is deprecated and does not modify the frame under pandas
    # Copy-on-Write, which would make the integer cast fail on leftover NaNs.
    dataset['condition'] = dataset['condition'].fillna(3).astype(int)

    # A length of exactly 0 is replaced by the rounded training mean; this
    # also keeps the height/length ratio below finite.
    dataset['length(m)'] = dataset['length(m)'].replace(0, mean_length)
    dataset['height(m)'] = dataset['height(cm)'] / 100
    dataset['height_length_ratio'] = dataset['height(m)'] / dataset['length(m)']

    # X2 can be 0, so a plain division yields inf (or NaN for 0/0). Mask the
    # zero denominators and use 0.0 for those rows so the feature stays finite.
    nonzero_x2 = dataset['X2'].replace(0, np.nan)
    dataset['X1_X2_ratio'] = (dataset['X1'] / nonzero_x2).fillna(0.0)

    # The raw columns are fully represented by the derived features above.
    dataset.drop(columns=['issue_date', 'listing_date', 'height(cm)'], inplace=True)

# The target is stored as float in the CSV; cast it to integer class labels.
train_df['breed_category'] = train_df['breed_category'].astype(int)

train_df.head()

Unnamed: 0,pet_id,condition,color_type,length(m),X1,X2,breed_category,pet_category,date_diff,issue_day,issue_month,issue_year,listing_day,listing_month,listing_year,listing_hour,height(m),height_length_ratio,X1_X2_ratio
0,ANSL_69903,2,Brown Tabby,0.8,13,9,0,1,73,10,7,2016,21,9,2016,16,0.0778,0.09725,1.444444
1,ANSL_66892,1,White,0.72,13,9,0,2,1862,21,11,2013,27,12,2018,17,0.1419,0.197083,1.444444
2,ANSL_69750,3,Brown,0.15,15,4,2,4,752,28,9,2014,19,10,2016,8,0.409,2.726667,3.75
3,ANSL_71623,1,White,0.62,0,1,0,2,755,31,12,2016,25,1,2019,18,0.1782,0.287419,0.0
4,ANSL_57969,2,Black,0.5,18,4,0,1,52,28,9,2017,19,11,2017,9,0.1106,0.2212,4.5


In [None]:
# Preview the test frame to confirm it received the same derived columns as the training frame.
test_df.head()

Unnamed: 0,pet_id,condition,color_type,length(m),X1,X2,date_diff,issue_day,issue_month,issue_year,listing_day,listing_month,listing_year,listing_hour,height(m),height_length_ratio,X1_X2_ratio
0,ANSL_75005,0,Black,0.87,0,7,4404,17,8,2005,7,9,2017,15,0.4273,0.491149,0.0
1,ANSL_76663,1,Orange Tabby,0.06,0,1,174,15,11,2018,8,5,2019,17,0.0671,1.118333,0.0
2,ANSL_58259,1,Black,0.24,0,7,1999,11,10,2012,2,4,2018,16,0.4121,1.717083,0.0
3,ANSL_67171,1,Black,0.29,7,1,1148,13,2,2015,6,4,2018,7,0.0846,0.291724,7.0
4,ANSL_72871,1,Brown,0.71,0,7,463,18,1,2017,26,4,2018,13,0.3092,0.435493,0.0


In [None]:
# One-hot encode the colour label of the training frame, append the indicator
# columns, and remove the original text column.
train_color_dummies = pd.get_dummies(train_df['color_type'], prefix='color')
train_df = pd.concat([train_df, train_color_dummies], axis=1).drop(columns='color_type')
train_df.head()

Unnamed: 0,pet_id,condition,length(m),X1,X2,breed_category,pet_category,date_diff,issue_day,issue_month,issue_year,listing_day,listing_month,listing_year,listing_hour,height(m),height_length_ratio,X1_X2_ratio,color_Agouti,color_Apricot,color_Black,color_Black Brindle,color_Black Smoke,color_Black Tabby,color_Black Tiger,color_Blue,color_Blue Cream,color_Blue Merle,color_Blue Point,color_Blue Smoke,color_Blue Tabby,color_Blue Tick,color_Blue Tiger,color_Brown,color_Brown Brindle,color_Brown Merle,color_Brown Tabby,color_Brown Tiger,color_Buff,color_Calico,color_Calico Point,color_Chocolate,color_Chocolate Point,color_Cream,color_Cream Tabby,color_Fawn,color_Flame Point,color_Gold,color_Gray,color_Gray Tabby,color_Green,color_Lilac Point,color_Liver,color_Liver Tick,color_Lynx Point,color_Orange,color_Orange Tabby,color_Pink,color_Red,color_Red Merle,color_Red Tick,color_Sable,color_Seal Point,color_Silver,color_Silver Lynx Point,color_Silver Tabby,color_Tan,color_Torbie,color_Tortie,color_Tortie Point,color_Tricolor,color_White,color_Yellow,color_Yellow Brindle
0,ANSL_69903,2,0.8,13,9,0,1,73,10,7,2016,21,9,2016,16,0.0778,0.09725,1.444444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ANSL_66892,1,0.72,13,9,0,2,1862,21,11,2013,27,12,2018,17,0.1419,0.197083,1.444444,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,ANSL_69750,3,0.15,15,4,2,4,752,28,9,2014,19,10,2016,8,0.409,2.726667,3.75,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ANSL_71623,1,0.62,0,1,0,2,755,31,12,2016,25,1,2019,18,0.1782,0.287419,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,ANSL_57969,2,0.5,18,4,0,1,52,28,9,2017,19,11,2017,9,0.1106,0.2212,4.5,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# One-hot encode the colour label of the test frame, append the indicator
# columns, and remove the original text column.
test_color_dummies = pd.get_dummies(test_df['color_type'], prefix='color')
test_df = pd.concat([test_df, test_color_dummies], axis=1).drop(columns='color_type')
test_df.head()

Unnamed: 0,pet_id,condition,length(m),X1,X2,date_diff,issue_day,issue_month,issue_year,listing_day,listing_month,listing_year,listing_hour,height(m),height_length_ratio,X1_X2_ratio,color_Agouti,color_Apricot,color_Black,color_Black Brindle,color_Black Smoke,color_Black Tabby,color_Blue,color_Blue Cream,color_Blue Merle,color_Blue Point,color_Blue Smoke,color_Blue Tabby,color_Blue Tick,color_Blue Tiger,color_Brown,color_Brown Brindle,color_Brown Merle,color_Brown Tabby,color_Buff,color_Calico,color_Calico Point,color_Chocolate,color_Chocolate Point,color_Cream,color_Cream Tabby,color_Fawn,color_Flame Point,color_Gold,color_Gray,color_Gray Tabby,color_Green,color_Lilac Point,color_Liver,color_Liver Tick,color_Lynx Point,color_Orange,color_Orange Tabby,color_Pink,color_Red,color_Red Merle,color_Red Tick,color_Sable,color_Seal Point,color_Silver,color_Silver Lynx Point,color_Silver Tabby,color_Tan,color_Torbie,color_Tortie,color_Tortie Point,color_Tricolor,color_White,color_Yellow,color_Yellow Brindle
0,ANSL_75005,0,0.87,0,7,4404,17,8,2005,7,9,2017,15,0.4273,0.491149,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,ANSL_76663,1,0.06,0,1,174,15,11,2018,8,5,2019,17,0.0671,1.118333,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,ANSL_58259,1,0.24,0,7,1999,11,10,2012,2,4,2018,16,0.4121,1.717083,0.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,ANSL_67171,1,0.29,7,1,1148,13,2,2015,6,4,2018,7,0.0846,0.291724,7.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,ANSL_72871,1,0.71,0,7,463,18,1,2017,26,4,2018,13,0.3092,0.435493,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Columns present in the training frame but missing from the test frame
# (the targets plus any colour seen only in training).
train_cols, test_cols = train_df.columns, test_df.columns
not_in_test = train_cols.difference(test_cols)
not_in_test

Index(['breed_category', 'color_Black Tiger', 'color_Brown Tiger',
       'pet_category'],
      dtype='object')

In [None]:
# Remove the one-hot colour columns that exist only in the training frame.
# The list is computed from the column difference instead of being hard-coded,
# so re-running the cell drops nothing rather than raising a KeyError. The
# target columns are kept because their names do not start with "color_".
train_only_colors = [
    col for col in train_df.columns.difference(test_df.columns)
    if col.startswith('color_')
]
train_df.drop(columns=train_only_colors, inplace=True)

In [None]:
# train_df.groupby(['condition', 'pet_category'], as_index=False).size()

In [None]:
# %matplotlib inline
# train_df.plot.scatter(x='condition',
#                       y='pet_category',
#                       c='DarkBlue')

In [None]:
# Standard scaling is currently disabled: the "_ss" frame is a plain copy of
# train_df, so later cells train on unscaled features.
# NOTE(review): the disabled snippet below lists 'height(cm)', which the
# feature-engineering cell drops in favour of 'height(m)'; update the list
# before re-enabling it.
# train_features = ['height(cm)',	'X1',	'X2','date_diff']
# scaler = StandardScaler()
train_df_ss=train_df.copy()
# train_df_ss[train_features] = scaler.fit_transform(train_df_ss[train_features])
# train_df_ss.head(10)

In [None]:
# Test-side copy matching train_df_ss; the scaling step below is disabled,
# so this is an unscaled copy of test_df.
test_df_ss=test_df.copy()
# test_df_ss[train_features] = scaler.transform(test_df_ss[train_features])
# test_df_ss.head(10)

In [None]:
# Split the prepared frames into the feature matrix and the two targets.
# 'pet_id' is an identifier, and both targets are removed from the features.
X_train = train_df_ss.drop(columns=['pet_id', 'breed_category', 'pet_category'])
Y1_train = train_df_ss['breed_category']
Y2_train = train_df_ss['pet_category']

# Put the test matrix on exactly the training columns, in the same order.
# Colour indicators are encoded separately per split, so a colour that occurs
# in only one split would otherwise give the model mismatched inputs. Extra
# test-only columns are discarded and any column absent from the test split
# is filled with 0.
X_test = (
    test_df_ss
    .drop(columns=['pet_id'])
    .reindex(columns=X_train.columns, fill_value=0)
)

### Random Forest

In [None]:
# Random Forest

# random_forest1 = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=1)
# random_forest1.fit(X_train, Y1_train)
# Y1_pred = random_forest1.predict(X_test)
# acc_random_forest1 = round(random_forest1.score(X_train, Y1_train) * 100, 2)
# acc_random_forest1

In [None]:
# Random Forest

# random_forest2 = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=1)
# random_forest2.fit(X_train, Y2_train)
# Y2_pred = random_forest2.predict(X_test)
# acc_random_forest2 = round(random_forest2.score(X_train, Y2_train) * 100, 2)
# acc_random_forest2

In [None]:
# submission = pd.DataFrame({'pet_id': test_df['pet_id'],'breed_category': Y1_pred,'pet_category': Y2_pred})
# submission.to_csv('random_forest_submission.csv', index=False)

### SVC

In [None]:
# Support Vector Machines

# svc1 = SVC()
# svc1.fit(X_train, Y1_train)
# Y1_pred = svc1.predict(X_test)
# acc_svc1 = round(svc1.score(X_train, Y1_train) * 100, 2)
# acc_svc1

In [None]:
# Support Vector Machines

# svc2 = SVC()
# svc2.fit(X_train, Y2_train)
# Y2_pred = svc2.predict(X_test)
# acc_svc2 = round(svc2.score(X_train, Y2_train) * 100, 2)
# acc_svc2

In [None]:
# submission = pd.DataFrame({'pet_id': test_df['pet_id'],'breed_category': Y1_pred,'pet_category': Y2_pred})
# submission.to_csv('svc_submission.csv', index=False)

In [None]:
# param_grid = {'C': [0.1, 1, 10],  
#               'gamma': [1, 0.1, 0.01], 
#               'kernel': ['rbf','poly']}  
# svc = SVC()
# clf = GridSearchCV(estimator=svc, param_grid=param_grid, n_jobs=-1)
# clf.fit(X_train, Y2_train)
# clf.best_params_, clf.best_score_

### XGB

In [None]:
# param_grid = {# 'n_estimators': [300,500,800,1000],
#               'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
#               # 'subsample': [0.3,0.6,0.9],
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               # 'colsample_bytree': [0.5,0.7,0.9],
#               # 'min_child_weight': [1, 2, 3, 4]
#               }
# xgb = XGBClassifier()
# clf = RandomizedSearchCV(xgb, param_grid, n_jobs=-1, random_state=42)
# clf.fit(X_train, Y1_train)
# clf.best_params_, clf.best_score_

In [None]:
# param_grid = {'n_estimators': [300,500,800,1000],
#               'learning_rate': [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
#               'subsample': [0.3,0.6,0.9],
#               'max_depth': [3, 4, 5, 6, 7, 8, 9],
#               'colsample_bytree': [0.5,0.7,0.9],
#               'min_child_weight': [1, 2, 3, 4]
#               }
# xgb = XGBClassifier()
# clf = RandomizedSearchCV(xgb, param_grid, n_jobs=-1, random_state=42)
# clf.fit(X_train, Y2_train)
# clf.best_params_, clf.best_score_

In [None]:
# XGBoost # colsample_bytree= 0.9,learning_rate= 0.25,max_depth= 3,min_child_weight= 2,n_estimators= 300,subsample= 0.9

# xgb1 = XGBClassifier(learning_rate= 0.1, max_depth= 3, n_estimators= 300)
# xgb1.fit(X_train, Y1_train)
# Y1_pred = xgb1.predict(X_test)
# acc_xgb1 = round(xgb1.score(X_train, Y1_train) * 100, 2)
# acc_xgb1

In [None]:
# XGBoost # colsample_bytree= 0.9,learning_rate= 0.25,max_depth= 3,min_child_weight= 2,n_estimators= 300,subsample= 0.9

# xgb2 = XGBClassifier(learning_rate= 0.1, max_depth= 4, n_estimators= 300)
# xgb2.fit(X_train, Y2_train)
# Y2_pred = xgb2.predict(X_test)
# acc_xgb2 = round(xgb2.score(X_train, Y2_train) * 100, 2)
# acc_xgb2

In [None]:
# submission = pd.DataFrame({'pet_id': test_df['pet_id'],'breed_category': Y1_pred,'pet_category': Y2_pred})
# submission.to_csv('xgb_submission.csv', index=False)

### LGBM

In [None]:
# param_grid ={'num_leaves': [2,5,10,15,30,45],
#              'min_child_samples': [2,5,7,10,15], 
#              'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2],
#              'learning_rate': [0.001,0.01,0.1,1,10],
#              'min_data_in_leaf': [5,10,20,30,40,50,60],
#             #  'subsample': uniform(loc=0.2, scale=0.8), 
#             #  'colsample_bytree': uniform(loc=0.4, scale=0.6),
#              'reg_alpha': [0.01,0.1,0,1,10],
#              'reg_lambda': [0.01,0.1,0,1,10],
#              'max_depth': [1,3,5,10,15,20,25,-1]
#              }
# lgbm = LGBMClassifier()
# clf = RandomizedSearchCV(lgbm, param_grid, n_jobs=-1, random_state=42)
# clf.fit(X_train, Y1_train)
# clf.best_params_, clf.best_score_

In [None]:
# param_grid ={'num_leaves': [6,15,30,45,60,75,90],
#              'min_child_samples': [2,5,7,10,15], 
#              'min_child_weight': [1e-3, 1e-2, 1e-1, 1, 1e1, 1e2],
#              'learning_rate': [0.001,0.01,0.1,1,10],
#              'min_data_in_leaf': [5,10,20,30,40],
#             #  'subsample': uniform(loc=0.2, scale=0.8), 
#             #  'colsample_bytree': uniform(loc=0.4, scale=0.6),
#              'reg_alpha': [0.01,0.1,0,1,10],
#              'reg_lambda': [0.01,0.1,0,1,10],
#              'max_depth': [1,3,5,10,15,20,-1]
#              }
# lgbm = LGBMClassifier()
# clf = RandomizedSearchCV(lgbm, param_grid, n_jobs=-1, random_state=42)
# clf.fit(X_train, Y2_train)
# clf.best_params_, clf.best_score_

In [None]:
# LightGBM classifier for the first target (breed_category).
# Parameter notes kept from the original cell:
#   min_child_samples= 20, min_child_weight= 0.01, reg_alpha= 0.1, reg_lambda= 0.01
#   learning_rate= 0.1, max_depth= 6, min_data_in_leaf= 20, num_leaves= 45
lgbm1_params = {
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_data_in_leaf': 20,
    'num_leaves': 45,
}
lgbm1 = LGBMClassifier(**lgbm1_params)
lgbm1.fit(X_train, Y1_train)
Y1_pred = lgbm1.predict(X_test)

# Score on the data the model was fitted on, as a percentage rounded to two
# decimals; this is a training-set figure, not a validation estimate.
train_score_1 = lgbm1.score(X_train, Y1_train)
acc_lgbm1 = round(train_score_1 * 100, 2)
acc_lgbm1

93.89

In [None]:
# LightGBM classifier for the second target (pet_category).
# Parameter notes kept from the original cell:
#   min_child_samples= 20, min_child_weight= 1, num_leaves= 80
#   learning_rate= 0.1, max_depth= 8, min_data_in_leaf= 20, num_leaves= 150
lgbm2_params = {
    'learning_rate': 0.1,
    'max_depth': 8,
    'min_data_in_leaf': 20,
    'num_leaves': 150,
}
lgbm2 = LGBMClassifier(**lgbm2_params)
lgbm2.fit(X_train, Y2_train)
Y2_pred = lgbm2.predict(X_test)

# Score on the data the model was fitted on, as a percentage rounded to two
# decimals; this is a training-set figure, not a validation estimate.
train_score_2 = lgbm2.score(X_train, Y2_train)
acc_lgbm2 = round(train_score_2 * 100, 2)
acc_lgbm2

92.42

In [None]:
# Assemble the submission table: one row per test pet with both predicted
# labels, written without the index column.
submission_columns = {
    'pet_id': test_df['pet_id'],
    'breed_category': Y1_pred,
    'pet_category': Y2_pred,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('lgbm_submission.csv', index=False)

### KNN

In [None]:
# KNN or k-Nearest Neighbors

# knn1 = KNeighborsClassifier(n_neighbors=7)
# knn1.fit(X_train, Y1_train)
# Y1_pred = knn1.predict(X_test)
# acc_knn1 = round(knn1.score(X_train, Y1_train) * 100, 2)
# acc_knn1

In [None]:
# KNN or k-Nearest Neighbors

# knn2 = KNeighborsClassifier(n_neighbors=7)
# knn2.fit(X_train, Y2_train)
# Y2_pred = knn2.predict(X_test)
# acc_knn2 = round(knn2.score(X_train, Y2_train) * 100, 2)
# acc_knn2

In [None]:
# submission = pd.DataFrame({'pet_id': test_df['pet_id'],'breed_category': Y1_pred,'pet_category': Y2_pred})
# submission.to_csv('knn_submission.csv', index=False)