In [1]:
# Load the dataset
import pandas as pd
import numpy as np

df = pd.read_csv('pets_prepared.csv')
df.head()

Unnamed: 0,PetID,AdoptionSpeed,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,...,HealthName,VaccinatedName,DewormedName,SterilizedName,BreedName,BreedBinsName,StateName,StateBinsName,ColorName,AdoptedName
0,86e1089a3,2,2,Nibble,3,299,0,1,1,7,...,Healthy,No,No,No,Tabby,Tabby,Selangor,Selangor,Black,Y
1,6296e909a,0,2,No Name Yet,1,265,0,1,1,2,...,Healthy,Not Sure,Not Sure,Not Sure,Domestic Medium Hair,Domestic Medium Hair,Kuala Lumpur,Kuala Lumpur,Black,Y
2,3422e4906,3,1,Brisco,1,307,0,1,2,7,...,Healthy,Yes,Yes,No,Mixed Breed,Mixed Breed,Selangor,Selangor,Brown,Y
3,5842f1ff5,2,1,Miko,4,307,0,2,1,2,...,Healthy,Yes,Yes,No,Mixed Breed,Mixed Breed,Kuala Lumpur,Kuala Lumpur,Black,Y
4,850a43f90,2,1,Hunter,1,307,0,1,1,0,...,Healthy,No,No,No,Mixed Breed,Mixed Breed,Selangor,Selangor,Black,Y


In [12]:
# Investigate the nature of data in a few columns
df['AgeBins'].value_counts()

[  3,  6)    3670
2            3503
[  0,  2)    2483
[ 24, 60)    1601
[  6, 12)    1588
[ 12, 24)    1557
[ 60,255]     591
Name: AgeBins, dtype: int64

In [None]:
df[['Age', 'AgeBins']].head(50)

# Seems strange there are so many pets in the '2' category.

In [13]:
df['BreedBins'].value_counts()

307       5927
266       3634
Others    3272
265       1258
299        342
264        296
292        264
Name: BreedBins, dtype: int64

In [15]:
df[['BreedBins', 'BreedName']].head(50)

# 307 corresponds with Mixed Breed
# All the less common breeds are grouped together in the 'Other' category
# We may want to encode the less common breeds as well depending on the entropy of the column

Unnamed: 0,BreedBins,BreedName
0,299,Tabby
1,265,Domestic Medium Hair
2,307,Mixed Breed
3,307,Mixed Breed
4,307,Mixed Breed
5,266,Domestic Short Hair
6,264,Domestic Long Hair
7,307,Mixed Breed
8,265,Domestic Medium Hair
9,265,Domestic Medium Hair


In [42]:
# Investigate the frequency of the "Breed1" column
df['Breed1'].value_counts()

307    5927
266    3634
265    1258
299     342
264     296
       ... 
176       1
214       1
125       1
123       1
81        1
Name: Breed1, Length: 176, dtype: int64

In [43]:
# Investigate the frequency of the "Breed2" column
df['Breed2'].value_counts()

0      10762
307     1727
266      599
265      321
299      138
       ...  
104        1
36         1
17         1
257        1
279        1
Name: Breed2, Length: 135, dtype: int64

In [None]:
# The number of unique values in the "Breed1" and "Breed2" columns is quite high. 
# I suspect that Breed can be encompassed by FurLength and BreedPure since majority of pets 
# are mixed breed or domestic

In [50]:
# Change all columns into categorical columns
y = df['AdoptionSpeed']

# Column 32 onwards are all derived values or labels which have already been encoded
# We will keep BreedPure, ColorAmt and NameorNO though because the information may be lost when we remove the original columns
x = df.iloc[:,:34].drop('AdoptionSpeed', axis=1)

# We may deal with these columns later, they are difficult to encode/likely not useful for analysis
x = x.drop(['PetID', 'Name', 'Description', 'RescuerID'], axis=1)

# Remove numerical columns
x = x.drop(['Breed1', 'Breed2', 'Age', 'Quantity', 'Fee', 'State', 'VideoAmt', 'PhotoAmt'], axis=1)

# Find columns with missing values
x.isnull().sum()

# No missing values!

Type            0
Gender          0
Color1          0
Color2          0
Color3          0
MaturitySize    0
FurLength       0
Vaccinated      0
Dewormed        0
Sterilized      0
Health          0
AgeBins         0
FeeBins         0
BreedBins       0
StateBins       0
VideoAmtBins    0
PhotoAmtBins    0
QuantityBins    0
BreedPure       0
ColorAmt        0
NameorNO        0
dtype: int64

In [56]:
# Encode the columns
x = pd.get_dummies(x, columns=x.columns, drop_first=True)
x.head()


Unnamed: 0,Type_2,Gender_2,Gender_3,Color1_2,Color1_3,Color1_4,Color1_5,Color1_6,Color1_7,Color2_2,...,PhotoAmtBins_>=6,QuantityBins_2,QuantityBins_3,QuantityBins_4,QuantityBins_5,QuantityBins_>=6,BreedPure_Y,ColorAmt_2,ColorAmt_3,NameorNO_Y
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,1
1,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,1,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,1
3,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [55]:
# Test which columns are significant

from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest

# Select the best 10 features
selector = SelectKBest(chi2, k='all')
selector.fit(categorical_encoded, y)

# Get the indices of the best features
mask = selector.get_support(indices=True)

# Get the names of the best features
feature_names = categorical_encoded.columns[mask]

# Get the scores of each feature
scores = selector.scores_

# Create a DataFrame with feature names and their corresponding scores
feature_scores = pd.DataFrame({'Feature': categorical_encoded.columns, 'Score': scores})

# Print the DataFrame
print(feature_scores)




             Feature       Score
0             Type_2   87.957838
1           Gender_2   22.079585
2           Gender_3   11.000357
3           Color1_2   22.896721
4           Color1_3   10.275636
5           Color1_4    7.029712
6           Color1_5   18.722371
7           Color1_6   11.748164
8           Color1_7   16.416191
9           Color2_2   15.914620
10          Color2_3    3.001906
11          Color2_4    8.352691
12          Color2_5    7.270784
13          Color2_6   20.001563
14          Color2_7    9.617964
15          Color3_3    3.629455
16          Color3_4   10.986530
17          Color3_5    6.244860
18          Color3_6    4.121616
19          Color3_7    4.153076
20    MaturitySize_2   43.311368
21    MaturitySize_3   15.740021
22    MaturitySize_4    7.169928
23       FurLength_2   11.018202
24       FurLength_3  131.379623
25      Vaccinated_2  138.583013
26      Vaccinated_3   32.725034
27        Dewormed_2   71.036836
28        Dewormed_3   45.038349
29      St