In [33]:
import pandas as pd
import numpy as np

In [34]:
# Raw GitHub locations of train.csv and test.csv
url_train = "https://raw.githubusercontent.com/naufaldi-fir/adoption-prediction-project/main/Data%20Set/train.csv"
url_test = "https://raw.githubusercontent.com/naufaldi-fir/adoption-prediction-project/main/Data%20Set/test.csv"

# Read both CSV files (train first, then test) straight into DataFrames
train, test = (pd.read_csv(url) for url in (url_train, url_test))

In [35]:
# Preview the first rows of the training data; leaving the frame as the last
# expression lets the notebook render it as a table instead of wrapped plain text.
train.head()

   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4     1       Hunter    1     307       0       1       1       0       0   

   MaturitySize  ...  Health  Quantity  Fee  State  \
0             1  ...       1         1  100  41326   
1             2  ...       1         1    0  41401   
2             2  ...       1         1    0  41326   
3             2  ...       1         1  150  41401   
4             2  ...       1         1    0  41326   

                          RescuerID  VideoAmt  \
0  8480853f516546f6cf33aa88cd76c379         0   
1  3082c7125d8fb66f7dd4bff4192c8b14         0   
2  fa90fa5b1ee11c86938398b60abc32cb         0   
3  9238e4f44c71a

### One hot encoding for color

In [36]:
# Keep only the three color columns of the training data
color = train[["Color1", "Color2", "Color3"]]
color.head()

Unnamed: 0,Color1,Color2,Color3
0,1,7,0
1,1,2,0
2,2,7,0
3,1,2,0
4,1,0,0


In [37]:
# The numeric order of the color IDs stored in Color1, Color2 and Color3 carries
# no meaning, so each column is one-hot encoded into binary ColorID_<id>
# indicator columns instead of being used as a number.
# dtype=np.uint8 keeps the indicators numeric (0/1) on every pandas version:
# pandas >= 2.0 returns booleans by default, and adding boolean tables with `+`
# (as done further down) would no longer produce integer 0/1 values.
dum_color1, dum_color2, dum_color3 = (
    pd.get_dummies(color[name], prefix="ColorID", dtype=np.uint8)
    for name in ("Color1", "Color2", "Color3")
)

In [38]:
# The three dummy tables do not share the same columns: get_dummies only creates
# a ColorID_* column for the color IDs that actually occur in the encoded column.
all_color = [dum_color1, dum_color2, dum_color3]
for slot, dummies in enumerate(all_color, start=1):
    print("Dummy Color {}".format(slot))
    print(dummies.head())

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          1          0          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          1          0          0          0          0          0
3          1          0          0          0          0          0          0
4          1          0          0          0          0          0          0
Dummy Color 2
   ColorID_0  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          0          0          0          0          0          1
1          0          1          0          0          0          0          0
2          0          0          0          0          0          0          1
3          0          1          0          0          0          0          0
4          1          0          0          0          0          0          0
Dummy Color 3
   ColorID

In [39]:
# Give the Color2 / Color3 dummies the same ColorID_1..ColorID_7 columns as the
# Color1 dummies so that the three tables can be added element-wise.
# The frames are modified in place on purpose: `all_color` keeps referring to
# these same objects.
expected_columns = ["ColorID_{}".format(color_id) for color_id in range(1, 8)]
for dummies in (dum_color2, dum_color3):
    # Add an all-zero column only for color IDs that never occur in this slot;
    # an existing column is left untouched so real data is never overwritten.
    for column in expected_columns:
        if column not in dummies.columns:
            dummies[column] = 0
    # ColorID_0 (presumably the "no color in this slot" placeholder) has no
    # counterpart in the Color1 dummies. errors="ignore" keeps the cell
    # re-runnable once the column has already been removed.
    dummies.drop(columns=["ColorID_0"], inplace=True, errors="ignore")

In [40]:
# Print the column labels of every dummy table: after the alignment step all
# three must contain the same ColorID_* columns (their order may differ).
for slot, dummies in enumerate(all_color, start=1):
    print("Dummy Color {}".format(slot))
    print(dummies.columns)
    # A column missing from one table would silently become NaN when the
    # tables are added together, so fail loudly here instead.
    assert set(dummies.columns) == set(dum_color1.columns), "dummy tables are not aligned"

Dummy Color 1
Index(['ColorID_1', 'ColorID_2', 'ColorID_3', 'ColorID_4', 'ColorID_5',
       'ColorID_6', 'ColorID_7'],
      dtype='object')
Dummy Color 2
Index(['ColorID_2', 'ColorID_3', 'ColorID_4', 'ColorID_5', 'ColorID_6',
       'ColorID_7', 'ColorID_1'],
      dtype='object')
Dummy Color 3
Index(['ColorID_3', 'ColorID_4', 'ColorID_5', 'ColorID_6', 'ColorID_7',
       'ColorID_1', 'ColorID_2'],
      dtype='object')


In [14]:
# Element-wise total of the three aligned dummy tables: one column per color ID,
# non-zero when that color appears in any of Color1 / Color2 / Color3.
dum_color = dum_color1.add(dum_color2).add(dum_color3)
dum_color

Unnamed: 0,col_1,col_2,col_3,col_4,col_5,col_6,col_7
0,1,0,0,0,0,0,1
1,1,1,0,0,0,0,0
2,0,1,0,0,0,0,1
3,1,1,0,0,0,0,0
4,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...
14988,1,0,0,0,0,0,0
14989,1,0,0,1,0,0,1
14990,0,0,0,0,1,1,1
14991,0,0,0,1,0,0,1


In [16]:
# Visual sanity check: put the original color columns next to the combined
# one-hot columns and inspect rows 30-39.
color_with_dummies = pd.concat([color, dum_color], axis="columns")
color_with_dummies.iloc[30:40]

Unnamed: 0,Color1,Color2,Color3,col_1,col_2,col_3,col_4,col_5,col_6,col_7
30,2,7,0,0,1,0,0,0,0,1
31,1,0,0,1,0,0,0,0,0,0
32,5,7,0,0,0,0,0,1,0,1
33,1,6,7,1,0,0,0,0,1,1
34,1,7,0,1,0,0,0,0,0,1
35,6,0,0,0,0,0,0,0,1,0
36,1,4,0,1,0,0,1,0,0,0
37,2,0,0,0,1,0,0,0,0,0
38,1,0,0,1,0,0,0,0,0,0
39,5,0,0,0,0,0,0,1,0,0


### Preparing data for KNN