In [9]:
import pandas as pd
import numpy as np

In [10]:
# import train.csv
url_train = "https://raw.githubusercontent.com/naufaldi-fir/adoption-prediction-project/main/Data%20Set/train.csv"
train = pd.read_csv(url_train)

In [11]:
print(train.head())

   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4     1       Hunter    1     307       0       1       1       0       0   

   MaturitySize  ...  Health  Quantity  Fee  State  \
0             1  ...       1         1  100  41326   
1             2  ...       1         1    0  41401   
2             2  ...       1         1    0  41326   
3             2  ...       1         1  150  41401   
4             2  ...       1         1    0  41326   

                          RescuerID  VideoAmt  \
0  8480853f516546f6cf33aa88cd76c379         0   
1  3082c7125d8fb66f7dd4bff4192c8b14         0   
2  fa90fa5b1ee11c86938398b60abc32cb         0   
3  9238e4f44c71a

### One hot encoding for color in train.csv

In [12]:
# drop all columns except color
color = train.loc[:,["Color1","Color2","Color3"]]
color.head()

Unnamed: 0,Color1,Color2,Color3
0,1,7,0
1,1,2,0
2,2,7,0
3,1,2,0
4,1,0,0


In [13]:
# because it doesn't make a lot of sense to sort the color in Color 1,2 and 3. The table will be 
# transformed to binary data using one hot encoding
dum_color1 = pd.get_dummies(color.Color1,prefix = "ColorID")
dum_color2 = pd.get_dummies(color.Color2,prefix = "ColorID",drop_first = True)
dum_color3 = pd.get_dummies(color.Color3,prefix = "ColorID",drop_first = True)

In [14]:
all_color = [dum_color1,dum_color2,dum_color3]

In [15]:
# define print color function
def print_color(list):
    i = 1
    for x in list:
        print(f"Dummy Color {i}")
        print(x.head())
        print(f"Shape of the table {x.shape}")
        i = i + 1

In [16]:
# print all color
# print all one hot encoding from get_dummies function => 
# The dummies have different shape and attribute for each table
print_color(all_color)

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          1          0          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          1          0          0          0          0          0
3          1          0          0          0          0          0          0
4          1          0          0          0          0          0          0
Shape of the table (14993, 7)
Dummy Color 2
   ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          0          0          0          0          1
1          1          0          0          0          0          0
2          0          0          0          0          0          1
3          1          0          0          0          0          0
4          0          0          0          0          0          0
Shape of the table (14993, 6)
Dummy Color 3
   ColorID_3  Co

The difference between dummy color 1 and dummy color 2 are in the existence of attribute ColorID_0 and ColorID_1

In [17]:
# changing the attribute and shape of the table so we can sum all the tables
dum_color2["ColorID_1"] = 0
dum_color3["ColorID_2"] = 0
dum_color3["ColorID_1"] = 0
print_color(all_color)

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          1          0          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          1          0          0          0          0          0
3          1          0          0          0          0          0          0
4          1          0          0          0          0          0          0
Shape of the table (14993, 7)
Dummy Color 2
   ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7  ColorID_1
0          0          0          0          0          0          1          0
1          1          0          0          0          0          0          0
2          0          0          0          0          0          1          0
3          1          0          0          0          0          0          0
4          0          0          0          0          0          0      

In [18]:
def change_color(list):
    arrange_color = []
    for i in range(1,8):
        column_name = f"ColorID_{i}"
        arrange_color.append(column_name)
    for index,value in enumerate(list):
        list[index] = value[arrange_color]

In [19]:
change_color(all_color)
print_color(all_color)

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          1          0          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          1          0          0          0          0          0
3          1          0          0          0          0          0          0
4          1          0          0          0          0          0          0
Shape of the table (14993, 7)
Dummy Color 2
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          0          0          0          0          0          1
1          0          1          0          0          0          0          0
2          0          0          0          0          0          0          1
3          0          1          0          0          0          0          0
4          0          0          0          0          0          0      

all dummies have now same attributes and shape

In [20]:
# adding all the data
one_hot_color = dum_color1 + dum_color2 + dum_color3
one_hot_color.head()

Unnamed: 0,ColorID_1,ColorID_2,ColorID_3,ColorID_4,ColorID_5,ColorID_6,ColorID_7
0,1,0,0,0,0,0,1
1,1,1,0,0,0,0,0
2,0,1,0,0,0,0,1
3,1,1,0,0,0,0,0
4,1,0,0,0,0,0,0


In [21]:
# check one hot encoding with train data
pd.concat([color, one_hot_color],axis = 1).iloc[30:40]

Unnamed: 0,Color1,Color2,Color3,ColorID_1,ColorID_2,ColorID_3,ColorID_4,ColorID_5,ColorID_6,ColorID_7
30,2,7,0,0,1,0,0,0,0,1
31,1,0,0,1,0,0,0,0,0,0
32,5,7,0,0,0,0,0,1,0,1
33,1,6,7,1,0,0,0,0,1,1
34,1,7,0,1,0,0,0,0,0,1
35,6,0,0,0,0,0,0,0,1,0
36,1,4,0,1,0,0,1,0,0,0
37,2,0,0,0,1,0,0,0,0,0
38,1,0,0,1,0,0,0,0,0,0
39,5,0,0,0,0,0,0,1,0,0


 one hot encoding success

In [22]:
df_1= pd.DataFrame({"a": [1],"b":[5]})
df_2 = pd.DataFrame({"b": [100],"a":[200]})
df = df_1 + df_2
print(df)

     a    b
0  201  105


Proof that pandas can add value of column automatically through columns name without arranging order of columns

# Using Classification model KNN on color

## A. Preparing data

### Data Train
Divide the train data in train and test

In [23]:
# X is predictor data and y is target variable
X = one_hot_color
y = train["AdoptionSpeed"]
y.head()

0    2
1    0
2    3
3    2
4    2
Name: AdoptionSpeed, dtype: int64

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, stratify =y)

### Data Test
import test data, do one hot encoding one more time for test data

In [25]:
# import test.csv
url_test = "https://raw.githubusercontent.com/naufaldi-fir/adoption-prediction-project/main/Data%20Set/test.csv"
new_test = pd.read_csv(url_test)
print(new_test.head())
#print(new_test.info())

   Type                       Name  Age  Breed1  Breed2  Gender  Color1  \
0     2               Dopey & Grey    8     266     266       1       2   
1     2                    Chi Chi   36     285     264       2       1   
2     2                     Sticky    2     265       0       1       6   
3     1  Dannie & Kass [In Penang]   12     307       0       2       2   
4     2                    Cuddles   12     265       0       1       2   

   Color2  Color3  MaturitySize  ...  Sterilized  Health  Quantity  Fee  \
0       6       7             1  ...           2       1         2    0   
1       4       7             2  ...           1       2         1    0   
2       7       0             2  ...           2       1         1  200   
3       5       0             2  ...           1       1         2    0   
4       3       7             2  ...           1       1         1    0   

   State                         RescuerID  VideoAmt  \
0  41326  2ece3b2573dcdcebd774e635dca15fd9

In [26]:
color_test = new_test.loc[:,["Color1","Color2","Color3"]]
color_test.head()

Unnamed: 0,Color1,Color2,Color3
0,2,6,7
1,1,4,7
2,6,7,0
3,2,5,0
4,2,3,7


In [27]:
# because it doesn't make a lot of sense to sort the color in Color 1,2 and 3. The table will be 
# transformed to binary data using one hot encoding
dum_color1 = pd.get_dummies(color_test.Color1,prefix = "ColorID")
dum_color2 = pd.get_dummies(color_test.Color2,prefix = "ColorID",drop_first = True) #first column is ColorID_0
dum_color3 = pd.get_dummies(color_test.Color3,prefix = "ColorID",drop_first = True) #first column is ColorID_0
all_color = [dum_color1,dum_color2,dum_color3]
print_color(all_color)

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          1          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          0          0          0          0          1          0
3          0          1          0          0          0          0          0
4          0          1          0          0          0          0          0
Shape of the table (3972, 7)
Dummy Color 2
   ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          0          0          0          1          0
1          0          0          1          0          0          0
2          0          0          0          0          0          1
3          0          0          0          1          0          0
4          0          1          0          0          0          0
Shape of the table (3972, 6)
Dummy Color 3
   ColorID_3  Colo

In [28]:
# changing the attribute and shape of the table so we can sum all the tables
dum_color2["ColorID_1"] = 0 # Add ColorID_1 columns to table 2 with value 0
dum_color3["ColorID_2"] = 0
dum_color3["ColorID_1"] = 0
change_color(all_color)
print_color(all_color)

Dummy Color 1
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          1          0          0          0          0          0
1          1          0          0          0          0          0          0
2          0          0          0          0          0          1          0
3          0          1          0          0          0          0          0
4          0          1          0          0          0          0          0
Shape of the table (3972, 7)
Dummy Color 2
   ColorID_1  ColorID_2  ColorID_3  ColorID_4  ColorID_5  ColorID_6  ColorID_7
0          0          0          0          0          0          1          0
1          0          0          0          1          0          0          0
2          0          0          0          0          0          0          1
3          0          0          0          0          1          0          0
4          0          0          1          0          0          0       

In [29]:
one_hot_color = dum_color1 + dum_color2 + dum_color3
one_hot_color.head()

Unnamed: 0,ColorID_1,ColorID_2,ColorID_3,ColorID_4,ColorID_5,ColorID_6,ColorID_7
0,0,1,0,0,0,1,1
1,1,0,0,1,0,0,1
2,0,0,0,0,0,1,1
3,0,1,0,0,1,0,0
4,0,1,1,0,0,0,1


In [30]:
#for predicting new test data
X_new = one_hot_color

## B. KNN and SVC Implementation
### 1. KNN

In [31]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

In [32]:
param_grid = {"n_neighbors":np.arange(1,50)}
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(knn, param_grid, cv = 5)
knn_cv.fit(X_train, y_train)
print(f"best parameter:{knn_cv.best_params_}")
print(f"best score:{knn_cv.best_score_}")      

best parameter:{'n_neighbors': 47}
best score:0.2746069556931872


The score is the mean of cv-score for n_neighbors = 47

### 2. SVC 

In [38]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
svc = SVC()
cv_results = cross_val_score(svc,X,y,cv = 5)
print("Average 5-Fold CV Score: {}".format(np.mean(cv_results)))

Average 5-Fold CV Score: 0.2838666035220546


## Conclusion
SVC is better to predict data than knn, but both model still are'nt good model to predict adoption speed because the score are too low.