In [8]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import warnings
warnings.simplefilter('ignore')

In [41]:
df = pd.read_csv('train.csv')
df[df['Gender'].isnull()].sample(5)

Unnamed: 0.1,Unnamed: 0,Type,Name,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,...,Description,PetID,PhotoAmt,ColorName_x,ColorName_y,ColorName,BreedName_x,BreedName_y,StateName_x,AdoptionSpeed
362,27,1,,2,,2.0,1,2,1,2,...,Very Cute.....Very Healthy........taught and k...,22a0baf52,3.0,Brown,Yellow,,Mixed Breed,Mixed Breed,Selangor,2
494,826,1,Tank & Zorro (Urgent Adoption!),120,,3.0,2,1,1,1,...,Our two adult dogs are available for adoption ...,ef7bdaf3a,3.0,Black,Brown,,Labrador Retriever,Harrier,Selangor,0
919,918,2,Kitties,1,,2.0,1,2,2,2,...,The kittens and their mother are quite healthy...,ebff4722c,6.0,Black,Gray,,Persian,Domestic Medium Hair,Kuala Lumpur,1
130,878,2,3 Kitties,3,,1.0,1,2,2,2,...,There are three kittens I'm putting up for ado...,8867ee30a,6.0,Brown,Gray,,Domestic Short Hair,Domestic Short Hair,Kuala Lumpur,1
65,942,1,Mitchie,3,,2.0,1,3,1,2,...,Mitchie is a cute little intelligent 3 month o...,979bab13e,3.0,Cream,Gray,,Husky,Poodle,Kuala Lumpur,0


In [42]:
df.describe()

Unnamed: 0.1,Unnamed: 0,Type,Age,Gender,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,AdoptionSpeed
count,1220.0,1220.0,1220.0,1160.0,1096.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0,1220.0
mean,679.3,1.562295,8.767213,2.101724,1.864051,1.585246,1.770492,1.556557,1.943443,1.02377,2.279508,19.658197,41345.639344,0.083607,4.740164,1.022951
std,390.022296,0.496308,16.287005,0.748883,0.532714,0.631504,0.61944,0.661305,0.553235,0.162806,2.089013,59.898386,32.266188,0.502228,4.068187,0.725351
min,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0,0.0
25%,342.75,1.0,2.0,2.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,0.0,2.0,0.0
50%,680.5,2.0,3.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,0.0,4.0,1.0
75%,1015.25,2.0,8.0,3.0,2.0,2.0,2.0,2.0,2.0,1.0,3.0,0.0,41401.0,0.0,5.0,2.0
max,1355.0,2.0,212.0,3.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,750.0,41401.0,8.0,30.0,2.0


In [43]:
df.columns

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Gender', 'MaturitySize',
       'FurLength', 'Vaccinated', 'Dewormed', 'Sterilized', 'Health',
       'Quantity', 'Fee', 'State', 'RescuerID', 'VideoAmt', 'Description',
       'PetID', 'PhotoAmt', 'ColorName_x', 'ColorName_y', 'ColorName',
       'BreedName_x', 'BreedName_y', 'StateName_x', 'AdoptionSpeed'],
      dtype='object')

## Nan values ##

In [44]:
def count_nan(data: pd.DataFrame):
    missing = list()
    for x in data.columns:
        if data[x].isnull().sum() != 0:
            print(x, data[x].isnull().sum())
            missing.append(x)

count_nan(df)

Name 101
Gender 60
MaturitySize 124
ColorName 187
BreedName_x 131
BreedName_y 121


- Drop unnecessary fields (Dewormed has high correlation level with Vaccinated)

In [45]:
ds = df.drop(['Unnamed: 0', 'Name', 'RescuerID', 'PetID', 'Dewormed', 'Description'], axis=1)

- Drop rows without gender, because we can't make up a gender and it's less than 5% of all rows

In [46]:
ds.dropna(how='any', subset=['Gender'], inplace=True)
ds[['Gender']].isnull().sum()
ds['Gender'] = ds['Gender'].astype(float)

- Nan values of breed fill with 'Unknown'

In [47]:
ds[['BreedName_x', 'BreedName_y']] = ds[['BreedName_x', 'BreedName_y']].fillna('Unknown')
ds[['BreedName_x', 'BreedName_y']].isnull().sum()

BreedName_x    0
BreedName_y    0
dtype: int64

- Mode fill of MaturitySize and ColorName

In [48]:
def mode_fill(data, columns):
    for col in columns:
        data[col] = data[col].fillna(data[col].mode()[0])
    return data

mode_fill(ds, ['MaturitySize', 'ColorName'])
ds[['MaturitySize', 'ColorName']].isnull().sum()

MaturitySize    0
ColorName       0
dtype: int64

- Check the empty values

In [49]:
ds.isnull().sum()

Type             0
Age              0
Gender           0
MaturitySize     0
FurLength        0
Vaccinated       0
Sterilized       0
Health           0
Quantity         0
Fee              0
State            0
VideoAmt         0
PhotoAmt         0
ColorName_x      0
ColorName_y      0
ColorName        0
BreedName_x      0
BreedName_y      0
StateName_x      0
AdoptionSpeed    0
dtype: int64

## Categorical data encoding ##
**One Hot Encoding**

In [50]:
encoder = OneHotEncoder(categories='auto',
                       drop='first',
                       sparse_output=False)
encoder.fit(ds[['ColorName_x', 'ColorName_y', 'ColorName', 'StateName_x']])

In [51]:
encoder.categories_

[array(['Black', 'Brown', 'Cream', 'Golden', 'Yellow'], dtype=object),
 array(['Brown', 'Cream', 'Golden', 'Gray', 'Yellow'], dtype=object),
 array(['Cream', 'Golden', 'Gray', 'White', 'Yellow'], dtype=object),
 array(['Johor', 'Kedah', 'Kelantan', 'Kuala Lumpur', 'Melaka',
        'Negeri Sembilan', 'Pahang', 'Perak', 'Pulau Pinang', 'Selangor',
        'Terengganu'], dtype=object)]

In [52]:
encoded_data = encoder.transform(ds[['ColorName_x', 'ColorName_y', 'ColorName', 'StateName_x']])

encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(['ColorName_x', 'ColorName_y', 'ColorName', 'StateName_x']))

ds = ds.drop(columns=['ColorName_x', 'ColorName_y', 'ColorName', 'StateName_x'])
ds = pd.concat([ds.reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)

**Target Encoding**

In [53]:
breed_x_target_mean = ds.groupby('BreedName_x')['AdoptionSpeed'].mean()
breed_y_target_mean = ds.groupby('BreedName_y')['AdoptionSpeed'].mean()
ds['BreedName_x'] = ds['BreedName_x'].map(breed_x_target_mean)
ds['BreedName_y'] = ds['BreedName_y'].map(breed_y_target_mean)

In [54]:
ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1160 entries, 0 to 1159
Data columns (total 38 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Type                         1160 non-null   int64  
 1   Age                          1160 non-null   int64  
 2   Gender                       1160 non-null   float64
 3   MaturitySize                 1160 non-null   float64
 4   FurLength                    1160 non-null   int64  
 5   Vaccinated                   1160 non-null   int64  
 6   Sterilized                   1160 non-null   int64  
 7   Health                       1160 non-null   int64  
 8   Quantity                     1160 non-null   int64  
 9   Fee                          1160 non-null   int64  
 10  State                        1160 non-null   int64  
 11  VideoAmt                     1160 non-null   int64  
 12  PhotoAmt                     1160 non-null   float64
 13  BreedName_x       

## Scaling ##

In [55]:
y = ds['AdoptionSpeed']
X = ds.drop('AdoptionSpeed', axis = 1)

In [56]:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))

In [57]:
X['AdoptionSpeed'] = y
X.to_csv('pets_data.csv', index=False)

In [58]:
X.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28,29,30,31,32,33,34,35,36,AdoptionSpeed
0,-1.126969,-0.475739,1.200005,0.249867,-0.931648,0.359706,0.09549,-0.146889,0.335676,-0.320023,...,-0.029374,-0.57735,-0.077917,-0.125546,-0.058824,-0.157274,-0.216637,0.807726,-0.058824,1
1,-1.126969,-0.349542,1.200005,0.249867,-0.931648,1.983281,1.911365,-0.146889,-0.139183,-0.320023,...,-0.029374,-0.57735,-0.077917,-0.125546,-0.058824,-0.157274,-0.216637,0.807726,-0.058824,1
2,0.887336,-0.538837,1.200005,0.249867,-0.931648,0.359706,0.09549,-0.146889,0.810534,-0.320023,...,-0.029374,-0.57735,-0.077917,-0.125546,-0.058824,-0.157274,-0.216637,0.807726,-0.058824,1
3,-1.126969,-0.475739,-1.471792,-1.721875,0.64834,0.359706,1.911365,-0.146889,-0.614041,-0.320023,...,-0.029374,-0.57735,-0.077917,-0.125546,-0.058824,-0.157274,-0.216637,0.807726,-0.058824,1
4,0.887336,-0.349542,-0.135893,0.249867,2.228329,0.359706,0.09549,-0.146889,-0.614041,-0.320023,...,-0.029374,-0.57735,-0.077917,-0.125546,-0.058824,-0.157274,-0.216637,0.807726,-0.058824,1
