# Data pre-processing and Feature Engineering 
In this file, we use the training and test data (split in the previous file into train/test) as the input. 
<br> After performing the EDA, we review the current state of each data column/feature to perform any necessary data pre-processing or feature transformation/engineering. 

### Imports & Read files

In [1]:
# Remove warning prints
import warnings
warnings.filterwarnings("ignore")

In [2]:
# standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
from mlxtend.plotting import heatmap
from wordcloud import WordCloud
import openpyxl
from fuzzywuzzy import fuzz
from fuzzywuzzy import process as fuzz_process
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import category_encoders as ce


# tf and keras
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout
# import tensorflow_datasets as tfds

# shap
import shap

# plots and images
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import Image

# others
import re
from sklearn.decomposition import PCA
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import words
from collections import Counter

In [3]:
# Universal to entire EDA 
file_path = './data/' #might need to update if you don't have your data here
train_file = 'split_train.csv' #created at end of file 1 (parse and merge)
test_file = 'split_test.csv' #created at end of file 1 (parse and merge)

# Import train and test
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
columns = train.columns

# Additional files 
breed_lbl = 'PetFinder-BreedLabels.csv'
color_lbl = 'PetFinder-ColorLabels.csv'
state_lbl = 'PetFinder-StateLabels.csv'

#labels for numbered categorical features
breeds = pd.read_csv(file_path+breed_lbl)
colors = pd.read_csv(file_path+color_lbl)
states = pd.read_csv(file_path+state_lbl)

## Alberto's Feature Engineering / Data pre-processing

<b>Text Based Features in Dataset</b>
 - Name - Name of pet (Empty if not named)
 - RescuerID - ID of Rescuer
 
<b> Categorical Features in Dataset </b>

 - Health - Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified)
 - Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
 - Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
 - Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)
 - State - State location in Malaysia (Refer to StateLabels dictionary)
 
 <b> Interval Features in Dataset</b>
 - Fee - Adoption fee (0 = Free)
 - Quantity - Number of pets represented in profile
 - Age - Age of pet when listed, in months

## Feature: Name

<b> Name: Text-based feature. Name of pet (Empty if not named) </b>
 - Circa 7,500 unique names. Top names have very low frequency 
 - Inputting/missing values: Fill NAs with No_Name values
 - Add binary variable: valid vs. invalid_names. Invalid_names includes: NAs, generic inputs (such as Puppies, pup, kitten, cat) and non-descriptive inputs (e.g. "No Name Yet" and two letter names such as V1)
 - Accuracy of the above-described name categorization: After sampling 200 rows: 5 false positives, 1 false negative, 194 true positives/negatives

In [4]:
# Identify and process invalid names 

# Fill NAs in train and test
train['Name'].fillna('No_Name', inplace=True)
test['Name'].fillna('No_Name', inplace=True)

# Function to create invalid_name binary column
def invalid_name(df, column_name):
    
    # Check if column_name exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the DataFrame.")

    # Prepoc: Create a new column with default value 0 and fill NAs
    df['Invalid_name'] = 0
    
    # Set the value to 1 if short length
    df.loc[df[column_name].str.len() <= 2, 'Invalid_name'] = 1
    
    # Set the value to 1 if contains generic words
    generic_words = ['Kitten', 'Kittens', 'Kitty', 'Adoption', 'Baby', 'Name', 'Kitties', 
                     'Stray', 'Lost Dog', 'Mixed', 'Male', 'Female', 'MPSP', 
                     'Puppy', 'Puppies', 'Pups', 'Pup' ]
    df.loc[df[column_name].str.contains('|'.join(generic_words), case=False), 'Invalid_name'] = 1
    
    return df

# Apply function to train and test
invalid_name(train, 'Name')
invalid_name(test, 'Name')

# Percentage of invalid names in train
print('Percentage of invalid_names in train')
train['Invalid_name'].value_counts(normalize=True)

Percentage of invalid_names in train


Invalid_name
0    0.770969
1    0.229031
Name: proportion, dtype: float64

## Feature: RescuerID

<b> RescuerID: Text/Alphanumeric feature. ID of Rescuer</b>
 - Create new feature ItTopRescuer if RescuerID in Top 25 with most ads. Top rescuers have lower AdoptionSpeed
 - Create new feature RescuerCount to assign to each rescuer the corresponding % of total ads. 

In [5]:
# Create new feature IsTopRescuer
top_rescuers = train.loc[train['Type'] == 1]['RescuerID'].value_counts().nlargest(25).index
top_rescuers.append(train.loc[train['Type'] == 2]['RescuerID'].value_counts().nlargest(25).index)
train['IsTopRescuer'] = train['RescuerID'].isin(top_rescuers).astype(int)
test['IsTopRescuer'] = test['RescuerID'].isin(top_rescuers).astype(int)

# Create new feature RescuerCount
train['RescuerCount'] = train['RescuerID'].map(train['RescuerID'].value_counts(normalize=True))
test['RescuerCount'] = test['RescuerID'].map(test['RescuerID'].value_counts(normalize=True))

# Drop RescuerID column
train.drop(columns = ['RescuerID'], axis = 1, inplace=True)
test.drop(columns = ['RescuerID'], axis = 1, inplace=True)

In [6]:
# AdoptionSpeed for top rescuers for both cats and dogs 
breeds = ['Dogs', 'Cats']
for item in [1,2]:
    print(f'-- AdoptioSpeed for TopRescuer for {breeds[item-1]} --')
    display(train.loc[train['Type'] == item].groupby('IsTopRescuer')['AdoptionSpeed'].mean())

-- AdoptioSpeed for TopRescuer for Dogs --


IsTopRescuer
0    2.661525
1    2.518557
Name: AdoptionSpeed, dtype: float64

-- AdoptioSpeed for TopRescuer for Cats --


IsTopRescuer
0    2.407204
1    2.272727
Name: AdoptionSpeed, dtype: float64

## Feature: Health

<b> Health: Ordinal feature. Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified) </b>
 - This is an ordinal feature because there is a rank to it ie 1 is more healthy than 2
 - Change numerical encoding to range [0,2] instead of [1,3]

In [7]:
# Relabel numeric values to [0,2] instead of [1,3]
def relabel_col(df, target, to_range, from_range):
    
    # Check if column_name exists in the DataFrame
    if target not in df.columns:
        raise ValueError(f"Column '{target}' does not exist in the DataFrame.")
        
    # Create dict and apply function
    dict_lab = {i: j for i, j in zip(to_range, from_range)}
    df[target] = df[target].map(dict_lab)
    return df

# Apply relabing function to train and test
relabel_col(train,'Health',range(1,4), range(0,3))
relabel_col(test,'Health',range(1,4), range(0,3))

# Value counts on train
train['Health'].value_counts()

Health
0    11578
1      388
2       28
Name: count, dtype: int64

## Feature: Vaccinated, Dewormed

<b> Vaccianted: Nominal feature. Pet has been Vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
<br> Dewormed: Nominal feature. Pet has been Dewormed (1 = Yes, 2 = No, 3 = Not Sure)
</b>
 - These are nominal features with low cardinality so we OHE both 

In [8]:
# Define function to OHE selected columns
def OHE_vars(train, test, target):
    '''
    Takes in the train and test datasets, and the target feature to encode.
    Fits to the train and then transforms both and returns the entire dataset
    with the target feature transformed.
    '''
    
    # Check if column_name exists in the DataFrame
    if target not in train.columns:
        raise ValueError(f"Column '{target}' does not exist in the DataFrame.")
    
    #creating encoder
    temp_encoder = ce.OneHotEncoder(cols=[target])
    # fitting the encoder to train
    temp_encoder.fit(train)
    # transforming both datasets with encoder
    train = temp_encoder.transform(train)
    test = temp_encoder.transform(test)
    
    return train, test

# Apply function
train, test = OHE_vars(train, test, 'Vaccinated')
train, test = OHE_vars(train, test, 'Dewormed')

## Feature: Sterelized

<b> Sterelized: Nominal feature. Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)</b>
 - This is a nominal feature with low cardinality so we also OHE 

In [9]:
# Run OHE on Sterilized for both train and test df
train, test = OHE_vars(train, test, 'Sterilized')

 ## Feature: State

<b> State - State location in Malaysia (Refer to StateLabels dictionary) </b>
 - Since there are 14 states I decide to do BinaryEncoding so we only need 4 additional columns

In [10]:
# Binary Encoding function
def binary_encoding(train, test, target):
    '''
    Takes in the train and test datasets, and the target feature to encode. 
    Fits to the train and then transforms both and returns the entire dataset 
    with the target feature transformed.
    '''
    # Check if column_name exists in the DataFrame
    if target not in train.columns:
        raise ValueError(f"Column '{target}' does not exist in the DataFrame.")
    
    # creating the encoder
    temp_encoder = ce.BinaryEncoder(cols=[target])
    #fitting the encoder to train
    temp_encoder.fit(train)
    #transforming both datasets with encoder
    train = temp_encoder.transform(train)
    test = temp_encoder.transform(test)
    #returning both datasets
    return train, test

# Apply function
train, test = binary_encoding(train, test, 'StateID')

## Feature: Fee

<b> Fee: Adoption fee (0 = Free)</b>
 - Fee variable clipped for outliers
 - Create Fee_binary variable which equals 1 when the pet is not offered for free
 - Create bins for Fee and OHE
 - Fee variable needs to be normalized but this will be done at the top of the modeling notebook with all the other features 

In [11]:
# Clipping values between $0-400
train['Fee'] = train['Fee'].clip(0, 400)
test['Fee'] = test['Fee'].clip(0, 400)

# Create binary variable if pets is Free or not
train['Fee_binary'] = (train['Fee'] > 1).astype(int)
test['Fee_binary'] = (test['Fee'] > 1).astype(int)

# Create bins for Fee and OHE for train
bins_f = [0, 1, 51, 101, float('inf')]
labels_f = ['0', '1-50', '51-100', '+100']
train['Fee_bin'] = pd.cut(train['Fee'], bins=bins_f, labels=labels_f, right=False)
test['Fee_bin'] = pd.cut(test['Fee'], bins=bins_f, labels=labels_f, right=False)

# Apply OHE function
train, test = OHE_vars(train, test, 'Fee_bin')

## Feature: Quantity

<b> Quantity: Numerical feature. Number of pets represented in profile</b>
 - Transform to Quantity_binary. 0 is single pet, 1 if multiple pets (i.e. groups)
 - Create bins for Quantity and OHE
 - Meaningful overlap of Quantity_binary = 1 with Gender = 3 so we might decide to drop Quantity_binary if we want sparse models

In [12]:
# Create new binary feature Quantity_binary
train['Quantity_binary'] = (train['Quantity'] > 1).astype(int)
test['Quantity_binary'] = (test['Quantity'] > 1).astype(int)

# Create bins for Quantity and OHE for train/test
bins_q = [1, 2, 5, float('inf')]
labels_q = ['1', '2-4', '+5']
train['Quantity_bin'] = pd.cut(train['Quantity'], bins=bins_q, labels=labels_q, right=False)
test['Quantity_bin'] = pd.cut(test['Quantity'], bins=bins_q, labels=labels_q, right=False)

# Apply OHE function
train, test = OHE_vars(train, test, 'Quantity_bin')

In [13]:
# Overlap between Gender = 3 and Quantity_binary = 1
group_pets = train.loc[train['Quantity_binary'] == 1]

pd.crosstab(
    index=[group_pets['Quantity']],
    columns=group_pets['Gender'],
    values=group_pets['Quantity'],
    aggfunc=['count'])[:5]

Unnamed: 0_level_0,count,count,count
Gender,1,2,3
Quantity,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
2,231.0,375.0,526.0
3,67.0,138.0,381.0
4,28.0,52.0,334.0
5,19.0,30.0,228.0
6,1.0,27.0,116.0


## Feature: Age

<b> Age: Numerical feature. Age of pet when listed, in months</b>
 - No empty values
 - Clip data due to "Outliers": Most pets have < 60 months (both cats and dogs) 
 - Create new feature: 20% of pets have their Ages guessed (multiple of 12). These pets get adopted slower. 
 - Create age bins and OHE
 - Could create new feature Log_age which is the log(1+x) of age (need to add 1 because the dataset has zeros)

In [14]:
# Clipping values between 0 and 100 months
train['Age'] = train['Age'].clip(0, 100)
test['Age'] = test['Age'].clip(0, 100)

# New feature: Age_guessed if Age is multiple of 12 
train['Age_guessed'] = train['Age'].apply(lambda x: 1 if x in range(12,12*10, 12) else 0)
test['Age_guessed'] = test['Age'].apply(lambda x: 1 if x in range(12,12*10, 12) else 0)

# Create bins for age and OHE for train/test
bins_a = [0, 7, 13, 37, 61, float('inf')]
labels_a = ['0-6m', '6-12m', '12-36m', '37-60m','+60m']
train['Age_bin'] = pd.cut(train['Age'], bins=bins_a, labels=labels_a, right=False)
test['Age_bin'] = pd.cut(test['Age'], bins=bins_a, labels=labels_a, right=False)

# Apply OHE function
train, test = OHE_vars(train, test, 'Age_bin')

# Take the log of the Age to standarize
# train['Age'] = np.log1p(train['Age'])
# test['Age'] = np.log1p(test['Age'])

### -----begin Erik's EDA-----

# Erik's Feature Engineering / Data Cleaning

---
#### Table of Feature Modifications

**Below is a table showing the feature modifications and additions that are being made by the code that follows.** Please refer to EDA file #2 for additional back-up to reasoning.

| Feature | Modified Existing or New | Feature Type | Notes | 
|--|--|--|--|
| Type  | Mod. (E) | Binary (1 = Dog, 0 = Cat) | 
| has_Video | New | Binary (1 = has video, 0 = does not have video) |
| has_Photo | New | Binary (1 = has photo, 0 = does not have photo) |
| MaturitySize | Mod. (E) | Ordinal (1 = Small, 2 = Medium, 3 = Large, 4 = Extra Large, -1 = unspecified) | multiply by Maturity_isSpecified |
| ~~Maturity_isSpecified~~ | ~~New~~ | ~~Binary (1 = Maturity_isSpecified, 0 = Maturity_notSpecified)~~ | irrelevant to current dataset |
| FurLength | Mod. (E) | Ordinal(1 = Short, 2 = Medium, 3 = Long, -1 = unspecified) | multiply by Fur_isSpecified |
| ~~FurLength_isSpecified~~ | ~~New~~ | ~~Binary(1 = isSpecified, 0 = notSpecified)~~ | irrelevant to current dataset |
| isMale | New | Binary (1 = male present, 0 = no_males) | mixed gender has 1 in both |
| isFemale | New | Binary (1 = female present, 0 = no_females) | mixed gender has 1 in both |
| {Color} | New | Binary OHE (1 = color is present, 0 = color not present) | one feature for each color |
| ColorCount | New | Interval (1 - 3 = number of colors identified) |
| isGeneric_Breed | New | Binary (1 = generic description, 0 = more specific) | 0 if another breed present |
| {Breed1_i} | New | Binary_Encoder (i = 0 - 7 ) | using category encoder to fit_transform Breed1 feature |
| {Breed2_i} | New | Binary_Encoder (i = 0 - 6 ) | using category encoder to fit_transform Breed2 feature |

**Below is a table of raw features in the dataset that are not needed for modeling after addition of the above features.** These will be dropped from the datasets. 

| Feature | Feature Type | Notes | 
|--|--|--|
| Gender  |  Nominal (1 = Male, 2 = Female, 3 = Mixed) | replaced by 'isMale' 'isFemale'|
| Color1 | Nominal (1-7) | OHE and count now exists for this feature |
| Color2 | Nominal (1-7) | OHE and count now exists for this feature |
| Color3 | Nominal (1-7) | OHE and count now exists for this feature |

**Below is a table of the features that remain in the dataset untouched for potential use in modeling.**

| Feature | Feature Type | Notes | 
|--|--|--|
| VideoAmt | Interval | Total uploaded videos for this pet |
| PhotoAmt | Interval | Total uploaded photos for this pet |

In [15]:
#Type - transforming all 2's to 0's
train['Type'] = train['Type'].replace(2, 0)
test['Type'] = test['Type'].replace(2, 0)

#has_Video - transforming all non 0's to 1's
train['has_Video'] = (train['VideoAmt'] != 0).astype(int)
test['has_Video'] = (test['VideoAmt'] != 0).astype(int)

#has_Photo - transforming all non 0's to 1's
train['has_Photo'] = (train['PhotoAmt'] != 0).astype(int)
test['has_Photo'] = (test['PhotoAmt'] != 0).astype(int)

#MaturitySize - replacing all 0's with -1's
train['MaturitySize'] = train['MaturitySize'].replace(0, -1)
test['MaturitySize'] = test['MaturitySize'].replace(0, -1)

#### Feature was originally added out of principle to align with documentation 
#### indicated potential for maturity to not be specified. 
#### However Maturity is specified for all records, so this feature is of no use. 
# #Maturity_isSpecified
# train['Maturity_isSpecified'] = (train['MaturitySize'] != 0).astype(int)
# test['Maturity_isSpecified'] = (test['MaturitySize'] != 0).astype(int)

#FurLength - replacing all 0's with -1's
train['FurLength'] = train['FurLength'].replace(0, -1)
test['FurLength'] = test['FurLength'].replace(0, -1)

#### Feature was originally added out of principle to align with documentation 
#### indicated potential for FurLength to not be specified. 
#### However FurLength is specified for all records, so this feature is of no use. 
# #FurLength_isSpecified
# train['FurLength_isSpecified'] = (train['FurLength'] != 0).astype(int)
# test['FurLength_isSpecified'] = (test['FurLength'] != 0).astype(int)

#isMale - transform to binary
train['isMale'] = train['Gender'].apply(lambda x: 1 if x == 1 or x == 3 else 0)
test['isMale'] = test['Gender'].apply(lambda x: 1 if x == 1 or x == 3 else 0)

#isFemale - transform to binary
train['isFemale'] = train['Gender'].apply(lambda x: 1 if x == 2 or x == 3 else 0)
test['isFemale'] = test['Gender'].apply(lambda x: 1 if x == 2 or x == 3 else 0)

#{Color} - OHE for presence of each color
#iterating through each color in the color labels file
for color_num, color in zip(colors['ColorID'], colors['ColorName']):
    #creating a new feature named by color as OHE
    train[color] = train[['Color1', 'Color2', 'Color3']].apply(lambda row: 1 if color_num in row.values else 0, axis=1)
    test[color] = test[['Color1', 'Color2', 'Color3']].apply(lambda row: 1 if color_num in row.values else 0, axis=1)

#ColorCount
color_columns = colors['ColorName'].tolist()
train['ColorCount'] = train[color_columns].sum(axis=1)
test['ColorCount'] = test[color_columns].sum(axis=1)

#isGeneric_Breed
#identifying the 4 IDs for breeds that are generic 'mixed-breed, domestic-short/medium/long hair
generic_breeds = [307,264,265,266]
train['isGeneric_Breed'] = train[['Breed1', 'Breed2']].apply(
    lambda row: 1 if all(breed in generic_breeds or breed == 0 for breed in row) else 0,axis=1)
test['isGeneric_Breed'] = test[['Breed1', 'Breed2']].apply(
    lambda row: 1 if all(breed in generic_breeds or breed == 0 for breed in row) else 0,axis=1)

#Breed1
#utilizing 'binary_encoding' function to minimize number of encodings necessary
#since there are 307 possible IDs
train, test = binary_encoding(train, test, target='Breed1') # need to specify 'target' arg

#Breed2
train, test = binary_encoding(train, test, target ='Breed2') # need to specify 'target' arg

#dropping features Gender, Color1, Color2, Color3
columns_to_drop = ['Gender', 'Color1', 'Color2', 'Color3']
train.drop(columns_to_drop, axis=1, inplace=True)
test.drop(columns_to_drop, axis=1, inplace=True)

### -----begin Nicole's EDA-----

# 3. Nicole's Feature Engineering / Data Cleaning

In [16]:
#check that all rows that have texts have a sentiment score
null_score = train[train['doc_scores'].isnull()]
null_score['text'].isnull().all() #should return true 

True

In [17]:
null_score = test[test['doc_scores'].isnull()]
null_score['text'].isnull().all() #should return true 

True

In [18]:
#create a new column that has value 1 if there is a descriptoin for the pet, otherwise 0
train['has_descriptoin'] = (~train['doc_scores'].isnull()).astype(int)

#fill in missing scores as 0 
# in the model, please use the interaction term "has_description" * "docs_scores" and "has_description" * "doc_magnitudes"

train['doc_scores'] = train['doc_scores'].fillna(0)
train['doc_magnitudes'] = train['doc_magnitudes'].fillna(0)

#upon examination, results from sentiment analysis for non-english text are outliers (don't make sense), set them to 0
train.loc[train['languages'] != 'en', ['doc_scores', 'doc_magnitudes']] = 0

In [19]:
#create a new column that has value 1 if there is a descriptoin for the pet, otherwise 0
test['has_descriptoin'] = (~test['doc_scores'].isnull()).astype(int)

#fill in missing scores as 0 
# in the model, please use the interaction term "has_description" * "docs_scores" and "has_description" * "doc_magnitudes"

test['doc_scores'] = test['doc_scores'].fillna(0)
test['doc_magnitudes'] = test['doc_magnitudes'].fillna(0)

#upon examination, results from sentiment analysis for non-english text are outliers (don't make sense), set them to 0
test.loc[test['languages'] != 'en', ['doc_scores', 'doc_magnitudes']] = 0

In [20]:
#fill in missing values in langauges 
train['languages'] = train['languages'].fillna('NA')
train['Description'] = train['Description'].fillna('NA')

In [21]:
#fill in missing values in langauges 
test['languages'] = test['languages'].fillna('NA')
test['Description'] = test['Description'].fillna('NA')

In [22]:
#take out the sentence scores from the main train df and preserve them in a new df
train_sent_scores_df = train[['PetID','sent_scores','sent_magnitudes']]

#take out the sentence scores from the main test df and preserve them in a new df
test_sent_scores_df = test[['PetID','sent_scores','sent_magnitudes']]

In [23]:
#drop my petID column as it's a duplicate from PetID, as well as other columns not going into the model
train = train.drop(['petID','sent_scores','sent_magnitudes','text'],axis = 1)

test = test.drop(['petID','sent_scores','sent_magnitudes','text'],axis = 1)


In [24]:
train.tail()

Unnamed: 0.1,Unnamed: 0,Type,Name,Age,Breed1_0,Breed1_1,Breed1_2,Breed1_3,Breed1_4,Breed1_5,...,Black,Brown,Golden,Yellow,Cream,Gray,White,ColorCount,isGeneric_Breed,has_descriptoin
11989,5191,1,Rocky,48,0,0,0,1,1,1,...,1,0,0,0,0,0,0,1,0,1
11990,13418,1,Hearty,3,0,0,0,0,0,0,...,0,0,0,0,1,0,1,2,1,1
11991,5390,0,Arwen,12,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,1
11992,860,0,Shy Kittens,5,0,0,0,0,0,0,...,1,0,0,1,1,0,0,3,1,1
11993,7270,0,Toby,16,0,0,0,0,0,0,...,1,0,0,0,0,0,1,2,1,1


In [25]:
test.tail()

Unnamed: 0.1,Unnamed: 0,Type,Name,Age,Breed1_0,Breed1_1,Breed1_2,Breed1_3,Breed1_4,Breed1_5,...,Black,Brown,Golden,Yellow,Cream,Gray,White,ColorCount,isGeneric_Breed,has_descriptoin
2994,8191,1,Monica,3,0,0,0,0,0,0,...,1,1,0,0,0,0,0,2,1,1
2995,3297,1,Scott & Tyler,84,0,0,1,0,1,0,...,1,1,0,0,0,0,1,3,0,1
2996,14107,0,TT (TsingTao),3,0,0,0,1,0,0,...,1,1,0,0,0,0,0,2,0,1
2997,5513,1,Boy,60,0,0,0,0,0,1,...,0,0,0,0,1,1,0,2,0,1
2998,9938,0,Mochi,2,0,0,0,0,0,0,...,1,0,0,0,1,0,1,3,1,1


In [26]:
train.columns[:50]

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2',
       'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0',
       'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs',
       'vertex_ys', 'bounding_confidences', 'bounding_importance_fracs',
       'dominant_blues', 'dominant_greens', 'dominant_reds',
       'dominant_pixel_fracs'],
      dtype='object')

In [27]:
train.columns[50:150]

Index(['dominant_scores', 'label_descriptions', 'label_scores', 'doc_scores',
       'doc_magnitudes', 'languages', 'StateName', 'state_population',
       'median_state_income', 'Invalid_name', 'IsTopRescuer', 'RescuerCount',
       'Fee_binary', 'Fee_bin_1', 'Fee_bin_2', 'Fee_bin_3', 'Fee_bin_4',
       'Quantity_binary', 'Quantity_bin_1', 'Quantity_bin_2', 'Quantity_bin_3',
       'Age_guessed', 'Age_bin_1', 'Age_bin_2', 'Age_bin_3', 'Age_bin_4',
       'Age_bin_5', 'has_Video', 'has_Photo', 'isMale', 'isFemale', 'Black',
       'Brown', 'Golden', 'Yellow', 'Cream', 'Gray', 'White', 'ColorCount',
       'isGeneric_Breed', 'has_descriptoin'],
      dtype='object')

# Lucy FE/Data Cleaning

CURRENT FEATURE | ISSUE | ACTION | NEW FEATURE(S) CREATED | ORIGINAL FEATURE
-|-|-|-|-
'label_descriptions' | nominal (e.g., "dog", "mammal"), 73 unique, ~2% nulls | fill NA's w/ 'Unknown' & binary encode | 7: 'label_descriptions_0'- 'label_descriptions_6'| drop 
'languages' | nominal string (5 unique vals) | binary encode | 'languages_1', 'languages_2' | drop 
'Name' | nominal/string, many unique vals. | transform to binary (0=no name, 1=name)| 'has_name' | drop
'Description' | text strings| capture info in sentiment features/embeddings| 'doc_scores', 'has_description', ...  | drop 
'StateName' | nominal, redundant to binary encoded State Ids | drop | already done: 'State_ID 1' - '3' | drop 
'PetID' | nominal ID number not meaningful as quantity | retain for reference | – | KEEP 

## Check null values in train & test sets 

In [28]:
# define function to count nulls in each col 
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Dataframe has " + str(df.shape[1]) + " columns.\n"      
           + str(mis_val_table_ren_columns.shape[0]) + " columns have missing values.")
    return mis_val_table_ren_columns

missing_values_table(train)

Dataframe has 91 columns.
1 columns have missing values.


Unnamed: 0,Missing Values,% of Total Values
label_descriptions,269,2.2


In [29]:
missing_values_table(test)

Dataframe has 91 columns.
1 columns have missing values.


Unnamed: 0,Missing Values,% of Total Values
label_descriptions,74,2.5


### Fill missing 'label_descriptions' values & binary encode to use in modeling 

In [30]:
# fill NA values in label_descriptions column in with "Unknown" class
train.label_descriptions.fillna("Unknown", inplace=True)
test.label_descriptions.fillna("Unknown", inplace=True)

In [31]:
# check train 
missing_values_table(train)

Dataframe has 91 columns.
0 columns have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [32]:
# check test
missing_values_table(test)

Dataframe has 91 columns.
0 columns have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [33]:
# binary encode 'label_descriptions' feature
train, test = binary_encoding(train, test, target='label_descriptions')

In [34]:
# check train set: created 7 new features (label_descriptions_0 - label_descriptions_6)
train.columns 

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2',
       'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0',
       'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs',
       'vertex_ys', 'bounding_confidences', 'bounding_importance_fracs',
       'dominant_blues', 'dominant_greens', 'dominant_reds',
       'dominant_pixel_fracs', 'dominant_scores', 'label_descriptions_0',
       'label_descriptions_1', 'label_descriptions_2', 'label_descriptions_3',
       'label_descriptions_4', 'label_descriptions_5', 'label_descriptions_6',
       'label_scores', 'doc_scores'

In [35]:
# check test set: created 7 new features (label_descriptions_0 - label_descriptions_6)
test.columns 

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2',
       'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0',
       'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs',
       'vertex_ys', 'bounding_confidences', 'bounding_importance_fracs',
       'dominant_blues', 'dominant_greens', 'dominant_reds',
       'dominant_pixel_fracs', 'dominant_scores', 'label_descriptions_0',
       'label_descriptions_1', 'label_descriptions_2', 'label_descriptions_3',
       'label_descriptions_4', 'label_descriptions_5', 'label_descriptions_6',
       'label_scores', 'doc_scores'

### Binary encode 'languages' to use in modeling 

In [36]:
# binary encode 'language"
train, test = binary_encoding(train, test, target='languages')

In [37]:
# check train/test
train.columns

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2',
       'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0',
       'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs',
       'vertex_ys', 'bounding_confidences', 'bounding_importance_fracs',
       'dominant_blues', 'dominant_greens', 'dominant_reds',
       'dominant_pixel_fracs', 'dominant_scores', 'label_descriptions_0',
       'label_descriptions_1', 'label_descriptions_2', 'label_descriptions_3',
       'label_descriptions_4', 'label_descriptions_5', 'label_descriptions_6',
       'label_scores', 'doc_scores'

In [38]:
test.columns

Index(['Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2',
       'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0',
       'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs',
       'vertex_ys', 'bounding_confidences', 'bounding_importance_fracs',
       'dominant_blues', 'dominant_greens', 'dominant_reds',
       'dominant_pixel_fracs', 'dominant_scores', 'label_descriptions_0',
       'label_descriptions_1', 'label_descriptions_2', 'label_descriptions_3',
       'label_descriptions_4', 'label_descriptions_5', 'label_descriptions_6',
       'label_scores', 'doc_scores'

### ~~'Name': transform to binary (0=no name, 1=name)~~ (TBD)

### Drop unused categorical features (transformed to new features)

In [39]:
train = train.drop(['Unnamed: 0','Name','Description','StateName'], axis=1)
test = test.drop(['Unnamed: 0','Name','Description','StateName'], axis=1)

In [40]:
print(train.columns[:50])
print(train.columns[50:100])
print(test.columns[:50])
print(test.columns[50:100])

Index(['Type', 'Age', 'Breed1_0', 'Breed1_1', 'Breed1_2', 'Breed1_3',
       'Breed1_4', 'Breed1_5', 'Breed1_6', 'Breed1_7', 'Breed2_0', 'Breed2_1',
       'Breed2_2', 'Breed2_3', 'Breed2_4', 'Breed2_5', 'Breed2_6',
       'MaturitySize', 'FurLength', 'Vaccinated_1', 'Vaccinated_2',
       'Vaccinated_3', 'Dewormed_1', 'Dewormed_2', 'Dewormed_3',
       'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health', 'Quantity',
       'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3', 'VideoAmt',
       'PetID', 'PhotoAmt', 'AdoptionSpeed', 'vertex_xs', 'vertex_ys',
       'bounding_confidences', 'bounding_importance_fracs', 'dominant_blues',
       'dominant_greens', 'dominant_reds', 'dominant_pixel_fracs',
       'dominant_scores', 'label_descriptions_0', 'label_descriptions_1'],
      dtype='object')
Index(['label_descriptions_2', 'label_descriptions_3', 'label_descriptions_4',
       'label_descriptions_5', 'label_descriptions_6', 'label_scores',
       'doc_scores', 'doc_magnitu

### Check column data types 

In [41]:
print(test.dtypes[0:50])
print(test.dtypes[50:100])

Type                           int64
Age                            int64
Breed1_0                       int64
Breed1_1                       int64
Breed1_2                       int64
Breed1_3                       int64
Breed1_4                       int64
Breed1_5                       int64
Breed1_6                       int64
Breed1_7                       int64
Breed2_0                       int64
Breed2_1                       int64
Breed2_2                       int64
Breed2_3                       int64
Breed2_4                       int64
Breed2_5                       int64
Breed2_6                       int64
MaturitySize                   int64
FurLength                      int64
Vaccinated_1                   int64
Vaccinated_2                   int64
Vaccinated_3                   int64
Dewormed_1                     int64
Dewormed_2                     int64
Dewormed_3                     int64
Sterilized_1                   int64
Sterilized_2                   int64
S

## Output files

In [42]:
# train
train.to_csv('split_train_clean.csv')

In [43]:
# test 
test.to_csv('split_test_clean.csv')