# From existing notebook 3

In [1]:
# Standard
import pandas as pd
import numpy as np 

# sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

# tf and keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras import initializers
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import GlobalAveragePooling1D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam

# shap
import shap

# plots and images
import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import Image, display

import warnings
warnings.filterwarnings("ignore")

IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
Importing display from IPython.core.display is deprecated since IPython 7.14, please import from IPython display


In [15]:
# Locations of the pre-cleaned train / test splits
train_file = 'split_train_clean.csv'
test_file = 'split_test_clean.csv'

# Read both splits into DataFrames (train first, then test)
train, test = (pd.read_csv(csv_path) for csv_path in (train_file, test_file))

# Sanity check: row/column counts and the available columns
print(f'Shape of train file: {train.shape}')
print(f'Shape of test file: {test.shape}')
print(f'Train cols: {train.columns}')

Shape of train file: (11994, 92)
Shape of test file: (2999, 92)
Train cols: Index(['Unnamed: 0.1', 'Unnamed: 0', 'Type', 'Name', 'Age', 'Breed1_0',
       'Breed1_1', 'Breed1_2', 'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6',
       'Breed1_7', 'Breed2_0', 'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4',
       'Breed2_5', 'Breed2_6', 'MaturitySize', 'FurLength', 'Vaccinated_1',
       'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1', 'Dewormed_2',
       'Dewormed_3', 'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'Health',
       'Quantity', 'Fee', 'StateID_0', 'StateID_1', 'StateID_2', 'StateID_3',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt', 'AdoptionSpeed',
       'vertex_xs', 'vertex_ys', 'bounding_confidences',
       'bounding_importance_fracs', 'dominant_blues', 'dominant_greens',
       'dominant_reds', 'dominant_pixel_fracs', 'dominant_scores',
       'label_descriptions', 'label_scores', 'doc_scores', 'doc_magnitudes',
       'languages', 'StateName', 'state_populati

In [16]:
# Total number of missing cells in the training frame before cleaning
missing_before = train.isna().sum().sum()
print(f' Number of missing data points pre: {missing_before}')

# Keep only the rows that have a value in every column
train = train.dropna()

# Re-count to confirm nothing is missing any more
missing_after = train.isna().sum().sum()
print(f' Number of missing data points post: {missing_after}')

 Number of missing data points pre: 1441
 Number of missing data points post: 0


In [17]:
# Merge the original classes 0 and 1 into a single class and shift the rest
# down by one, so the target takes the four values 0..3.
relabel_dict = {0: 0, 1: 0, 2: 1, 3: 2, 4: 3}

# Apply the same mapping to both splits so their labels stay comparable
for frame in (train, test):
    frame['AdoptionSpeed'] = frame['AdoptionSpeed'].map(relabel_dict)

# Class distribution of the re-labelled training target
train['AdoptionSpeed'].value_counts()

1    2930
3    2820
0    2478
2    2360
Name: AdoptionSpeed, dtype: int64

In [18]:
# Downsample every class to the size of the rarest one so that all four
# AdoptionSpeed groups contribute the same number of pets to training.
# The size is derived from the data instead of being hard-coded, so the cell
# stays correct if the upstream cleaning changes the class counts.
group_size = int(train['AdoptionSpeed'].value_counts().min())

# Fixed seed so both the sampled rows and the shuffle are reproducible
DOWNSAMPLE_SEED = 0


def _sample_class(label):
    """Return `group_size` randomly chosen rows of `train` whose AdoptionSpeed equals `label`."""
    return train[train.AdoptionSpeed.eq(label)].sample(
        n=group_size,
        replace=False,
        random_state=DOWNSAMPLE_SEED)


# One equally sized sample per class (labels 0..3)
temp_0, temp_1, temp_2, temp_3 = (_sample_class(label) for label in range(4))

train_bal = pd.concat(
    [temp_0, temp_1, temp_2, temp_3],
    axis=0)

# Shuffle the balanced frame (frac=1 keeps every row). DataFrame.sample returns
# a NEW frame, so the result has to be assigned back; otherwise the rows stay
# grouped by class and anything that slices by position (e.g. Keras
# `validation_split`, which takes the tail of the arrays) sees a single class.
train_bal = train_bal.sample(frac=1, random_state=DOWNSAMPLE_SEED).reset_index(drop=True)

print('After downsampling, our data contains', train_bal.shape[0], 'pets and', train_bal.shape[1], 'columns')

After downsampling, our data contains 9440 pets and 92 columns


## Previous model: uses only the Description column

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the balanced descriptions (and their labels) for validation
X_train, X_val, y_train, y_val = train_test_split(
    train_bal['Description'],
    train_bal['AdoptionSpeed'],
    test_size=0.2,
    random_state=42)

# Hyperparameters of the text model
vocab_size, max_sequence_length, embedding_dim = 10000, 50, 64

# TextVectorization layer: maps raw strings to integer token ids, with every
# sequence padded/truncated to `max_sequence_length`
vectorize_layer = tf.keras.layers.TextVectorization(
    output_sequence_length=max_sequence_length,
    output_mode='int',
    max_tokens=vocab_size)

# Learn the vocabulary from the training descriptions only
vectorize_layer.adapt(X_train.to_numpy())


#vectorize the data
def vectorize_text(text):
    """Turn raw text into integer token sequences.

    Appends a trailing axis to `text` and passes the result through the
    module-level `vectorize_layer`, returning that layer's output.
    """
    return vectorize_layer(tf.expand_dims(text, -1))


# Seed numpy and TensorFlow so weight initialisation and batch shuffling are
# reproducible from run to run (same seeds as build_ffnn_with_embedding uses).
np.random.seed(0)
tf.random.set_seed(0)

# Convert the raw description strings of both splits into token-id sequences
X_train = vectorize_text(X_train)
X_val = vectorize_text(X_val)

# Build the model: embedding -> average over the sequence -> small dense head
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_sequence_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dense(4, activation='softmax')  # 4 classes in 'AdoptionSpeed'
])

# Sparse loss because the targets are integer class ids, not one-hot vectors
model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
              optimizer='adam',
              metrics=['accuracy'])

# Train the model, monitoring the held-out validation split each epoch
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_data=(X_val, y_val),
    verbose=1
)


2023-07-29 23:13:40.711125: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Metal device set to: Apple M2 Pro
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [7]:
# Evaluate the description-only model on the test split.
# Rows without a description cannot be vectorized, so drop them first.
test = test.dropna(subset=['Description'])

# Features: descriptions vectorized with the layer fitted on the training data
X_test = vectorize_text(test['Description'])
# Targets: AdoptionSpeed labels of the same rows
Y_test = test['AdoptionSpeed']

# Loss and accuracy on the test set
test_loss, test_accuracy = model.evaluate(X_test, Y_test, verbose=1)
print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_accuracy))


Test Loss: 1.752939224243164
Test Accuracy: 0.3457944095134735


## Now try adding all the other numeric features

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Flatten, Concatenate

In [19]:
# Columns removed before modelling: index artefacts, identifiers and
# free-text / categorical string columns.
# Result noted for this selection: val_accuracy ~0.3
drop_cols = [
    'Unnamed: 0.1',
    'Unnamed: 0',
    'Name',
    'PetID',
    'label_descriptions',
    'StateName',
    'languages',
]

# Alternative selections that were tried (kept for reference):
#
# (a) additionally dropping the image-metadata columns
# drop_cols = ['Unnamed: 0.1', 'Unnamed: 0', 'Name',
#              'PetID', 'vertex_xs', 'vertex_ys', 'bounding_confidences',
#              'bounding_importance_fracs', 'dominant_blues', 'dominant_greens',
#              'dominant_reds', 'dominant_pixel_fracs','label_descriptions','StateName','languages']
#
# (b) dropping most engineered features -- produced very bad results, val_accuracy 0.05
# drop_cols = ['Unnamed: 0.1', 'Unnamed: 0', 'Name','Breed1_0','PetID',
#        'Breed1_1', 'Breed1_2', 'Breed1_3', 'Breed1_4', 'Breed1_5', 'Breed1_6',
#        'Breed1_7', 'Breed2_0', 'Breed2_1', 'Breed2_2', 'Breed2_3', 'Breed2_4',
#        'Breed2_5', 'Breed2_6', 'Vaccinated_1',
#        'Vaccinated_2', 'Vaccinated_3', 'Dewormed_1', 'Dewormed_2',
#        'Dewormed_3', 'Sterilized_1', 'Sterilized_2', 'Sterilized_3', 'vertex_xs', 'vertex_ys', 'bounding_confidences',
#        'bounding_importance_fracs', 'dominant_blues', 'dominant_greens',
#        'dominant_reds', 'dominant_pixel_fracs', 'dominant_scores',
#        'label_descriptions', 'label_scores', 'doc_scores', 'doc_magnitudes',
#        'languages', 'StateName', 'Invalid_name', 'IsTopRescuer', 'RescuerCount', 'Fee_binary',
#        'Fee_bin_1', 'Fee_bin_2', 'Fee_bin_3', 'Fee_bin_4', 'Quantity_binary',
#        'Quantity_bin_1', 'Quantity_bin_2', 'Quantity_bin_3', 'Age_guessed',
#        'Age_bin_1', 'Age_bin_2', 'Age_bin_3', 'Age_bin_4', 'Age_bin_5']

# Work on an independent copy of the balanced frame without those columns
useful_train = train_bal.drop(columns=drop_cols).copy()

useful_train.shape



(9440, 85)

In [20]:
# Separate the training features from the target column
X_train = useful_train.drop(columns='AdoptionSpeed')
y_train = useful_train['AdoptionSpeed']

# TODO: build the matching test matrices. Draft kept from the original cell;
# note that drop(..., inplace=True) returns None, so its result must not be
# assigned.
# y_test = test['AdoptionSpeed']
# X_test = test.drop(columns = drop_cols,axis = 1, inplace =True)
# X_test = test.drop('AdoptionSpeed', axis=1)

# Report the resulting shapes
print(f'Shape of X_train: {X_train.shape}')
print(f'Shape of y_train: {y_train.shape}')


Shape of X_train: (9440, 84)
Shape of y_train: (9440,)


In [26]:
# Text hyperparameters: tokenizer word limit, padded sequence length and
# embedding width
NUM_WORDS, MAXLEN, EMBEDDING_DIM = 10000, 100, 64

# Fit a word-level tokenizer on the training descriptions
descriptions = X_train['Description']
tokenizer = Tokenizer(num_words=NUM_WORDS)
tokenizer.fit_on_texts(descriptions)

# Encode each description as a list of word indices, then pad/truncate all of
# them to MAXLEN
sequences = tokenizer.texts_to_sequences(descriptions)
X_train_description = pad_sequences(sequences, maxlen=MAXLEN)

# Standardize every remaining (non-text) feature.
# The same fitted scaler has to be applied to the test data later.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.drop(columns='Description'))



In [27]:
def build_ffnn_with_embedding(num_features, num_words, embedding_dim, maxlen,
                              learning_rate=0.001, num_classes=4,
                              dropout_rate=0.1, seed=0):
    """Build and compile a two-input classifier (token sequence + numeric features).

    The text input is embedded and flattened, concatenated with the numeric
    input, and passed through two ReLU dense layers (32 and 16 units), each
    followed by dropout, before a bias-free softmax output layer.

    Calling this function clears the Keras backend session and re-seeds both
    TensorFlow and numpy, so repeated calls start from the same random state.

    Args:
        num_features: Width of the numeric input vector.
        num_words: Vocabulary size, used as the embedding's `input_dim`.
        embedding_dim: Size of each embedding vector.
        maxlen: Length of every (padded) token sequence.
        learning_rate: Learning rate of the SGD optimizer.
        num_classes: Number of output classes (units of the softmax layer).
        dropout_rate: Dropout rate applied after each hidden dense layer.
        seed: Seed passed to `tf.random.set_seed` and `np.random.seed`.

    Returns:
        A compiled `tf.keras.Model` taking `[text_sequences, numeric_features]`
        as inputs, with sparse categorical cross-entropy loss and an accuracy
        metric.
    """
    # Start from a clean graph and a known random state
    tf.keras.backend.clear_session()
    tf.random.set_seed(seed)
    np.random.seed(seed)

    input_text = tf.keras.Input(shape=(maxlen,), name='Input_Text')
    input_num = tf.keras.Input(shape=(num_features,), name='Input_Numeric')

    # Embedding layer for text. The sequence length is already fixed by
    # `input_text`, so the deprecated `input_length` argument is not passed.
    x_text = tf.keras.layers.Embedding(input_dim=num_words, output_dim=embedding_dim)(input_text)
    x_text = tf.keras.layers.Flatten()(x_text)

    # Concatenate the flattened text representation with the numeric features
    x = tf.keras.layers.Concatenate()([x_text, input_num])

    x = tf.keras.layers.Dense(units=32, activation='relu', name='fc_1')(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)  # regularization

    x = tf.keras.layers.Dense(units=16, activation='relu', name='fc_2')(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)  # regularization

    output = tf.keras.layers.Dense(
          units=num_classes,
          use_bias=False,
          activation='softmax',
          name='Output')(x)

    model = tf.keras.Model(inputs=[input_text, input_num], outputs=output)

    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)

    # Sparse loss: targets are integer class ids rather than one-hot vectors
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=optimizer,
        metrics=['accuracy'])

    return model


In [28]:
# Instantiate the combined text + numeric network; the numeric input width is
# taken from the scaled feature matrix
model = build_ffnn_with_embedding(
    num_features=X_train_scaled.shape[1],
    num_words=NUM_WORDS,
    embedding_dim=EMBEDDING_DIM,
    maxlen=MAXLEN,
)




In [29]:
# Train on both inputs (padded token sequences + scaled numeric features).
# NOTE: Keras `validation_split` holds out the LAST 10% of the arrays, taken
# before any shuffling, so the rows must already be in random order.
history = model.fit(
    [X_train_description, X_train_scaled],
    y_train,
    validation_split=0.1,
    batch_size=16,
    epochs=15,
    verbose=1,
)


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
