## Read data frame and keep all unique pet name with other improtant attributes

In [56]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns
# Step 1: Load and clean the data
data = pd.read_csv('/content/drive/MyDrive/Sudata Mentoring Programme Shared Folder/integrated data /optipaw_FINAL.csv')
# we only need the unique pet name, so drop all duplicated rows
data_clean_unique = data.drop_duplicates(subset='Animal.ID')
# Optional: Reset index if needed
data_clean_unique.reset_index(drop=True, inplace=True)
data_clean = data_clean_unique[['Animal.Type', 'Breed', 'Sex', 'Color', 'Age', 'Name']].dropna()

## Feature encoding
There are multiple ways to do the feature encoding. One is one-hot encoding, I modified YC's code to make it works on the recommendation system. The other way is the classical Term Frequency - Inverse Document Frequency (TF-IDF) encoding, please google the term and learn the detial of the method by yourself. We can present these two ways in the final presentation.

### one hot encoding function on train data and test data

In [58]:
def preprocessing(df):
    animal_mapping = {
        "Dog": 1, "Cat": 2, "Other": 3, "Bird": 4, "Livestock": 5,
        "House Rabbit": 6, "Rat": 7, "Ferret": 8, "Pig": 9, "Hamster": 10,
        "Guinea Pig": 11, "Gerbil": 12, "Hedgehog": 13, "Chinchilla": 14,
        "Goat": 15, "Mouse": 16, "Sugar Glider": 17, "Snake": 18,
        "Wildlife": 19, "Lizard": 20
    }
    # df = dft.copy()
    df['Animal.Type'] = df['Animal.Type'].map(animal_mapping)

    # For Breed we will perform one hot encoding onto it
    # Remove parentheses and their contents, and replace '/' with space
    df['Breed'] = df['Breed'].str.replace(r'\(.*?\)', '', regex=True).str.replace('/', ' ').str.replace(',', ' ')

    # Split the 'Breed' column into a list and capitalize the first letter of each word
    df['Breed'] = df['Breed'].str.split().apply(lambda breeds: [breed.rstrip('-').capitalize() for breed in breeds])

    # Create dummy variables for each unique breed
    df_breeds = df['Breed'].str.join(' ').str.get_dummies(sep=' ')

    # Concatenate the original dataframe with the one-hot encoded breed dataframe
    df = pd.concat([df, df_breeds], axis=1)

    # Drop the original 'Breed' column
    df = df.drop(columns=['Breed'])

    # For Sex we will map Int Values to specific Sex
    sex_mapping = {'Neutered Male': 1, 'Spayed Female': 2, 'Intact Female': 3, 'Intact Male': 4, 'Unknown': 5, 'Female': 6, 'Male': 7}

    # Map the Sex column using the defined mapping
    df['Sex'] = df['Sex'].map(sex_mapping)

    # For colours, we will split into individual colours and use one hot encoding, which is assigning binary values to it

    # Split the 'Color' column by '/', 'and', and ','
    df['Color'] = df['Color'].str.replace('/', ' ').str.replace('and', ' ').str.replace(',', ' ').str.replace(r'-\b', '', regex=True)

    # Split the 'Color' column into a list and capitalize the first letter of each word
    df['Color'] = df['Color'].str.split().apply(lambda colors: [color.capitalize() for color in colors])

    # Create dummy variables for each unique color
    df_colors = df['Color'].str.join(' ').str.get_dummies(sep=' ')

    # Concatenate the original dataframe with the one-hot encoded color dataframe
    df = pd.concat([df, df_colors], axis=1)

    # Drop the original 'Color' column
    df = df.drop(columns=['Color'])

    # For Age, we will just store it as int and impute it with 0 if it is null, and store it as float
    df['Age'] = df['Age'].fillna(0).astype(float)
    return df, list(df_breeds.columns), list(df_colors.columns)


def transform(df, train_breed_columns, train_color_columns):
    # Perform the same preprocessing as above

    animal_mapping = {
        "Dog": 1, "Cat": 2, "Other": 3, "Bird": 4, "Livestock": 5,
        "House Rabbit": 6, "Rat": 7, "Ferret": 8, "Pig": 9, "Hamster": 10,
        "Guinea Pig": 11, "Gerbil": 12, "Hedgehog": 13, "Chinchilla": 14,
        "Goat": 15, "Mouse": 16, "Sugar Glider": 17, "Snake": 18,
        "Wildlife": 19, "Lizard": 20
    }

    df['Animal.Type'] = df['Animal.Type'].map(animal_mapping)

    # For Breed
    df['Breed'] = df['Breed'].str.replace(r'\(.*?\)', '', regex=True).str.replace('/', ' ').str.replace(',', ' ')
    df['Breed'] = df['Breed'].str.split().apply(lambda breeds: [breed.rstrip('-').capitalize() for breed in breeds])
    df_breeds = df['Breed'].str.join(' ').str.get_dummies(sep=' ')

     # Identify missing columns in the new dataset compared to the training set
    missing_breed_cols = list(set(train_breed_columns) - set(df_breeds.columns))  # Convert set to list

    # Create a DataFrame with the missing columns set to 0
    missing_breed_df = pd.DataFrame(0, index=df_breeds.index, columns=missing_breed_cols)

    # Concatenate the original breed DataFrame with the missing columns DataFrame
    df_breeds = pd.concat([df_breeds, missing_breed_df], axis=1)

    # Ensure the column order matches the training set
    df_breeds = df_breeds.reindex(columns=train_breed_columns)

    # Optionally, reset the index of df_breeds if needed to avoid any misalignment
    df_breeds.reset_index(drop=True, inplace=True)
    # For Sex we will map Int Values to specific Sex
    sex_mapping = {'Neutered Male': 1, 'Spayed Female': 2, 'Intact Female': 3, 'Intact Male': 4, 'Unknown': 5, 'Female': 6, 'Male': 7}

    df['Sex'] = df['Sex'].map(sex_mapping)

    # For Color
    df['Color'] = df['Color'].str.replace('/', ' ').str.replace('and', ' ').str.replace(',', ' ').str.replace(r'-\b', '', regex=True)
    df['Color'] = df['Color'].str.split().apply(lambda colors: [color.capitalize() for color in colors])
    df_colors = df['Color'].str.join(' ').str.get_dummies(sep=' ')

    # Ensure the new data has the same color columns as the training set
    for col in train_color_columns:
        if col not in df_colors.columns:
            df_colors[col] = 0  # Add missing columns with zero
    df_colors = df_colors[train_color_columns]  # Keep only the columns from the training set

    # Concatenate with the original dataframe
    df = pd.concat([df, df_breeds, df_colors], axis=1).drop(columns=['Breed', 'Color'])

    # For Age
    df['Age'] = df['Age'].fillna(0).astype(float)

    return df

# Use the transform function to process new data


In [59]:
## this is based on cosine similarity and the one-hot encoding.
## encoding the train data
train_data, train_breed_columns, train_color_columns = preprocessing(data_clean)
train_df = train_data.drop(columns=['Name'])

# Define the recommendation function
def recommend_pet_names(animal_type, breed, sex, color, age, top_n=5):
    # Create a feature array for the input
    input_features = pd.DataFrame([[animal_type, breed, sex, color, age]],
                                  columns= ['Animal.Type', 'Breed', 'Sex', 'Color','Age'])
    # One-hot encode the input features
    input_encoded  = transform(input_features, train_breed_columns, train_color_columns)
    # Calculate similarity between input and all existing pets
    input_similarity = cosine_similarity(input_encoded, train_df).flatten()
    # print(input_similarity.argsort()[-top_n:])
    # Get indices of the top N most similar pets
    similar_pet_indices = input_similarity.argsort()[-top_n:][::-1]
    print(data_clean.iloc[similar_pet_indices])
    # Return the names of the most similar pets
    return train_data.iloc[similar_pet_indices]['Name'].values

In [60]:
recommendations = recommend_pet_names('Dog', 'Labrador Retriever/Pit Bull', 'Spayed Female', 'Black/White', 3)
print("Recommended pet names:", recommendations)


       Animal.Type                             Breed            Sex  \
80287            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
42966            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   
79291            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
55826            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
65714            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   

             Color  Age    Name  
80287  Black/White  3.0    Ruby  
42966  Black/White  3.0  Aurora  
79291  Black/White  3.0  Ripley  
55826  Black/White  3.0    Niki  
65714  Black/White  3.0    Lola  
Recommended pet names: ['Ruby' 'Aurora' 'Ripley' 'Niki' 'Lola']


Here, I updated a code chunk to print the cosine similarity score. You can see the some cosine values = 1, which means there might be excatly matched "animal_type, breed, sex, color, age" in the dataset.

In [61]:
def recommend_pet_names(animal_type, breed, sex, color, age, top_n=5):
    # Create a feature array for the input
    input_features = pd.DataFrame([[animal_type, breed, sex, color, age]],
                                  columns= ['Animal.Type', 'Breed', 'Sex', 'Color','Age'])
    # One-hot encode the input features
    input_encoded  = transform(input_features, train_breed_columns, train_color_columns)
    return(input_encoded)
X= recommend_pet_names('Dog', 'Labrador Retriever/Pit Bull', 'Spayed Female', 'Black/White', 3)
np.sort(cosine_similarity(X, train_df).flatten())[::-1]

array([1.        , 1.        , 1.        , ..., 0.25604209, 0.24140952,
       0.2363339 ])

### TF-IDF encoding

The code are more simple than the above because we can directly use the tfidf function, we don't need to write our own encoding function.

In [62]:
#Combine textual features (Breed, Color, Sex, Animal.Type)
data_clean['combined_features'] = data_clean.apply(lambda x: f"{x['Breed']}:{x['Color']}:{x['Sex']}:{x['Animal.Type']}", axis=1)

#Apply TF-IDF to the textual features
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data_clean['combined_features'])

#Standardize the Age feature
scaler = StandardScaler()
age_scaled = scaler.fit_transform(data_clean[['Age']])

#Combine the TF-IDF matrix with the scaled Age feature
combined_features_matrix = np.hstack((tfidf_matrix.toarray(), age_scaled))

#Define the recommendation function
def recommend_tf_idf(animal_type, breed, sex, color, age, top_n=5):
    # Combine the input features for the test vector (excluding Age for now)
    test_combined_features = f"{breed}:{color}:{sex}:{animal_type}"
    # Transform the test vector using the same TF-IDF model
    test_tfidf_vector = tfidf.transform([test_combined_features])
    # Standardize the input Age feature
    test_age_scaled = scaler.transform([[age]])
    # Combine the test TF-IDF vector with the scaled Age
    test_combined_features_vector = np.hstack((test_tfidf_vector.toarray(), test_age_scaled))
    # Calculate the cosine similarity between the test vector and all pets in the dataset
    similarity_scores = cosine_similarity(test_combined_features_vector, combined_features_matrix).flatten()
    # Get the indices of the most similar pets
    similar_pet_indices = similarity_scores.argsort()[-top_n:][::-1]
    # Print details of the most similar pets
    print(data_clean.iloc[similar_pet_indices][['Animal.Type','Breed','Sex','Color','Age','Name']])
    # Return the names of the most similar pets
    return data_clean.iloc[similar_pet_indices]['Name'].values


In [63]:
recommendations = recommend_tf_idf('Dog', 'Labrador Retriever/Pit Bull', 'Spayed Female', 'Black/White', 3)
print("Recommended pet names:", recommendations)



       Animal.Type                             Breed            Sex  \
42966            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   
80287            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
79291            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
65714            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   
55826            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   

             Color  Age    Name  
42966  Black/White  3.0  Aurora  
80287  Black/White  3.0    Ruby  
79291  Black/White  3.0  Ripley  
65714  Black/White  3.0    Lola  
55826  Black/White  3.0    Niki  
Recommended pet names: ['Aurora' 'Ruby' 'Ripley' 'Lola' 'Niki']


### KNN method

In [64]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sklearn.neighbors import NearestNeighbors

# Step 1: Combine textual features (Breed, Color, Sex, Animal.Type)
data_clean['combined_features'] = data_clean.apply(lambda x: f"{x['Breed']}:{x['Color']}:{x['Sex']}:{x['Animal.Type']}", axis=1)

# Step 2: Apply TF-IDF to the textual features
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(data_clean['combined_features'])

# Step 3: Standardize the Age feature
scaler = StandardScaler()
age_scaled = scaler.fit_transform(data_clean[['Age']])

# Step 4: Combine the TF-IDF matrix with the scaled Age feature
combined_features_matrix = np.hstack((tfidf_matrix.toarray(), age_scaled))

knn = NearestNeighbors(n_neighbors=5, metric='manhattan')  # or metric='minkowski'
 ## try other Distance Metrics: manhattan,minkowski,euclidean
knn.fit(combined_features_matrix)
# Step 5: Define the recommendation function
def recommend_pet_names_knn(animal_type, breed, sex, color, age, top_n=5):
    # Combine the input features for the test vector (excluding Age for now)
    test_combined_features = f"{breed}:{color}:{sex}:{animal_type}"

    # Transform the test vector using the same TF-IDF model
    test_tfidf_vector = tfidf.transform([test_combined_features])

    # Standardize the input Age feature
    test_age_scaled = scaler.transform([[age]])

    # Combine the test TF-IDF vector with the scaled Age
    test_combined_features_vector = np.hstack((test_tfidf_vector.toarray(), test_age_scaled))

    distances, indices = knn.kneighbors(test_combined_features_vector, n_neighbors=5)
    print(distances)
    print(indices)
    # Get the indices of the most similar pets
    # similar_pet_indices = similarity_scores.argsort()[-top_n:][::-1]

    # Print details of the most similar pets
    print(data_clean.iloc[indices[0]])

    # Return the names of the most similar pets
    return data_clean.iloc[indices[0]]['Name'].values



In [65]:
recommend_pet_names_knn('Dog', 'Labrador Retriever/Pit Bull', 'Spayed Female', 'Black/White', 3)

[[0.91280355 0.91280355 0.91280355 0.91280355 0.91280355]]
[[55822 42964 23685 79286 65709]]
       Animal.Type                             Breed            Sex  \
55826            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
42966            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   
23685            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   
79291            1  [Pit, Bull, Labrador, Retriever]  Spayed Female   
65714            1  [Labrador, Retriever, Pit, Bull]  Spayed Female   

             Color  Age      Name  \
55826  Black/White  3.0      Niki   
42966  Black/White  3.0    Aurora   
23685  Black/White  3.0  Midnight   
79291  Black/White  3.0    Ripley   
65714  Black/White  3.0      Lola   

                                       combined_features  
55826  ['Pit', 'Bull', 'Labrador', 'Retriever']:Black...  
42966  ['Labrador', 'Retriever', 'Pit', 'Bull']:Black...  
23685  ['Labrador', 'Retriever', 'Pit', 'Bull']:Black...  
79291  ['Pit', 'Bull'



array(['Niki', 'Aurora', 'Midnight', 'Ripley', 'Lola'], dtype=object)