In [1]:
import pandas as pd

# Step 1: Load Optipaw Data
optipaw_data = pd.read_csv('optipaw_FINAL.csv')

# Step 2: Clean the data
# Drop duplicate rows based on 'Animal.ID' (the first row per animal is kept)
data_clean_unique = optipaw_data.drop_duplicates(subset='Animal.ID')

# Drop names that contain any integers.
# The .copy() is taken AFTER this row filter so that the 'Name' assignment below
# writes into an independent frame instead of a slice of another frame, which is
# what makes pandas emit SettingWithCopyWarning.
name_has_digit = data_clean_unique['Name'].str.contains(r'\d', na=False)
data_clean_unique = data_clean_unique.loc[~name_has_digit].copy()

# Normalise the 'Name' column:
#   1. strip surrounding whitespace first, so an asterisk hidden behind leading
#      spaces (e.g. " *Rex") is actually at the start of the string,
#   2. strip leading asterisks,
#   3. strip again in case whitespace followed the asterisks,
#   4. capitalize the first letter.
data_clean_unique['Name'] = (
    data_clean_unique['Name']
    .str.strip()
    .str.lstrip('*')
    .str.strip()
    .str.capitalize()
)

# # Step 3: Remove duplicate rows based on 'Name'
# data_clean_unique = data_clean_unique.drop_duplicates(subset='Name')

# Step 4: Select relevant columns, drop any rows with missing values (remove NaN)
# and renumber the rows from 0 so positional and label indexing agree.
data_clean = (
    data_clean_unique[['Animal.Type', 'Breed', 'Sex', 'Color', 'Age', 'Name']]
    .dropna()
    .reset_index(drop=True)
)

# Optional: Display the cleaned data to verify
print(data_clean.head())

  Animal.Type                        Breed            Sex         Color   Age  \
0         Dog         Spinone Italiano Mix  Neutered Male  Yellow/White   7.0   
1         Dog                    Dachshund  Neutered Male      Tricolor  10.0   
2         Dog            Shetland Sheepdog  Neutered Male   Brown/White  16.0   
3         Dog  Labrador Retriever/Pit Bull  Spayed Female   Black/White  15.0   
4         Dog      Miniature Schnauzer Mix  Intact Female    Black/Gray  15.0   

     Name  
0   Scamp  
1    Oreo  
2  Bandit  
3  Bettie  
4   Sasha  


In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
# Preprocessing function
def preprocess_data(data_clean):
    """Build the feature matrix from the cleaned pet table.

    The text columns 'Breed', 'Color', 'Sex' and 'Animal.Type' are joined with
    ':' into one string per row and vectorised with a freshly fitted
    TfidfVectorizer; the 'Age' column is standardised with a freshly fitted
    StandardScaler. Both parts are stacked side by side into one dense array.

    Args:
        data_clean: DataFrame containing at least the columns 'Breed', 'Color',
            'Sex', 'Animal.Type' and 'Age'. It is NOT modified.

    Returns:
        A 4-tuple ``(combined_features_matrix, tfidf, scaler, data)`` where
        ``combined_features_matrix`` is the dense array (TF-IDF columns followed
        by one scaled-age column), ``tfidf`` and ``scaler`` are the fitted
        transformers, and ``data`` is a copy of ``data_clean`` with an extra
        'combined_features' column.
    """
    # Local import: this cell does not import numpy itself.
    import numpy as np

    # Work on a private copy so the caller's DataFrame does not silently gain a column.
    data_clean = data_clean.copy()

    # Step 1: Combine textual features (Breed, Color, Sex, Animal.Type).
    # Column-wise concatenation produces the same "Breed:Color:Sex:Animal.Type"
    # strings as a per-row apply, without a Python-level call for every row.
    data_clean['combined_features'] = (
        data_clean['Breed'].astype(str) + ':'
        + data_clean['Color'].astype(str) + ':'
        + data_clean['Sex'].astype(str) + ':'
        + data_clean['Animal.Type'].astype(str)
    )

    # Step 2: Apply TF-IDF to the textual features
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data_clean['combined_features'])

    # Step 3: Standardize the Age feature
    scaler = StandardScaler()
    age_scaled = scaler.fit_transform(data_clean[['Age']])

    # Step 4: Combine the TF-IDF matrix with the scaled Age feature
    combined_features_matrix = np.hstack((tfidf_matrix.toarray(), age_scaled))

    return combined_features_matrix, tfidf, scaler, data_clean

In [3]:
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Preprocessing function
def preprocess_data(data_clean):
    """Turn the cleaned pet table into a dense feature matrix plus fitted transformers.

    Each row's 'Breed', 'Color', 'Sex' and 'Animal.Type' values are joined with
    ':' and vectorised by a newly fitted TfidfVectorizer. 'Age' is standardised
    by a newly fitted StandardScaler and appended as the last column.

    Args:
        data_clean: DataFrame with at least the columns 'Breed', 'Color', 'Sex',
            'Animal.Type' and 'Age'. The argument itself is left untouched.

    Returns:
        ``(combined_features_matrix, tfidf, scaler, data)``: the dense feature
        array, the fitted TF-IDF vectoriser, the fitted scaler, and a new
        DataFrame equal to ``data_clean`` plus a 'combined_features' column.
    """
    # Step 1: Combine textual features (Breed, Color, Sex, Animal.Type).
    # Built column-wise (vectorised) rather than with a per-row apply; the
    # resulting strings are the same "Breed:Color:Sex:Animal.Type" values.
    combined = data_clean['Breed'].astype(str)
    for column in ('Color', 'Sex', 'Animal.Type'):
        combined = combined + ':' + data_clean[column].astype(str)

    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    data_clean = data_clean.assign(combined_features=combined)

    # Step 2: Apply TF-IDF to the textual features
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(data_clean['combined_features'])

    # Step 3: Standardize the Age feature
    scaler = StandardScaler()
    age_scaled = scaler.fit_transform(data_clean[['Age']])

    # Step 4: Combine the TF-IDF matrix with the scaled Age feature
    combined_features_matrix = np.hstack((tfidf_matrix.toarray(), age_scaled))

    return combined_features_matrix, tfidf, scaler, data_clean

# KNN recommendation function
def recommend_pet_names_knn(animal_type, breed, sex, color, age, tfidf, scaler, knn, data_clean, top_n=5):
    """Return the names of the ``top_n`` nearest pets according to ``knn``.

    The query is encoded as the text "breed:color:sex:animal_type" passed through
    ``tfidf.transform`` plus ``age`` passed through ``scaler.transform``; the two
    parts are stacked horizontally and handed to ``knn.kneighbors``.

    Args:
        animal_type, breed, sex, color: values interpolated into the query text.
        age: value wrapped as ``[[age]]`` and given to ``scaler.transform``.
        tfidf: object whose ``transform`` result exposes ``.toarray()``.
        scaler: object exposing ``transform``.
        knn: object exposing ``kneighbors(X, n_neighbors=...)``.
        data_clean: DataFrame indexed positionally with the neighbour indices;
            presumably in the same row order as the data ``knn`` was fitted on.
        top_n: number of neighbours requested.

    Returns:
        The 'Name' values (``.values``) of the selected rows.

    Side effects:
        Prints the neighbour distances, their indices and the selected rows.
    """
    report_columns = ['Animal.Type', 'Breed', 'Sex', 'Color', 'Age', 'Name']

    # Text half of the query: same field order as the training strings
    query_text = f"{breed}:{color}:{sex}:{animal_type}"
    text_part = tfidf.transform([query_text]).toarray()

    # Numeric half of the query: age on the scaler's scale
    age_part = scaler.transform([[age]])

    # Single row: TF-IDF columns followed by the scaled age
    query_vector = np.hstack([text_part, age_part])

    neighbor_distances, neighbor_rows = knn.kneighbors(query_vector, n_neighbors=top_n)

    # Debug output of the raw neighbour search result
    print("Distances to Neighbors:\n", neighbor_distances)
    print("Indices of Neighbors:\n", neighbor_rows)

    # Only one query row was submitted, so row 0 holds its neighbours
    matches = data_clean.iloc[neighbor_rows[0]]

    print("\nSelected Pets for Recommendations:")
    print(matches[report_columns])

    return matches['Name'].values

# # Example usage:

# # Step 1: Preprocess the data
# combined_features_matrix, tfidf, scaler, data_clean_processed = preprocess_data(data_clean.copy())

# # Step 2: Initialize the KNN model
# knn = NearestNeighbors(n_neighbors=5, metric='manhattan')
# knn.fit(combined_features_matrix)

# # Step 3: Use the KNN recommendation function
# recommended_names = recommend_pet_names_knn(
#     animal_type='Dog',
#     breed='Labrador Retriever/Pit Bull',
#     sex='Spayed Female',
#     color='Black/White',
#     age=3,
#     tfidf=tfidf,
#     scaler=scaler,
#     knn=knn,
#     data_clean=data_clean_processed
# )

# print("\nRecommended Pet Names:", recommended_names)

In [4]:
# TF-IDF Recommendation

import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Define the recommendation function for TF-IDF
def recommend_tf_idf(df, animal_type, breed, sex, color, age, tfidf, scaler, combined_features_matrix, top_n=5):
    """Return the names of the ``top_n`` pets most cosine-similar to the query.

    The query is encoded as the text "breed:color:sex:animal_type" passed through
    ``tfidf.transform`` plus ``age`` passed through ``scaler.transform``; the
    resulting row is compared against every row of ``combined_features_matrix``
    with cosine similarity.

    Args:
        df: DataFrame indexed positionally with the selected row numbers;
            presumably in the same row order as ``combined_features_matrix``.
        animal_type, breed, sex, color: values interpolated into the query text.
        age: value wrapped as ``[[age]]`` and given to ``scaler.transform``.
        tfidf: object whose ``transform`` result exposes ``.toarray()``.
        scaler: object exposing ``transform``.
        combined_features_matrix: matrix the query row is compared against.
        top_n: maximum number of names to return. Values of 0 or below yield an
            empty result; values above the number of rows yield every row.

    Returns:
        The 'Name' values (``.values``) of the selected rows, most similar first.

    Side effects:
        Prints the selected rows.
    """
    # Prepare the combined test input features
    test_combined_features = f"{breed}:{color}:{sex}:{animal_type}"

    # Vectorize the combined input features
    test_tfidf_vector = tfidf.transform([test_combined_features])

    # Scale the input age
    test_age_scaled = scaler.transform([[age]])

    # Combine the test TF-IDF vector with the scaled Age
    test_combined_features_vector = np.hstack((test_tfidf_vector.toarray(), test_age_scaled))

    # Calculate cosine similarity between the test vector and all pets in the dataset
    similarity_scores = cosine_similarity(test_combined_features_vector, combined_features_matrix).flatten()

    # Rank all rows from most to least similar, then keep the first top_n.
    # Reversing before slicing matters: the earlier form `[-top_n:]` selected the
    # WHOLE array when top_n was 0 (because -0 == 0) and an arbitrary tail for
    # negative values. For positive top_n both forms pick the same rows in the
    # same order.
    keep = max(top_n, 0)
    similar_pet_indices = similarity_scores.argsort()[::-1][:keep]

    selected = df.iloc[similar_pet_indices]

    # Print the selected entries from the original DataFrame
    print("\nSelected Entries for Recommendations:")
    print(selected[['Animal.Type', 'Breed', 'Sex', 'Color', 'Age', 'Name']])

    # Return the names of the most similar pets
    return selected['Name'].values

# # 1. Preprocess the data
# combined_features_matrix, tfidf, scaler, data_clean_processed = preprocess_data(data_clean.copy())

# # 2. Test the TF-IDF recommender with sample input
# recommended_names_tfidf = recommend_tf_idf(
#     df=data_clean_processed,
#     animal_type='Dog',
#     breed='Labrador Retriever/Pit Bull',
#     sex='Spayed Female',
#     color='Black/White',
#     age=3,
#     tfidf=tfidf,
#     scaler=scaler,
#     combined_features_matrix=combined_features_matrix
# )

# # 3. Print the recommended pet names
# print("\nRecommended Pet Names:", recommended_names_tfidf)


In [5]:
import warnings

# Suppress the specific warning from sklearn about feature names.
# NOTE(review): presumably raised because the scaler is fitted on a DataFrame column
# (data_clean[['Age']]) but later called with a plain nested list ([[age]]) in the
# recommendation functions — confirm. The filter is process-wide once this cell runs.
warnings.filterwarnings("ignore", message="X does not have valid feature names, but StandardScaler was fitted with feature names")

# Validation function to get user input
def validate_input(prompt, valid_options=None, allow_spaces_and_slashes=False, return_default=False):
    """Prompt on stdin until an acceptable answer (or the default) can be returned.

    Args:
        prompt: text shown to the user by ``input``.
        valid_options: optional collection of accepted answers. When it is truthy
            the answer is title-cased and must be a member of it.
        allow_spaces_and_slashes: free-text mode only; when true every '/' in the
            answer is replaced by a space and the result is stripped again.
        return_default: when true, an unacceptable or empty answer returns ``0``
            immediately instead of prompting again.

    Returns:
        The title-cased answer (option mode), the possibly cleaned answer
        (free-text mode), or the integer ``0`` when ``return_default`` applies.
    """
    while True:
        raw = input(prompt).strip()

        # Free-text mode: anything non-empty is accepted
        if not valid_options:
            text = raw.replace('/', ' ').strip() if allow_spaces_and_slashes else raw
            if text:
                return text
            if return_default:
                return 0  # Return default for empty input
            continue

        # Option mode: compare the title-cased answer against the allowed values
        candidate = raw.title()
        if candidate in valid_options:
            return candidate
        if return_default:
            return 0  # Return default value (e.g., 0) for invalid input
        print(f"Invalid input! Please choose from: {', '.join(valid_options)}")


# Accepted answers for the option-style prompts (compared after title-casing)
animal_type_options = [
    'Dog', 'Cat', 'Other', 'Bird', 'Livestock',
    'House Rabbit', 'Rat', 'Ferret', 'Pig', 'Hamster',
    'Guinea Pig', 'Gerbil', 'Hedgehog', 'Chinchilla', 'Goat',
    'Mouse', 'Sugar Glider', 'Snake', 'Wildlife', 'Lizard',
]
sex_options = [
    'Neutered Male', 'Spayed Female',
    'Intact Female', 'Intact Male',
    'Unknown', 'Female', 'Male',
]


# Function to run both recommendation systems
def run_recommendation_systems():
    """Interactive loop: prompt for a pet description and print both recommendations.

    Reads the module-level names ``combined_features_matrix``, ``tfidf``,
    ``scaler`` and ``data_clean_processed`` (produced by ``preprocess_data``),
    plus ``animal_type_options`` and ``sex_options``.

    For every round the user is asked for animal type, breed, sex, colour and
    age; the KNN and the TF-IDF recommenders are then run on that input and their
    results printed. The loop ends when the user types 'exit' at the animal-type
    prompt or answers anything other than 'yes' to the continue question.

    Invalid animal type / sex answers become ``0``; an age that is not a finite
    number becomes ``0``.
    """
    # validate_input title-cases the answer and only returns it when it is one of
    # the allowed options; anything else is turned into the default 0. The exit
    # word therefore has to be offered as an allowed option (in title case),
    # otherwise typing 'exit' could never be detected.
    exit_word = 'Exit'
    animal_type_choices = list(animal_type_options) + [exit_word]

    # The training matrix does not change between rounds, so the neighbour index
    # is built once instead of being refitted for every query.
    knn = NearestNeighbors(n_neighbors=5, metric='manhattan')
    knn.fit(combined_features_matrix)

    while True:
        print("\nPlease enter details for pet name recommendation (type 'exit' to quit):")

        # Get user input with validation, using 0 for invalid entries
        animal_type = validate_input("Enter Animal Type: ", animal_type_choices, return_default=True)
        if animal_type == exit_word:
            break

        breed = validate_input("Enter Breed: ", allow_spaces_and_slashes=True)
        sex = validate_input("Enter Sex: ", sex_options, return_default=True)
        color = validate_input("Enter Color: ", allow_spaces_and_slashes=True)

        try:
            age = float(validate_input("Enter Age: ", return_default=True))
            # float() happily parses 'nan' and 'inf'; treat those like any other
            # unusable age instead of feeding them to the scaler and the models.
            if not np.isfinite(age):
                raise ValueError("age must be a finite number")
        except ValueError:
            print("Invalid age input. Returning 0 as default.")
            age = 0

        # Run the KNN recommendation system
        print("\nRunning KNN Recommendation System:")
        recommended_names_knn = recommend_pet_names_knn(
            animal_type=animal_type,
            breed=breed,
            sex=sex,
            color=color,
            age=age,
            tfidf=tfidf,
            scaler=scaler,
            knn=knn,
            data_clean=data_clean_processed
        )
        print("\nKNN Recommended Names:")
        print(recommended_names_knn)

        # Run the TF-IDF recommendation system
        print("\nRunning TF-IDF Recommendation System:")
        recommended_names_tfidf = recommend_tf_idf(
            df=data_clean_processed,
            animal_type=animal_type,
            breed=breed,
            sex=sex,
            color=color,
            age=age,
            tfidf=tfidf,
            scaler=scaler,
            combined_features_matrix=combined_features_matrix
        )
        print("\nTF-IDF Recommended Names:")
        print(recommended_names_tfidf)

        # Ask if the user wants to enter another set of inputs
        continue_choice = input("\nWould you like to enter another pet? (yes/no): ").strip().lower()
        if continue_choice != 'yes':
            print("Exiting the recommendation systems")
            break


# Preprocess the data for both systems using preprocess_data function.
# A copy of data_clean is passed in, and the four results are bound at module level
# because run_recommendation_systems() reads these exact names as globals.
combined_features_matrix, tfidf, scaler, data_clean_processed = preprocess_data(data_clean.copy())

# Run the loop (blocks on input() until the user exits)
run_recommendation_systems()

# Test input: ('Dog', 'Labrador Retriever/Pit Bull', 'Spayed Female', 'Black/White', 3)


Please enter details for pet name recommendation (type 'exit' to quit):
Enter Animal Type: Dog
Enter Breed: Labrador Retriever/Pit Bull
Enter Sex: Spayed Female
Enter Color: Black/White
Enter Age: 3

Running KNN Recommendation System:
Distances to Neighbors:
 [[0. 0. 0. 0. 0.]]
Indices of Neighbors:
 [[65251 55525 42862 77254 23650]]

Selected Pets for Recommendations:
      Animal.Type                        Breed            Sex        Color  \
65251         Dog  Labrador Retriever/Pit Bull  Spayed Female  Black/White   
55525         Dog  Pit Bull/Labrador Retriever  Spayed Female  Black/White   
42862         Dog  Labrador Retriever/Pit Bull  Spayed Female  Black/White   
77254         Dog  Pit Bull/Labrador Retriever  Spayed Female  Black/White   
23650         Dog  Labrador Retriever/Pit Bull  Spayed Female  Black/White   

       Age      Name  
65251  3.0      Lola  
55525  3.0      Niki  
42862  3.0    Aurora  
77254  3.0    Ripley  
23650  3.0  Midnight  

KNN Recommended Nam