## Data preprocessing


In [1]:
import pandas as pd
import sqlite3
# Never truncate displayed frames/series: a row limit of None shows every row.
pd.options.display.max_rows = None


## Import the data

In [2]:
# Read the raw intake records from the CSV export into a DataFrame.
data_acc_intakes = pd.read_csv(
    filepath_or_buffer='data/animal_data_8_24/aac_intakes.csv',
)


In [3]:
# Preview the first rows to sanity-check the columns that were loaded.
data_acc_intakes.head()

Unnamed: 0,age_upon_intake,animal_id,animal_type,breed,color,datetime,datetime2,found_location,intake_condition,intake_type,name,sex_upon_intake
0,8 years,A706918,Dog,English Springer Spaniel,White/Liver,2015-07-05T12:59:00.000,2015-07-05T12:59:00.000,9409 Bluegrass Dr in Austin (TX),Normal,Stray,Belle,Spayed Female
1,11 months,A724273,Dog,Basenji Mix,Sable/White,2016-04-14T18:43:00.000,2016-04-14T18:43:00.000,2818 Palomino Trail in Austin (TX),Normal,Stray,Runster,Intact Male
2,4 weeks,A665644,Cat,Domestic Shorthair Mix,Calico,2013-10-21T07:59:00.000,2013-10-21T07:59:00.000,Austin (TX),Sick,Stray,,Intact Female
3,4 years,A682524,Dog,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,2014-06-29T10:38:00.000,2014-06-29T10:38:00.000,800 Grove Blvd in Austin (TX),Normal,Stray,Rio,Neutered Male
4,2 years,A743852,Dog,Labrador Retriever Mix,Chocolate,2017-02-18T12:46:00.000,2017-02-18T12:46:00.000,Austin (TX),Normal,Owner Surrender,Odin,Neutered Male


In [4]:
# Drop every row whose animal_type mentions Bird, Other or Livestock in a
# single pass. na=True treats a missing animal_type as "excluded", which is
# what comparing the contains() result to False did (NaN == False is False).
data_acc_intakes = data_acc_intakes[
    ~data_acc_intakes['animal_type'].str.contains('Bird|Other|Livestock', na=True)
]

# Remaining animal types and how many rows each has.
data_acc_intakes['animal_type'].value_counts()

Dog    45743
Cat    29659
Name: animal_type, dtype: int64

In [5]:
# Single-column DataFrame holding only the raw "color" values.
all_color = pd.DataFrame(data_acc_intakes['color'])

## Define preprocessing functions

In [6]:
#color name substitution 
def replace_colors(color_list, new_color, series):
    """Replace each name in ``color_list`` with ``new_color`` inside ``series``.

    Matching is by literal substring, so ``"Tan"`` is replaced wherever it
    occurs in a value, not only when it is the whole value. Names are applied
    in the order given, so an earlier replacement can affect what a later
    name matches.

    Parameters
    ----------
    color_list : iterable of str
        Color names (substrings) to replace.
    new_color : str
        Text substituted for every match; may be the empty string.
    series : pandas.Series
        Series of color strings. It is not modified.

    Returns
    -------
    pandas.Series
        A new series with all replacements applied.
    """
    result = series.copy()
    for color in color_list:
        # regex=False pins literal matching: the default of `regex` differs
        # between pandas 1.x (regex) and 2.x (literal), and color names are
        # plain text, never patterns.
        result = result.str.replace(color, new_color, regex=False)
    return result

In [7]:
#check for duplicate values separated by "/"

def is_duplicate_value(color):
    """Collapse a "X/X" color into "X"; return anything else unchanged.

    Only strings with exactly two "/"-separated parts are considered. When
    both parts are equal the single part is returned, otherwise the input
    string is returned as-is.
    """
    parts = color.split("/")
    is_repeated_pair = len(parts) == 2 and parts[0] == parts[1]
    return parts[0] if is_repeated_pair else color

In [8]:
# Collapse any color that mentions "Calico" or "Tricolor" into the single label "Tricolor"
def is_tricolor(in_color):
    """Return "Tricolor" if the color mentions Calico or Tricolor.

    The string is broken into tokens on "/" and on single spaces; only
    whole-token matches count. Any other input is returned unchanged.
    """
    tokens = in_color.replace("/", " ").split(" ")
    if "Calico" in tokens or "Tricolor" in tokens:
        return "Tricolor"
    return in_color

In [9]:
# Collapse any color that mentions a striped-pattern name (Tabby, Brindle, Tortie, ...) into "Striped"
def has_stripes(in_color):
    """Return "Striped" if the color mentions any striped-pattern name.

    The string is broken into tokens on "/" and on single spaces, and each
    token is compared against the known pattern names. With no whole-token
    match the input is returned unchanged.
    """
    stripe_names = {"Torbie", "Striped", "Tabby", "Tortie", "Tiger", "Brindle", "Sable"}
    tokens = in_color.replace("/", " ").split(" ")
    return "Striped" if stripe_names.intersection(tokens) else in_color

In [10]:
#defining all animals with spots
def has_spots(in_color):
    """Return "Spotted" if the color mentions Merle or Spotted.

    The string is broken into tokens on "/" and on single spaces; only
    whole-token matches count. Any other input is returned unchanged.
    """
    tokens = in_color.replace("/", " ").split(" ")
    if any(token in ("Merle", "Spotted") for token in tokens):
        return "Spotted"
    return in_color

## Call cleaning functions

In [38]:
#function for performing all of the above operations at once!
def perform_all_color_cleaning(series):
    """Normalize a series of raw color strings to a small set of labels.

    Steps, in order:
      1. Substring substitution of synonyms via ``replace_colors``, one
         target label at a time, following ``color_map`` insertion order.
      2. ``is_duplicate_value``: "X/X" becomes "X".
      3. ``has_spots``, ``has_stripes``, ``is_tricolor``: a value mentioning
         one of those patterns is replaced entirely by the pattern label.

    Parameters
    ----------
    series : pandas.Series
        Series of color strings, with "/" separating the parts of a value.

    Returns
    -------
    pandas.Series
        A new series of cleaned color strings.
    """
    # Order matters, both across keys and inside each list: a later
    # replacement sees the output of the earlier ones. For example " Tick" is
    # stripped first, so "Blue Tick" is just "Blue" when the Gray step runs.
    color_map = {
        "": [" Tick"],
        "Brown": ["Chocolate", "Liver", "Ruddy"],
        "White": ["Flame Point", "Lilac Point"],
        "Beige": ["Buff", "Tan", "Fawn", "Yellow", "Gold", "Cream", "Seal Point", "Lynx Point", "Brown Point", "Apricot", "Pink"],
        "Orange": ["Orange Tabby", "Red"],
        "Tricolor": ["Tricolor", "Calico"],
        "Spotted": ["Black Merle", "Brown Merle", "Gray Merle", "Orange Merle"],
        "Striped": ["Tiger", "Tabby"],
        # Single-word gray synonyms are rewritten to "Gray" first, so the
        # compound entries after them also catch values that only become
        # "Gray ..." during this step: "Blue Point" -> "Gray Point" -> "Gray",
        # "Blue Smoke" -> "Gray Smoke" -> "Gray". With the compounds listed
        # first, those values were left as "Gray Point" / "Gray Smoke".
        # "Silver Lynx Point" is kept for reference only: "Lynx Point" has
        # already become "Beige" by now, so that value is handled by the
        # "Silver" and "Gray Beige" entries.
        "Gray": ["Silver Lynx Point", "Grey", "Blue", "Silver", "Agouti", "Black Smoke", "Gray Smoke", "Gray Point", "Gray Beige"],
    }
    for new_color, synonyms in color_map.items():
        series = replace_colors(synonyms, new_color, series)

    # Whole-value clean-ups, applied after all substring substitutions.
    series = series.map(is_duplicate_value)
    series = series.map(has_spots)
    series = series.map(has_stripes)
    series = series.map(is_tricolor)
    return series

In [39]:
# Confirm that selecting the "color" column yields a Series, which is what
# perform_all_color_cleaning needs (it uses .str via replace_colors and .map).
type(all_color["color"])

pandas.core.series.Series

In [36]:
# Flatten every "/"-separated part of every entry in `colors` into one list,
# then display the distinct parts.
# NOTE(review): `colors` is not defined in any visible cell -- presumably the
# unique cleaned color values; confirm and define it explicitly so the
# notebook survives Restart & Run All.
final = []
for el in [color.split("/") for color in colors]:
    final += el
list(set(final))

['Spotted',
 'Gray',
 'Brown',
 'Black',
 'White',
 'Tricolor',
 'Orange',
 'Beige',
 'Striped']

In [37]:
# Number of entries in `colors`.
# NOTE(review): `colors` comes from a cell that is not visible here -- confirm.
len(colors)

39

In [16]:
# Debug probe: check whether replacing "Gray Tick" with "Gray" changes the
# last entry of `colors`.
# NOTE(review): `colors` comes from a cell that is not visible here, and this
# cell has an older execution count than the cleaning function -- confirm it
# is still needed.
colors[-1].replace("Gray Tick", "Gray")

'Gray Point/Beige'

## Old unused code

In [17]:
#complex colors
# all_color = replace_colors(["Torbie", "Tabby", "Tortie", "Blue Striped", "Brown Striped", "Brindle", "Black Striped" ], "Striped", all_color["color"]) #preliminary striped

# def is_striped(color):
#     colors = ["Striped"]
#     color_list = " ".join(color.split("/")).split(" ")
#     for color in colors:
#         if color in color_list:
#             return True
#     return False


# all_color = all_color.replace(to_replace=['Black/Black Tabby','White/Black Tabby','Black Tabby/White','Black Tabby/Black', 'Brown/Black Tabby',
#                                          'Black Tabby/Orange', 'Black Tabby/Gray Tabby', 'Black Tabby/Gray', 'Black Tabby', 
#                                          'Black Brindle/White', 'Black Brindle', 'White/Black Brindle', 'Black Brindle/Brown', 'Black/Black Brindle',
#                                           'Black Brindle/Black', 'Brown/Black Brindle', 'Brown Tabby/Black Brindle', 'Black Brindle/Blue',
#                                           'Black Brindle/Blue Tick', 'Black Brindle/Brown Brindle', 'Brown Brindle/White',
#                                           'Brown Brindle', 'White/Brown Brindle', 'Black/Brown Brindle', 'Brown Brindle/Black', 'Tricolor/Brown Brindle',
#                                           'Brown Brindle/Tan', 'Brown Brindle/Brown', 'Blue/Brown Brindle', 'Brown/Brown Brindle',
#                                           'Brown Brindle/Red Tick', 'Fawn/Brown Brindle', 'Brown Brindle/Blue Tick',
#                                           'Chocolate/Brown Brindle', 'Tan/Brown Brindle', 'Blue Tick/Brown Brindle', 'Brown Striped', 'Brown Brindle/Blue',
#                                           'Brown Brindle/Blue Cream', 'Brown Brindle/Brown Brindle', 'Brown Brindle/Brown Merle',
#                                           'Brown Brindle/Gray', 'Black Brindle/Brown Brindle', 'Blue Merle/Brown Brindle',
#                                           'Yellow Brindle/White', 'Yellow Brindle', 'White/Yellow Brindle', 'Sable/Brown',
#                                           'Black/Yellow Brindle', 'Blue/Yellow Brindle', 'Tan/Yellow Brindle', 'Yellow Brindle/Blue', 'Torbie', 'Tabby'], value= 'Striped')