In [46]:
import pandas as pd
import numpy as np
import animalhelper as ah

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [78]:
df = pd.read_csv('./data/engineered.csv', index_col = 0, dtype={'intake_condition': str})

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67211 entries, 0 to 28302
Data columns (total 16 columns):
age                 67211 non-null int64
animal_id           67211 non-null object
animal_type         67211 non-null object
datetime_in         67211 non-null object
days_in_shelter     67211 non-null int64
dow                 67211 non-null int64
fixed_status        67211 non-null object
gender              67211 non-null int64
group               67211 non-null object
intake_condition    67211 non-null object
intake_season       67211 non-null object
intake_type         67211 non-null object
mix                 67211 non-null int64
name                67211 non-null int64
outcome_type        67211 non-null object
simple_color        67211 non-null object
dtypes: int64(6), object(10)
memory usage: 8.7+ MB


In [149]:
df_mod = pd.DataFrame()

In [150]:
df_mod[['Cat', 'Dog']] = pd.get_dummies(df['animal_type'])

In [151]:
df_mod['age'] = df['age']

In [152]:
df['fixed_status'].value_counts()

intact    45759
fixed     21451
Name: fixed_status, dtype: int64

## Impute unknown fixed status values

In [153]:
df['fixed_status'] = df['fixed_status'].apply(lambda x: ah.impute_fixed_status(x))

In [154]:
df_mod[['fixed', 'intact']] = pd.get_dummies(df['fixed_status'])

In [155]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67210 entries, 0 to 67209
Data columns (total 16 columns):
age                 67210 non-null int64
animal_id           67210 non-null object
animal_type         67210 non-null object
datetime_in         67210 non-null object
days_in_shelter     67210 non-null int64
dow                 67210 non-null int64
fixed_status        67210 non-null object
gender              67210 non-null int64
group               67210 non-null object
intake_condition    67210 non-null object
intake_season       67210 non-null object
intake_type         67210 non-null object
mix                 67210 non-null int64
name                67210 non-null int64
outcome_type        67210 non-null object
simple_color        67210 non-null object
dtypes: int64(6), object(10)
memory usage: 8.2+ MB


In [156]:
df_mod[['male', 'female']] = pd.get_dummies(df['gender'])

# Clean up groups and add to `df_mod`

In [157]:
df['group'].value_counts()

short hair      27058
Terrier          9635
Toy              7376
Sporting         6530
Herding          5897
Working          4604
Hound            2995
Non-Sporting     1870
long hair        1245
Name: group, dtype: int64

In [158]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67210 entries, 0 to 67209
Data columns (total 16 columns):
age                 67210 non-null int64
animal_id           67210 non-null object
animal_type         67210 non-null object
datetime_in         67210 non-null object
days_in_shelter     67210 non-null int64
dow                 67210 non-null int64
fixed_status        67210 non-null object
gender              67210 non-null int64
group               67210 non-null object
intake_condition    67210 non-null object
intake_season       67210 non-null object
intake_type         67210 non-null object
mix                 67210 non-null int64
name                67210 non-null int64
outcome_type        67210 non-null object
simple_color        67210 non-null object
dtypes: int64(6), object(10)
memory usage: 8.2+ MB


In [159]:
def clean_groups(s):
    if s == 'Terrier & Toy':
        return 'Terrier'
    return s

In [160]:
df['group'] = df['group'].apply(lambda x: clean_groups(x))

In [161]:
df = df[~(df['group'] == 'Unknown')]

In [162]:
df['group'].value_counts()

short hair      27058
Terrier          9635
Toy              7376
Sporting         6530
Herding          5897
Working          4604
Hound            2995
Non-Sporting     1870
long hair        1245
Name: group, dtype: int64

In [163]:
pd.get_dummies(df['group']).columns.values

array(['Herding', 'Hound', 'Non-Sporting', 'Sporting', 'Terrier', 'Toy',
       'Working', 'long hair', 'short hair'], dtype=object)

In [164]:
s = set(df_mod.columns.values)
m = set(df['group'].unique())

In [165]:
s.intersection(m)

set()

In [166]:
df['group'].value_counts()

short hair      27058
Terrier          9635
Toy              7376
Sporting         6530
Herding          5897
Working          4604
Hound            2995
Non-Sporting     1870
long hair        1245
Name: group, dtype: int64

In [167]:
pd.get_dummies(df['group']).columns.values

array(['Herding', 'Hound', 'Non-Sporting', 'Sporting', 'Terrier', 'Toy',
       'Working', 'long hair', 'short hair'], dtype=object)

In [168]:
df_mod.reset_index(inplace=True, drop=True)

In [169]:
pd.get_dummies(df['group']).shape

(67210, 9)

In [170]:
df.reset_index(inplace=True, drop=True)

In [171]:
df_mod.index.duplicated().sum()

0

In [172]:
df.index.duplicated().sum()

0

In [173]:
df_mod = pd.concat([df_mod, pd.get_dummies(df['group'])], axis=1)

## Add intake condition

In [174]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67210 entries, 0 to 67209
Data columns (total 16 columns):
age                 67210 non-null int64
animal_id           67210 non-null object
animal_type         67210 non-null object
datetime_in         67210 non-null object
days_in_shelter     67210 non-null int64
dow                 67210 non-null int64
fixed_status        67210 non-null object
gender              67210 non-null int64
group               67210 non-null object
intake_condition    67210 non-null object
intake_season       67210 non-null object
intake_type         67210 non-null object
mix                 67210 non-null int64
name                67210 non-null int64
outcome_type        67210 non-null object
simple_color        67210 non-null object
dtypes: int64(6), object(10)
memory usage: 8.2+ MB


In [175]:
df_mod = pd.concat([df_mod, pd.get_dummies(df['intake_condition'])], axis=1)

## Add intake season, intake type, name or not, and simple color

In [176]:
df_mod = pd.concat([df_mod, pd.get_dummies(df[['intake_condition', 'intake_season', 'intake_type', 'mix', 'name',
                                      'simple_color']])], axis=1)

In [195]:
df_mod = pd.concat([df_mod, pd.get_dummies(df['dow'])], axis=1)

In [198]:
df_mod.drop(labels=[0, 1, 2, 3, 4, 5, 6], axis=1, inplace=True)

In [202]:
df_mod.columns.values

array(['Cat', 'Dog', 'age', 'fixed', 'intact', 'male', 'female',
       'Herding', 'Hound', 'Non-Sporting', 'Sporting', 'Terrier', 'Toy',
       'Working', 'long hair', 'short hair', 'Aged', 'Feral', 'Injured',
       'Normal', 'Nursing', 'Other', 'Pregnant', 'Sick', 'mix', 'name',
       'intake_condition_Aged', 'intake_condition_Feral',
       'intake_condition_Injured', 'intake_condition_Normal',
       'intake_condition_Nursing', 'intake_condition_Other',
       'intake_condition_Pregnant', 'intake_condition_Sick',
       'intake_season_fall', 'intake_season_spring',
       'intake_season_summer', 'intake_season_winter',
       'intake_type_Euthanasia Request', 'intake_type_Owner Surrender',
       'intake_type_Public Assist', 'intake_type_Stray',
       'simple_color_Black', 'simple_color_Blue', 'simple_color_Brown',
       'simple_color_Gray', 'simple_color_Red', 'simple_color_Sable',
       'simple_color_Tricolor', 'simple_color_White',
       'simple_color_Yellow'], dtype=objec

In [203]:
df_mod = pd.concat([df_mod, pd.get_dummies(df['dow'])], axis=1)

In [204]:
df_mod = pd.concat([df_mod, df['outcome_type']], axis=1)

In [205]:
df_mod.head()

Unnamed: 0,Cat,Dog,age,fixed,intact,male,female,Herding,Hound,Non-Sporting,...,simple_color_White,simple_color_Yellow,0,1,2,3,4,5,6,outcome_type
0,0,1,2920,1,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,1,Return to Owner
1,0,1,330,0,1,1,0,0,1,0,...,0,0,0,0,0,1,0,0,0,Return to Owner
2,0,1,1460,1,0,1,0,0,0,0,...,0,1,0,0,0,0,0,0,1,Return to Owner
3,0,1,730,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,Return to Owner
4,0,1,730,0,1,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,Return to Owner


In [206]:
df_mod.to_csv('./data/df_mod.csv')

In [189]:
df.to_csv('./data/master_df_1128.csv')