In [148]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sn

#Scalers
from sklearn.preprocessing import StandardScaler

In [149]:
# Read the raw training set from the working directory into `df`
df = pd.read_csv(filepath_or_buffer='train.csv')

In [150]:
# Preview the first rows of the freshly loaded frame (head() defaults to five rows)
df.head()

Unnamed: 0,AnimalID,Name,DateTime,OutcomeType,OutcomeSubtype,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
0,A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
1,A656520,Emily,2013-10-13 12:44:00,Euthanasia,Suffering,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
2,A686464,Pearce,2015-01-31 12:28:00,Adoption,Foster,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
3,A683430,,2014-07-11 19:09:00,Transfer,Partner,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
4,A667013,,2013-11-15 12:52:00,Transfer,Partner,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [151]:
# (number of rows, number of columns) of the raw frame
df.shape

(26729, 10)

In [152]:
# drop OutcomeSubtype - not relevant for the model, the same as Name and DateTime
# df = df.drop(['OutcomeSubtype'], axis=1)

# DATA CLEANING

## Missing Data



In [153]:
# Missing-value report: absolute count and share per column,
# with the most incomplete columns listed first.
total = df.isna().sum().sort_values(ascending=False)
# Every column has len(df) entries, so the share is count / row total.
percent = df.isna().sum().div(len(df)).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent.mul(100)],
    axis=1,
    keys=['Total', 'Percent(%)'],
)
missing_data.head(10)

Unnamed: 0,Total,Percent(%)
OutcomeSubtype,13612,50.925961
Name,7691,28.773991
AgeuponOutcome,18,0.067343
SexuponOutcome,1,0.003741
AnimalID,0,0.0
DateTime,0,0.0
OutcomeType,0,0.0
AnimalType,0,0.0
Breed,0,0.0
Color,0,0.0


In [154]:
# Keep only rows where both the age and the sex field are present
# (a row is removed if either of the two is missing).
df = df.dropna(subset=['AgeuponOutcome', 'SexuponOutcome'])

## AnimalType

In [155]:
# Binary-encode the species: 'Dog' -> 0, 'Cat' -> 1 (other values are left untouched)
df['AnimalType'] = df['AnimalType'].replace(['Dog', 'Cat'], [0, 1])

## SexuponOutcome

In [156]:
# One indicator column per distinct SexuponOutcome value
temp = pd.get_dummies(df['SexuponOutcome'])
# Append the indicators, remove the source column, and give the bare
# 'Unknown' indicator a self-explanatory name.
df = (
    pd.concat([df, temp], axis=1)
    .drop(columns='SexuponOutcome')
    .rename(columns={'Unknown': 'Unknown Sex'})
)

## AgeuponOutcome

In [157]:
# Lookup table from the raw AgeuponOutcome label to an age in months.
# Conventions used by the table:
#   * years  -> 12 months each
#   * months -> taken as is
#   * weeks  -> 0.25 month each (so '4 weeks' == '1 month')
#   * days   -> rounded down to 0 months
# Both '1 week' and '1 weeks' are listed as keys.
age_dict = {
    # days
    '1 day': 0, '2 days': 0, '3 days': 0, '4 days': 0, '5 days': 0, '6 days': 0,
    # weeks
    '1 week': 0.25, '1 weeks': 0.25, '2 weeks': 0.5, '3 weeks': 0.75,
    '4 weeks': 1, '5 weeks': 1.25,
    # months
    '1 month': 1, '2 months': 2, '3 months': 3, '4 months': 4, '5 months': 5,
    '6 months': 6, '7 months': 7, '8 months': 8, '9 months': 9, '10 months': 10,
    '11 months': 11,
    # years
    '0 years': 0, '1 year': 12, '2 years': 24, '3 years': 36, '4 years': 48,
    '5 years': 60, '6 years': 72, '7 years': 84,
    '8 years': 96,  # was 12, which made 8-year-old animals look 1 year old
    '9 years': 108, '10 years': 120, '11 years': 132, '12 years': 144,
    '13 years': 156, '14 years': 168, '15 years': 180, '16 years': 192,
    '17 years': 204, '18 years': 216, '19 years': 228, '20 years': 240,
}

# Replace each age label with its value in months from age_dict;
# labels that are not keys of age_dict come out as NaN.
df = df.assign(AgeuponOutcome=df['AgeuponOutcome'].map(age_dict))

## Breed

In [158]:
# Create the new MixOrNot column: 0 when the breed label contains 'Mix', 1 otherwise.
# NOTE(review): labels written as 'A/B' without the word 'Mix' are therefore
# flagged 1 - confirm this is the intended treatment of such breeds.
df['MixOrNot'] = df['Breed'].map(lambda breed: int('Mix' not in breed))

## Color

In [159]:
# Distinct colour labels present in the data
unique_colors = df['Color'].unique()
# A label such as 'Brown/White' lists its colours separated by '/', so the
# number of colours is the number of separators plus one.
df['numberOfColor'] = df['Color'].str.count('/') + 1

## Dropping

In [160]:
# Remove the columns that are not kept as model inputs
# (Breed and Color have already been turned into MixOrNot / numberOfColor).
df = df.drop(columns=['Name', 'OutcomeSubtype', 'DateTime', 'Breed', 'Color'])

## Data after cleaning


In [161]:
# Preview the first rows of the cleaned, encoded frame
df.head()

Unnamed: 0,AnimalID,OutcomeType,AnimalType,AgeuponOutcome,Intact Female,Intact Male,Neutered Male,Spayed Female,Unknown Sex,MixOrNot,numberOfColor
0,A671945,Return_to_owner,0,12.0,0,0,1,0,0,0,2
1,A656520,Euthanasia,1,12.0,0,0,0,1,0,0,1
2,A686464,Adoption,0,24.0,0,0,1,0,0,0,2
3,A683430,Transfer,1,0.75,0,1,0,0,0,0,1
4,A667013,Transfer,0,24.0,0,0,1,0,0,1,1
