# **LIBRERÍAS**

In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides any deprecation or
# chained-assignment warnings that later cells would otherwise emit.
warnings.filterwarnings('ignore')

# Import libraries
import pandas as pd
import datetime as dt
import numpy as np

In [2]:
# Load the dataset from its data.world query URL into `df`, then preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='https://query.data.world/s/hemljxoec4fnxuefoccoywvohivnla'
)
df.head(n=5)

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A771830,*Bradley,12/27/2018 01:25:00 PM,12/27/2018 01:25:00 PM,05/10/2017,Adoption,Foster,Dog,Neutered Male,1 year,Pit Bull Mix,White/Blue
1,A779576,*Rajah,10/01/2018 05:03:00 PM,10/01/2018 05:03:00 PM,04/18/2018,Adoption,,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Brown Tabby
2,A741715,*Pebbles,01/11/2017 06:17:00 PM,01/11/2017 06:17:00 PM,03/07/2016,Adoption,,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
3,A658751,Benji,11/13/2016 01:38:00 PM,11/13/2016 01:38:00 PM,07/14/2011,Return to Owner,,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
4,A721285,,02/24/2016 02:42:00 PM,02/24/2016 02:42:00 PM,02/24/2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray


# Verificación y limpieza de datos

In [3]:
# Describe the object-dtype (text) columns; the `top` and `freq` rows give the
# most frequent value of each column and how many times it occurs.
# The builtin `object` is used because the `np.object` alias was removed in
# NumPy 1.24 and raises AttributeError there.
df.describe(include=[object])

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
count,95367,65938,95367,95367,95367,95360,43280,95367,95365,95363,95367,95367
unique,85672,16383,78742,78742,6312,9,20,5,5,48,2348,567
top,A721033,Bella,04/18/2016 12:00:00 AM,04/18/2016 12:00:00 AM,09/01/2015,Adoption,Partner,Dog,Neutered Male,1 year,Domestic Shorthair Mix,Black/White
freq,24,419,39,39,118,41332,23549,54327,33767,17559,28334,9914


In [4]:
# Count the missing values in each column, largest count first.
(
    df.isna()
      .sum()
      .sort_values(ascending=False)
)

Outcome Subtype     52087
Name                29429
Outcome Type            7
Age upon Outcome        4
Sex upon Outcome        2
Animal ID               0
DateTime                0
MonthYear               0
Date of Birth           0
Animal Type             0
Breed                   0
Color                   0
dtype: int64

In [5]:
# Imputation: fill the missing values of each affected column with a fixed value.
# Most fill values are the column's most frequent value as reported by describe().
# NOTE(review): 'Cockie' is a placeholder name, not the most frequent one
# (describe() reports 'Bella'); it is kept unchanged here.
valores_imputacion = {
    'Name': 'Cockie',
    # The previous literal carried a trailing tab ('Adoption\t'), which created a
    # separate category instead of matching the existing 'Adoption' value.
    'Outcome Type': 'Adoption',
    'Outcome Subtype': 'Partner',
    'Sex upon Outcome': 'Neutered Male',
    'Age upon Outcome': '1 year',
}
# A single frame-level fillna replaces the per-column `replace(..., inplace=True)`
# calls, which mutate a temporary column object and are not guaranteed to write
# back to `df` (they are a no-op under pandas copy-on-write).
df = df.fillna(value=valores_imputacion)

df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A771830,*Bradley,12/27/2018 01:25:00 PM,12/27/2018 01:25:00 PM,05/10/2017,Adoption,Foster,Dog,Neutered Male,1 year,Pit Bull Mix,White/Blue
1,A779576,*Rajah,10/01/2018 05:03:00 PM,10/01/2018 05:03:00 PM,04/18/2018,Adoption,Partner,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Brown Tabby
2,A741715,*Pebbles,01/11/2017 06:17:00 PM,01/11/2017 06:17:00 PM,03/07/2016,Adoption,Partner,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
3,A658751,Benji,11/13/2016 01:38:00 PM,11/13/2016 01:38:00 PM,07/14/2011,Return to Owner,Partner,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
4,A721285,Cockie,02/24/2016 02:42:00 PM,02/24/2016 02:42:00 PM,02/24/2014,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray


In [6]:
# Re-check for missing values after imputation (per-column counts, sorted descending).
conteo_nulos = df.isna().sum()
conteo_nulos.sort_values(ascending=False)

Animal ID           0
Name                0
DateTime            0
MonthYear           0
Date of Birth       0
Outcome Type        0
Outcome Subtype     0
Animal Type         0
Sex upon Outcome    0
Age upon Outcome    0
Breed               0
Color               0
dtype: int64

In [7]:
# Check the data type of every column that will be worked with.
df.dtypes 

Animal ID           object
Name                object
DateTime            object
MonthYear           object
Date of Birth       object
Outcome Type        object
Outcome Subtype     object
Animal Type         object
Sex upon Outcome    object
Age upon Outcome    object
Breed               object
Color               object
dtype: object

In [8]:
# Date handling: parse the 'Date of Birth' strings (month/day/year) into datetimes.
df['Date of Birth'] = df['Date of Birth'].pipe(pd.to_datetime, format="%m/%d/%Y")

In [9]:
# Today's date converted to a pandas timestamp so it can be compared with the column.
fecha_actual = pd.to_datetime(dt.date.today(), format="%Y/%m/%d")
# Validation: show any rows whose birth date is later than today.
df.loc[df['Date of Birth'].gt(fecha_actual)]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


In [10]:
# Re-inspect the column dtypes after converting 'Date of Birth'.
df.dtypes

Animal ID                   object
Name                        object
DateTime                    object
MonthYear                   object
Date of Birth       datetime64[ns]
Outcome Type                object
Outcome Subtype             object
Animal Type                 object
Sex upon Outcome            object
Age upon Outcome            object
Breed                       object
Color                       object
dtype: object

In [11]:
# Convert 'MonthYear' from object (string) to datetime.
# The values look like '12/27/2018 01:25:00 PM', so the format is given explicitly
# (12-hour clock with AM/PM). This replaces `infer_datetime_format=True`, which is
# deprecated in pandas 2.x, and makes the parse deterministic.
df['MonthYear'] = pd.to_datetime(df['MonthYear'], format="%m/%d/%Y %I:%M:%S %p")

In [12]:
# Preview the first rows after the date conversions.
df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
0,A771830,*Bradley,12/27/2018 01:25:00 PM,2018-12-27 13:25:00,2017-05-10,Adoption,Foster,Dog,Neutered Male,1 year,Pit Bull Mix,White/Blue
1,A779576,*Rajah,10/01/2018 05:03:00 PM,2018-10-01 17:03:00,2018-04-18,Adoption,Partner,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Brown Tabby
2,A741715,*Pebbles,01/11/2017 06:17:00 PM,2017-01-11 18:17:00,2016-03-07,Adoption,Partner,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico
3,A658751,Benji,11/13/2016 01:38:00 PM,2016-11-13 13:38:00,2011-07-14,Return to Owner,Partner,Dog,Neutered Male,5 years,Border Terrier Mix,Tan
4,A721285,Cockie,02/24/2016 02:42:00 PM,2016-02-24 14:42:00,2014-02-24,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray


In [13]:
# Confirm that the dtype changes were applied.
df.dtypes

Animal ID                   object
Name                        object
DateTime                    object
MonthYear           datetime64[ns]
Date of Birth       datetime64[ns]
Outcome Type                object
Outcome Subtype             object
Animal Type                 object
Sex upon Outcome            object
Age upon Outcome            object
Breed                       object
Color                       object
dtype: object

In [14]:
# List the distinct values found in the 'Age upon Outcome' column.
df.loc[:, 'Age upon Outcome'].unique()

array(['1 year', '5 months', '10 months', '5 years', '2 years', '1 month',
       '3 years', '8 months', '3 weeks', '9 years', '2 weeks', '4 months',
       '2 months', '6 years', '10 years', '13 years', '4 years',
       '6 months', '3 months', '11 years', '1 week', '9 months',
       '4 weeks', '11 months', '8 years', '7 years', '12 years',
       '7 months', '1 weeks', '3 days', '5 days', '14 years', '2 days',
       '5 weeks', '4 days', '15 years', '16 years', '17 years', '6 days',
       '1 day', '18 years', '0 years', '24 years', '19 years', '20 years',
       '22 years', '25 years', '-1 years'], dtype=object)

In [15]:
# Look up the rows whose age is recorded as '-1 years'.
df[df['Age upon Outcome'].eq('-1 years')]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color
89541,A753893,Chato,07/02/2015 11:06:00 AM,2015-07-02 11:06:00,2016-07-12,Transfer,Partner,Dog,Intact Male,-1 years,American Bulldog Mix,White/Brown


In [16]:
# Remove every row whose age is recorded as '-1 years'.
# The rows are located by value instead of by the hard-coded index label 89541,
# so the cell keeps working if the source data changes and can be re-run without
# raising KeyError (dropping an empty index is a no-op).
filas_invalidas = df.index[df['Age upon Outcome'] == '-1 years']
df = df.drop(index=filas_invalidas)

In [17]:

# Verify the removal: searching again for '-1 years' should return no rows.
df[df['Age upon Outcome'].isin(['-1 years'])]

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color


# Aplicar algún método de ingeniería de características que sea adecuado para el proceso de transformación de sus datos (por ejemplo: standardization, normalization, etc.)

In [18]:
# Add an extra column that keeps only the pet's main (first listed) color.
def agregaColorPrin():
  """Create or overwrite df['Main Color'] with the text before the first '/' in df['Color'].

  Operates on the notebook-global `df`. A color without '/' is kept whole
  (e.g. 'Brown Tabby'); 'White/Blue' becomes 'White'.

  The vectorized string accessor replaces a per-row Python loop that wrote
  through chained assignment (df[col][i] = ...), which is slow and does not
  update `df` under pandas copy-on-write. The former `global txt_tmp` scratch
  variable is no longer created.
  """
  # n=1 splits only at the first '/', which is all that is needed for element 0.
  df['Main Color'] = df['Color'].str.split('/', n=1).str[0]

agregaColorPrin()

In [19]:
# Check the result: preview the first rows, now including 'Main Color'.
df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Main Color
0,A771830,*Bradley,12/27/2018 01:25:00 PM,2018-12-27 13:25:00,2017-05-10,Adoption,Foster,Dog,Neutered Male,1 year,Pit Bull Mix,White/Blue,White
1,A779576,*Rajah,10/01/2018 05:03:00 PM,2018-10-01 17:03:00,2018-04-18,Adoption,Partner,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Brown Tabby,Brown Tabby
2,A741715,*Pebbles,01/11/2017 06:17:00 PM,2017-01-11 18:17:00,2016-03-07,Adoption,Partner,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico,Calico
3,A658751,Benji,11/13/2016 01:38:00 PM,2016-11-13 13:38:00,2011-07-14,Return to Owner,Partner,Dog,Neutered Male,5 years,Border Terrier Mix,Tan,Tan
4,A721285,Cockie,02/24/2016 02:42:00 PM,2016-02-24 14:42:00,2014-02-24,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray,Black


In [20]:
# Per-row difference between the 'MonthYear' timestamp and 'Date of Birth'.
tmp_date = df['MonthYear'].sub(df['Date of Birth'])
# Add the 'Days of stay' column, initialised to 0 until the day counts are filled in.
df['Days of stay'] = 0

In [21]:
# Fill 'Days of stay' with the whole-day component of each timedelta in tmp_date.
# NOTE(review): tmp_date is 'MonthYear' minus 'Date of Birth', so the value is the
# number of days elapsed since birth rather than time spent in the shelter —
# confirm the intended meaning of this column.
# The `.dt.days` accessor replaces a per-row loop that used chained assignment
# (df[col][i] = ...) and parsed the day count out of str(timedelta).
df['Days of stay'] = tmp_date.dt.days

In [22]:
# Preview the first rows, now including 'Days of stay'.
df.head()

Unnamed: 0,Animal ID,Name,DateTime,MonthYear,Date of Birth,Outcome Type,Outcome Subtype,Animal Type,Sex upon Outcome,Age upon Outcome,Breed,Color,Main Color,Days of stay
0,A771830,*Bradley,12/27/2018 01:25:00 PM,2018-12-27 13:25:00,2017-05-10,Adoption,Foster,Dog,Neutered Male,1 year,Pit Bull Mix,White/Blue,White,596
1,A779576,*Rajah,10/01/2018 05:03:00 PM,2018-10-01 17:03:00,2018-04-18,Adoption,Partner,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Brown Tabby,Brown Tabby,166
2,A741715,*Pebbles,01/11/2017 06:17:00 PM,2017-01-11 18:17:00,2016-03-07,Adoption,Partner,Cat,Spayed Female,10 months,Domestic Shorthair Mix,Calico,Calico,310
3,A658751,Benji,11/13/2016 01:38:00 PM,2016-11-13 13:38:00,2011-07-14,Return to Owner,Partner,Dog,Neutered Male,5 years,Border Terrier Mix,Tan,Tan,1949
4,A721285,Cockie,02/24/2016 02:42:00 PM,2016-02-24 14:42:00,2014-02-24,Euthanasia,Suffering,Other,Unknown,2 years,Raccoon Mix,Black/Gray,Black,730


# Hallar X e Y (selección de características)

In [23]:
# Feature matrix: the four categorical columns used as predictors.
X = df[['Animal Type', 'Sex upon Outcome', 'Breed', 'Main Color']]
# Target vector: the 'Days of stay' column.
Y = df.loc[:, 'Days of stay']

# Dividir el dataset en entrenamiento y prueba

In [24]:
# Split the data: 70% of the rows go to training and the rest to testing.
# Rows are shuffled, and the fixed seed makes the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,  # selected feature columns
    Y,  # target column
    shuffle=True,
    random_state=1234,
    train_size=0.7,
)

In [33]:
# Training features: show the last rows of X_train.
X_train.tail()

Unnamed: 0,Animal Type,Sex upon Outcome,Breed,Main Color
89460,Dog,Spayed Female,Staffordshire Mix,Brown Brindle
60620,Dog,Neutered Male,Queensland Heeler Mix,White
34086,Cat,Neutered Male,Domestic Shorthair Mix,Orange Tabby
58067,Dog,Neutered Male,Australian Cattle Dog Mix,Red
92976,Dog,Spayed Female,Border Collie/Bernese Mountain Dog,Tricolor


In [35]:
# Training target: show the last rows of y_train (the 'Days of stay' values that
# belong to the training split, not the test features).
y_train.tail()

89460    1827
60620     829
34086     141
58067    1462
92976    3119
Name: Days of stay, dtype: int64