In [18]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [9]:
def convert_AgeuponOutcome_to_weeks(df):
    result = {}
    for k in df['AgeuponOutcome'].unique():
        if type(k) != type(""):
            result[k] = -1
        else:
            v1, v2 = k.split()
            if v2 in ["year", "years"]:
                result[k] = int(v1) * 52
            elif v2 in ["month", "months"]:
                result[k] = int(v1) * 4.5
            elif v2 in ["week", "weeks"]:
                result[k] = int(v1)
            elif v2 in ["day", "days"]:
                result[k] = int(v1) / 7
                
    df['_AgeuponOutcome'] = df['AgeuponOutcome'].map(result).astype(float)
    df = df.drop('AgeuponOutcome', axis = 1)
                
    return df

train_df = convert_AgeuponOutcome_to_weeks(train_df)
test_df = convert_AgeuponOutcome_to_weeks(test_df)
print("AgeuponOutcome conversion done.")

AgeuponOutcome conversion done.


In [10]:
   # add a column with a Name frequency count

names = pd.concat([test_df['Name'], train_df['Name']])
values = dict(names.value_counts())

train_df['_NameFreq'] = train_df['Name'].map(values)
test_df['_NameFreq'] = test_df['Name'].map(values)

train_df['_NameFreq'] = train_df['_NameFreq'].fillna(99999)
test_df['_NameFreq'] = test_df['_NameFreq'].fillna(99999)

print(test_df.info())
print(train_df.info())

print("Name frequency count done.")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11456 entries, 0 to 11455
Data columns (total 9 columns):
ID                 11456 non-null int64
Name               8231 non-null object
DateTime           11456 non-null object
AnimalType         11456 non-null object
SexuponOutcome     11456 non-null object
Breed              11456 non-null object
Color              11456 non-null object
_AgeuponOutcome    11456 non-null float64
_NameFreq          11456 non-null float64
dtypes: float64(2), int64(1), object(6)
memory usage: 805.6+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 11 columns):
AnimalID           26729 non-null object
Name               19038 non-null object
DateTime           26729 non-null object
OutcomeType        26729 non-null object
OutcomeSubtype     13117 non-null object
AnimalType         26729 non-null object
SexuponOutcome     26728 non-null object
Breed              26729 non-null object
Color            

In [11]:
# convert all of the remaining features to numeric values

def convert_to_numeric(df):
    for col in ['Name', 'AnimalType', 'SexuponOutcome',
                'Breed', 'Color', 'OutcomeType']:
        if col in df.columns:
            _col = "_%s" % (col)
            values = df[col].unique()
            _values = dict(zip(values, range(len(values))))
            df[_col] = df[col].map(_values).astype(int)
            df = df.drop(col, axis = 1)
    return df

train_df = convert_to_numeric(train_df)
test_df = convert_to_numeric(test_df)

print("Numerical conversion of features done.")

Numerical conversion of features done.


In [12]:
# fix the DateTime column

def fix_date_time(df):
    def extract_field(_df, start, stop):
        return _df['DateTime'].map(lambda dt: int(dt[start:stop]))
    df['Year'] = extract_field(df,0,4)
    df['Month'] = extract_field(df,5,7)
    df['Day'] = extract_field(df,8,10)
    df['Hour'] = extract_field(df,11,13)
    df['Minute'] = extract_field(df,14,16)
    
    return df.drop(['DateTime'], axis = 1)

train_df = fix_date_time(train_df)
test_df = fix_date_time(test_df)

print("DateTime column split into parts done.")

DateTime column split into parts done.


In [13]:
# re-index train_df so that ID is first and Target (_OutcomeType) is last

train_df = train_df.reindex(columns = ['AnimalID', '_Name', '_NameFreq',
                                       '_AnimalType', '_SexuponOutcome',
                                       '_AgeuponOutcome', '_Breed', '_Color',
                                       'Year', 'Month', 'Day', 'Hour', 'Minute',
                                       '_OutcomeType'])
print(train_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26729 entries, 0 to 26728
Data columns (total 14 columns):
AnimalID           26729 non-null object
_Name              26729 non-null int32
_NameFreq          26729 non-null float64
_AnimalType        26729 non-null int32
_SexuponOutcome    26729 non-null int32
_AgeuponOutcome    26729 non-null float64
_Breed             26729 non-null int32
_Color             26729 non-null int32
Year               26729 non-null int64
Month              26729 non-null int64
Day                26729 non-null int64
Hour               26729 non-null int64
Minute             26729 non-null int64
_OutcomeType       26729 non-null int32
dtypes: float64(2), int32(6), int64(5), object(1)
memory usage: 2.2+ MB
None


In [14]:
# split the data into a training set (80%) and a validation set (20%)

cut = int(len(train_df) * 0.8)
_validation_df = train_df[cut:]
_train_df = train_df[:cut]

print(len(_train_df))
print(len(_validation_df))

21383
5346


In [15]:
# build a classifier with scikit-learn

import sklearn
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

A1 = AdaBoostClassifier(DecisionTreeClassifier(max_depth = 2),
                        n_estimators = 100,
                        learning_rate = 0.1)

classifiers = [c.fit(_train_df.values[:,1:-1],
                     _train_df.values[:,-1].astype(int)) \
               for c in [A1]]
results = [c.predict_proba(_validation_df.values[:,1:-1]) \
           for c in classifiers]
print(results[0])

[[0.23575149 0.17671444 0.2567242  0.21744647 0.1133634 ]
 [0.24151578 0.18907436 0.24397104 0.2314309  0.09400792]
 [0.24163204 0.21602865 0.19585125 0.22252202 0.12396605]
 ...
 [0.23546703 0.18209391 0.23351246 0.22367143 0.12525517]
 [0.16315222 0.23452682 0.08883499 0.30012701 0.21335896]
 [0.142658   0.201988   0.14134818 0.29365645 0.22034937]]


In [16]:
# calculate the log loss of result

from sklearn.metrics import log_loss

print([log_loss(_validation_df.values[:,-1].astype(int), r) for r in results])

[1.3768400692976586]


In [17]:
# re-build the selected classifier on the entire training set

ab = classifiers[0].fit(train_df.values[:,1:-1],
                        train_df.values[:,-1].astype(int))

# and use the classifier on test_df

ab_result = ab.predict_proba(test_df.values[:,1:])
ab_sub_df = pd.DataFrame(ab_result, columns=['Adoption', 'Died', 'Euthanasia',
                                             'Return_to_owner', 'Transfer'])
ab_sub_df.insert(0, 'ID', test_df.values[:,0].astype(int))

print(ab_sub_df)

# write to submission files

ab_sub_df.to_csv("submission.csv", index = False)

print("Done.")

          ID  Adoption      Died  Euthanasia  Return_to_owner  Transfer
0          1  0.259431  0.151746    0.131017         0.330570  0.127235
1          2  0.254188  0.151699    0.145897         0.326564  0.121653
2          3  0.267932  0.141946    0.128941         0.341324  0.119857
3          4  0.254163  0.149507    0.145492         0.323777  0.127060
4          5  0.255446  0.150687    0.145860         0.325609  0.122398
5          6  0.240435  0.148808    0.133893         0.336211  0.140654
6          7  0.231780  0.156951    0.169878         0.301481  0.139910
7          8  0.203742  0.161135    0.178211         0.299555  0.157357
8          9  0.251536  0.150331    0.141563         0.320990  0.135580
9         10  0.280542  0.146306    0.136396         0.326609  0.110147
10        11  0.272173  0.146302    0.129851         0.319663  0.132011
11        12  0.241864  0.152204    0.133986         0.338223  0.133723
12        13  0.262780  0.144593    0.122912         0.332815  0