In [81]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
import xgboost as xgb

In [82]:
import scipy as sp
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

In [83]:
def getAge(x):
    if pd.notnull(x):
        value = int(x.split(' ')[0])
        if x.endswith('day') or x.endswith('days'):
            return value
        if x.endswith('weeks') or x.endswith('week'):
            return value*7
        if x.endswith('month') or x.endswith('months'):
            return value*30
        if x.endswith('year') or x.endswith('years'):
            return value*365
    return -1

def cleanBreed(x):
    x = x.replace('Mix','')
    x = x.replace('Shorthair','')
    x = x.replace('Medium Hair','')
    x = x.replace('Longhair','')
    x = x.replace('  ',' ')
    x = x.replace('   ',' ')
    x = x.replace('    ',' ')
    x = x.strip()
    
    return x

def cleanColor(x):
    x = x.replace('Tabby','')
    x = x.replace('  ',' ')
    x = x.replace('   ',' ')
    x = x.replace('    ',' ')
    x = x.strip()
    
    return x

In [84]:
def generate_features(data):
    ###### RETIRANDO O OUTCOMESUBTYPE O NAME DO TREINO E TESTE ######
    data['has_name'] = data['Name'].apply(lambda x :  pd.notnull(x) )
    
    #Date
    data['DateTime'] = data['DateTime'].apply(lambda x : pd.tslib.Timestamp(x, tz=None))
    data['day'] = data['DateTime'].apply(lambda x : x.day)
    data['month'] = data['DateTime'].apply(lambda x : x.month)
    data['year'] = data['DateTime'].apply(lambda x : x.year)
    data['hour'] = data['DateTime'].apply(lambda x : x.hour)
    data['minute'] = data['DateTime'].apply(lambda x : x.minute)
    data['weekday'] = data['DateTime'].apply(lambda x : x.weekday() < 5)
    data['working_hour'] = data['DateTime'].apply(lambda x : x.hour >= 8 and x.hour <= 18)
    data['madrugada'] = data['DateTime'].apply(lambda x : x.hour < 6)
    data['manha'] = data['DateTime'].apply(lambda x : x.hour >= 6 and x.hour < 12)
    data['tarde'] = data['DateTime'].apply(lambda x : x.hour >= 12 and x.hour < 18)
    data['noite'] = data['DateTime'].apply(lambda x : x.hour >= 18)
    
    #Age
    data['age'] = data['AgeuponOutcome'].apply(lambda x : getAge(x))
    data['puppy'] = data['age'].apply(lambda x : x <= 365)
    
    #Sex
    data['male'] = data['SexuponOutcome'].apply(lambda x : pd.notnull(x) and x.endswith('Male'))
    data['female'] = data['SexuponOutcome'].apply(lambda x : pd.notnull(x) and x.endswith('Female'))
    data['intact'] = data['SexuponOutcome'].apply(lambda x : pd.notnull(x) and x.startswith('Intact'))
    data['spayed'] = data['SexuponOutcome'].apply(lambda x : pd.notnull(x) and x.startswith('Spayed'))
    data['neutered'] = data['SexuponOutcome'].apply(lambda x : pd.notnull(x) and x.startswith('Neutered'))
    
    #Breed
    data['mix'] = data['Breed'].apply(lambda x : pd.notnull(x) and x.endswith('Mix'))
    data['shorthair'] = data['Breed'].apply(lambda x : pd.notnull(x) and 'Shorthair' in x)
    data['mediumhair'] = data['Breed'].apply(lambda x : pd.notnull(x) and 'Medium Hair' in x)
    data['longhair'] = data['Breed'].apply(lambda x : pd.notnull(x) and 'Longhair' in x)
    
    data['Breed'] = data['Breed'].apply(lambda x : cleanBreed(x))
    
    #Color
    data['tabby'] = data['Color'].apply(lambda x : pd.notnull(x) and 'Tabby' in x)
    data['Color'] = data['Color'].apply(lambda x : cleanColor(x))

    data.drop(['Name', 'DateTime','AgeuponOutcome', 'SexuponOutcome'], axis=1, inplace=True)
    
    ###### TRATAMENTO DOS DADOS CATEGORICOS ######
    for col in ['AnimalType', 'Breed', 'Color']:
        le = LabelEncoder().fit(np.append(train[col], test[col]))
        data[col] = le.transform(data[col])

In [85]:
###### LENDO O TREINO E TESTE ######
train = pd.read_csv('input/train.csv', index_col='AnimalID')
test = pd.read_csv('input/test.csv', index_col='ID')

train.drop('OutcomeSubtype', axis=1, inplace=True)
    
pd.Series.unique(train['SexuponOutcome'])

array(['Neutered Male', 'Spayed Female', 'Intact Male', 'Intact Female',
       'Unknown', nan], dtype=object)

In [86]:

train.head()

Unnamed: 0_level_0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
A656520,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
A686464,Pearce,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
A683430,,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
A667013,,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [87]:
generate_features(train)
generate_features(test)

In [88]:
train.head()
pd.Series.unique(train['Breed']).size

1220

In [89]:
#print train.head(3)

###### SEPARA O TREINO EM DADOS DE TREINO E LISTA DE CLASSES ######
data   = train.drop('OutcomeType', axis=1)
target = train['OutcomeType']

In [311]:
###### ESCOLHA O SEU CLASSIFICADOR ######
#classifiers = [KNeighborsClassifier(3),KNeighborsClassifier(3),#SVC(probability=True),SVC(gamma=2, C=1),
#              GaussianNB(),QuadraticDiscriminantAnalysis(),DecisionTreeClassifier(),RandomForestClassifier(),
#             AdaBoostClassifier(),LinearDiscriminantAnalysis(),GradientBoostingClassifier()]

#classifiers = [GradientBoostingClassifier()]
#clf = KNeighborsClassifier(3)
#clf = SVC(probability=True)
#clf = SVC(gamma=2, C=1)
#clf = GaussianNB()
#clf = QuadraticDiscriminantAnalysis()
#clf = DecisionTreeClassifier()
#clf = RandomForestClassifier()
#clf = AdaBoostClassifier()
#clf = LinearDiscriminantAnalysis()
clf = GradientBoostingClassifier()

In [312]:
###### TREINANDO O CLASSIFICADOR ######
for clf in classifiers:
    fit = clf.fit(data, target)
    print "LogLoss:"
    print cross_val_score(clf, data, target, cv=3, scoring='log_loss', verbose=0)

LogLoss:
[-8.9929701  -8.69860016 -8.79507247]
LogLoss:
[-8.9929701  -8.69860016 -8.79507247]
LogLoss:
[-2.15186923 -2.23321457 -2.17262553]




LogLoss:
[-9.65249037 -9.50242782 -9.82973787]
LogLoss:
[-14.19608819 -14.07522975 -13.72709874]
LogLoss:
[-2.36093432 -2.29458281 -2.32137058]
LogLoss:
[-1.54899188 -1.55057213 -1.54709061]
LogLoss:




[-0.89779906 -0.88673281 -0.8776896 ]
LogLoss:
[-0.78034615 -0.77351597 -0.76792184]


LogLoss:
[-0.77963995 -0.77292823 -0.76704907]

In [313]:
###### DESCOMENTE AS LINHAS ABAIXO PARA GERAR UMA PREDICAO PARA O KAGGLE ######
proba = fit.predict_proba(test)
ret = pd.DataFrame(proba, index=test.index, columns=fit.classes_)
ret.sort_index(inplace=True)
ret.to_csv('output/submission.csv', index_label="ID")

In [90]:
gbm = xgb.XGBClassifier(max_depth=4, n_estimators=300, learning_rate=0.05).fit(data, target,eval_metric='logloss')
#fit = gbm.fit(data, target)
print "LogLoss:"
print cross_val_score(gbm, data, target, cv=3, scoring='log_loss', verbose=0)

LogLoss:
[-0.76862646 -0.76030176 -0.75771909]


In [91]:
ret = pd.DataFrame(predictions, index=test.index, columns=gbm.classes_)
ret.sort_index(inplace=True)
ret.to_csv('output/submission.csv', index_label="ID")

[[ 0.04017166  0.00329722  0.0689159   0.37033021  0.51728505]
 [ 0.75636011  0.00110187  0.01420486  0.17825007  0.05008303]
 [ 0.39472622  0.00170643  0.02629204  0.34987342  0.22740184]
 ..., 
 [ 0.00483217  0.003995    0.04735428  0.00217178  0.94164681]
 [ 0.51488054  0.00117382  0.0360693   0.40046349  0.04741289]
 [ 0.07213571  0.00438254  0.18296948  0.65546978  0.08504251]]
