In [50]:
import pandas as pd
import numpy as np
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

In [51]:
import scipy as sp
def logloss(act, pred):
    epsilon = 1e-15
    pred = sp.maximum(epsilon, pred)
    pred = sp.minimum(1-epsilon, pred)
    ll = sum(act*sp.log(pred) + sp.subtract(1,act)*sp.log(sp.subtract(1,pred)))
    ll = ll * -1.0/len(act)
    return ll

In [52]:
def generate_features(data):
    ###### RETIRANDO O OUTCOMESUBTYPE O NAME DO TREINO E TESTE ######
    data['has_name'] = data['Name'].apply(lambda x :  pd.notnull(x) )
    
    data['DateTime'] = data['DateTime'].apply(lambda x : pd.tslib.Timestamp(x, tz=None))
    data['day'] = data['DateTime'].apply(lambda x : x.day)
    data['month'] = data['DateTime'].apply(lambda x : x.month)
    data['year'] = data['DateTime'].apply(lambda x : x.year)
    data['hour'] = data['DateTime'].apply(lambda x : x.hour)
    data['minute'] = data['DateTime'].apply(lambda x : x.minute)
    data['weekday'] = data['DateTime'].apply(lambda x : x.weekday() < 5)
    data['working_hour'] = data['DateTime'].apply(lambda x : x.hour >= 8 and x.hour <= 18)
    data['madrugada'] = data['DateTime'].apply(lambda x : x.hour < 6)
    data['manha'] = data['DateTime'].apply(lambda x : x.hour > 6 and x.hour < 12)
    data['tarde'] = data['DateTime'].apply(lambda x : x.hour > 12 and x.hour < 18)
    data['noite'] = data['DateTime'].apply(lambda x : x.hour > 18)

    data.drop(['Name', 'DateTime'], axis=1, inplace=True)
    
    ###### TRATAMENTO DOS DADOS CATEGORICOS ######
    for col in ['AnimalType', 'SexuponOutcome', 'AgeuponOutcome', 'Breed', 'Color']:
        le = LabelEncoder().fit(np.append(train[col], test[col]))
        data[col] = le.transform(data[col])

In [53]:
###### LENDO O TREINO E TESTE ######
train = pd.read_csv('input/train.csv', index_col='AnimalID')
test = pd.read_csv('input/test.csv', index_col='ID')

train.drop('OutcomeSubtype', axis=1, inplace=True)
    
#train.head()

In [54]:

train.head()

Unnamed: 0_level_0,Name,DateTime,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A671945,Hambone,2014-02-12 18:22:00,Return_to_owner,Dog,Neutered Male,1 year,Shetland Sheepdog Mix,Brown/White
A656520,Emily,2013-10-13 12:44:00,Euthanasia,Cat,Spayed Female,1 year,Domestic Shorthair Mix,Cream Tabby
A686464,Pearce,2015-01-31 12:28:00,Adoption,Dog,Neutered Male,2 years,Pit Bull Mix,Blue/White
A683430,,2014-07-11 19:09:00,Transfer,Cat,Intact Male,3 weeks,Domestic Shorthair Mix,Blue Cream
A667013,,2013-11-15 12:52:00,Transfer,Dog,Neutered Male,2 years,Lhasa Apso/Miniature Poodle,Tan


In [55]:
generate_features(train)
generate_features(test)

In [56]:
train.head()

Unnamed: 0_level_0,OutcomeType,AnimalType,SexuponOutcome,AgeuponOutcome,Breed,Color,has_name,day,month,year,hour,minute,weekday,working_hour
AnimalID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
A671945,Return_to_owner,1,3,6,1482,146,True,12,2,2014,18,22,True,True
A656520,Euthanasia,0,4,6,775,184,True,13,10,2013,12,44,False,True
A686464,Adoption,1,3,22,1293,97,True,31,1,2015,12,28,False,True
A683430,Transfer,0,2,27,775,47,False,11,7,2014,19,9,True,False
A667013,Transfer,1,3,22,1101,311,False,15,11,2013,12,52,True,True


In [57]:
#print train.head(3)

###### SEPARA O TREINO EM DADOS DE TREINO E LISTA DE CLASSES ######
data   = train.drop('OutcomeType', axis=1)
target = train['OutcomeType']

In [58]:
###### ESCOLHA O SEU CLASSIFICADOR ######
#classifiers = [KNeighborsClassifier(3),KNeighborsClassifier(3),#SVC(probability=True),SVC(gamma=2, C=1),
#              GaussianNB(),QuadraticDiscriminantAnalysis(),DecisionTreeClassifier(),RandomForestClassifier(),
#             AdaBoostClassifier(),LinearDiscriminantAnalysis(),GradientBoostingClassifier()]

classifiers = [GradientBoostingClassifier()]
#clf = KNeighborsClassifier(3)
#clf = SVC(probability=True)
#clf = SVC(gamma=2, C=1)
#clf = GaussianNB()
#clf = QuadraticDiscriminantAnalysis()
#clf = DecisionTreeClassifier()
#clf = RandomForestClassifier()
#clf = AdaBoostClassifier()
#clf = LinearDiscriminantAnalysis()
#clf = GradientBoostingClassifier()

In [59]:
###### TREINANDO O CLASSIFICADOR ######
for clf in classifiers:
    fit = clf.fit(data, target)
    print "LogLoss:"
    print cross_val_score(clf, data, target, cv=3, scoring='log_loss', verbose=0)

LogLoss:
[-0.80066353 -0.79627435 -0.7929841 ]


LogLoss:
[-0.80012116 -0.79410543 -0.79238406]


In [41]:
###### EXECUTANDO VALIDACAO CRUZADA E IMPRIMINDO NO TERMINAL O VALOR DA METRICA LOG_LOSS ######

AnimalID
A671945    Return_to_owner
A656520         Euthanasia
A686464           Adoption
A683430           Transfer
A667013           Transfer
Name: OutcomeType, dtype: object

AnimalID
A671945    Return_to_owner
A656520         Euthanasia
A686464           Adoption
A683430           Transfer
A667013           Transfer
Name: OutcomeType, dtype: object

In [10]:
###### DESCOMENTE AS LINHAS ABAIXO PARA GERAR UMA PREDICAO PARA O KAGGLE ######
proba = fit.predict_proba(test)
ret = pd.DataFrame(proba, index=test.index, columns=fit.classes_)
ret.sort_index(inplace=True)
ret.to_csv('output/submission.csv', index_label="ID")