# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [3]:
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [4]:
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [5]:
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [6]:
original_df = pd.read_csv('../data/train.csv')

In [7]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed'],
      dtype='object')

In [8]:
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets...

In [9]:
# this function can be used to decode colors and breeds
def merge_labels(df, labels_df, df_id_col, label_id_col, label_name_col):
    df = df.merge(labels_df, left_on=df_id_col, right_on=label_id_col)
    df = df.rename(columns={label_name_col: "__TMP__"})
    df = df.drop([df_id_col, label_id_col], axis=1)
    return df.rename(columns={"__TMP__": df_id_col})

In [10]:
def transform_data(data_fname):
    df = pd.read_csv(data_fname)
    df = df.drop(["Description"], axis=1)
    df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
    df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
    df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
    df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
    df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
    df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
    df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
    df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
    df = merge_labels(df, color, "Color1", "ColorID", "ColorName")
    df = merge_labels(df, color, "Color2", "ColorID", "ColorName")
    df = merge_labels(df, color, "Color3", "ColorID", "ColorName")
    df = merge_labels(df, breed, "Breed1", "BreedID", "BreedName")
    df = merge_labels(df, breed, "Breed2", "BreedID", "BreedName")
    
    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    
    if 'AdoptionSpeed' in df.columns:
        y = df['AdoptionSpeed']
        X = df.drop('AdoptionSpeed', axis=1)
    else:
        y = None
        X = df

    return X, y

Load the data...

In [11]:
X, y = transform_data("../data/train.csv")

Create the model and evaluate it

In [12]:
# split training dataset into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TODO: accuracy?
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

results = pd.DataFrame(columns=('clf', 'best_acc'))

from sklearn.tree import DecisionTreeClassifier as DT
tree_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(0.5, 1, 2, 5, 7, 10, 15, 16, 17),
              'min_samples_split':(2, 3, 5, 10, 50, 100, 125, 140,150)}
tree = DT()
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=3, iid=False)
tree_clf.fit(X_train, y_train)
#y_pred_tree = tree_clf.predict(X_test)
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.37351959966638865
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=16, min_samples_split=140,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=16, min_samples_split=140,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


**And finally**, we predict the unknown label for the testing set

In [22]:
X.columns

Index(['Age', 'Quantity', 'Fee', 'State', 'Type_y', 'Type', 'Type_x_Cat',
       'Type_x_Dog', 'Gender_Female', 'Gender_Male',
       ...
       'Breed2_Terrier', 'Breed2_Tiger', 'Breed2_Tonkinese', 'Breed2_Torbie',
       'Breed2_Tortoiseshell', 'Breed2_Turkish Angora', 'Breed2_Turkish Van',
       'Breed2_Tuxedo', 'Breed2_Welsh Corgi',
       'Breed2_Yorkshire Terrier Yorkie'],
      dtype='object', length=211)

In [33]:
XX, _ = transform_data("../data/train.csv")

In [34]:
yy = results.clf.iloc[0].predict(XX)

In [17]:
pd.DataFrame(yy).to_csv("../data/submission.csv", header=False, index=False)

array([4, 1, 2, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4, 4, 4, 3, 3, 3, 4,
       4, 4, 4, 3, 4, 3, 4, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 2, 4, 1,

### pruebo usando randomforest

In [47]:
from sklearn.ensemble import RandomForestClassifier

In [48]:
results = pd.DataFrame(columns=('clf', 'best_acc'))

RFC_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(0.5, 1, 2, 3, 5, 7, 10, 15, 16, 17),
              'min_samples_split':(2, 3, 5, 10, 50, 100, 135, 136, 137, 138, 139, 140, 141),'random_state':(0, 2, 5)}
RFC = RandomForestClassifier()
RFC_clf = GridSearchCV(RFC, RFC_param, scoring='accuracy', cv=3, iid=False)
RFC_clf.fit(X_train, y_train)
#y_pred_tree = tree_clf.predict(X_test)
best_RFC_clf = RFC_clf.best_estimator_
print('Best Decision Tree accuracy: ', RFC_clf.best_score_)
print(best_RFC_clf)
results = results.append({'clf': best_RFC_clf, 'best_acc': RFC_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])



































































































































Best Decision Tree accuracy:  0.3830169824164245
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=136,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=2, verbose=0, warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=136,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=2, verbose=0, warm_start=False)




In [37]:
df

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description
0,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,I just found it alone yesterday near my apartm...
1,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,Their pregnant mother was dumped by her irresp...
2,1,0,307,0,2,1,2,7,2,1,2,2,2,1,6,0,41326,Siu Pak just give birth on 13/6/10 to 6puppies...
3,2,12,265,0,2,1,7,0,2,2,3,3,3,1,1,0,41326,"Very manja and gentle stray cat found, we woul..."
4,2,3,264,0,2,1,2,5,3,3,1,1,2,1,1,50,41326,Kali is a super playful kitten who is on the g...
5,1,2,307,0,1,2,5,6,2,3,1,1,2,1,1,0,41326,Peanut was an abused puppy until he was rescue...
6,2,36,285,251,1,3,0,0,3,2,1,1,1,1,1,0,41326,Garfield is a very large cat. Needs daily groo...
7,2,4,266,0,2,1,6,7,1,1,2,2,2,1,2,0,41326,Two gorgeous kittens have just lost their mumm...
8,2,3,266,0,2,1,7,0,1,1,1,2,2,1,1,1,41327,Open for adoption!!!
9,2,12,264,0,2,6,0,0,2,3,2,2,2,1,1,0,41326,email for more enquiry


In [41]:
df= pd.read_csv("../data/train.csv")
y = df['AdoptionSpeed']
X = df.drop(['Description','AdoptionSpeed'], axis=1)

In [42]:
df

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1
5,2,2,265,0,2,6,0,0,2,2,2,2,2,1,1,0,41326,"healthy and active, feisty kitten found in nei...",1
6,1,2,307,0,1,1,2,7,2,1,2,1,2,1,1,0,41401,"For serious adopter, please do sms or call for...",1
7,2,2,265,0,3,1,6,7,1,2,2,2,3,1,7,0,41326,Hi Pet Lovers! This is my first posting and I ...,1
8,1,3,307,0,2,2,5,7,2,2,3,3,3,1,1,0,41401,"Lost Dog Found (Bandar Menjalara, Kepong／Taman...",2
9,1,78,218,205,1,1,7,0,2,2,3,3,3,1,1,0,41326,We moved out of our apartment to a landed home...,4


In [45]:
# split training dataset into test and train
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# TODO: accuracy?
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

results = pd.DataFrame(columns=('clf', 'best_acc'))

from sklearn.tree import DecisionTreeClassifier as DT
tree_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(0.5, 1, 2, 5,6, 7,8, 10, 15, 16, 17),
              'min_samples_split':(2, 3, 5, 10, 50, 100, 125, 140,149,150,151)}
tree = DT()
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=3, iid=False)
tree_clf.fit(X_train, y_train)
#y_pred_tree = tree_clf.predict(X_test)
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.3611452849570241
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=149,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=149,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')


In [51]:
XX = pd.read_csv("../data/test.csv")
XX = XX.drop(['Description'], axis=1)
yy = results.clf.iloc[0].predict(XX)

In [52]:
XX['AdoptionSpeed']=yy

In [59]:
XX['PID']=XX.index+1

In [61]:
XX[['PID','AdoptionSpeed']].to_csv("../data/submission6.csv")

In [60]:
XX['PID']

0          1
1          2
2          3
3          4
4          5
5          6
6          7
7          8
8          9
9         10
10        11
11        12
12        13
13        14
14        15
15        16
16        17
17        18
18        19
19        20
20        21
21        22
22        23
23        24
24        25
25        26
26        27
27        28
28        29
29        30
        ... 
4381    4382
4382    4383
4383    4384
4384    4385
4385    4386
4386    4387
4387    4388
4388    4389
4389    4390
4390    4391
4391    4392
4392    4393
4393    4394
4394    4395
4395    4396
4396    4397
4397    4398
4398    4399
4399    4400
4400    4401
4401    4402
4402    4403
4403    4404
4404    4405
4405    4406
4406    4407
4407    4408
4408    4409
4409    4410
4410    4411
Name: PID, Length: 4411, dtype: int64