# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [3]:
breed.head()

Unnamed: 0,BreedID,Type,BreedName
0,1,1,Affenpinscher
1,2,1,Afghan Hound
2,3,1,Airedale Terrier
3,4,1,Akbash
4,5,1,Akita


In [4]:
color.head()

Unnamed: 0,ColorID,ColorName
0,1,Black
1,2,Brown
2,3,Golden
3,4,Yellow
4,5,Cream


In [5]:
state

Unnamed: 0,StateID,StateName
0,41336,Johor
1,41325,Kedah
2,41367,Kelantan
3,41401,Kuala Lumpur
4,41415,Labuan
5,41324,Melaka
6,41332,Negeri Sembilan
7,41335,Pahang
8,41330,Perak
9,41380,Perlis


And now we are ready to deal with the *original* dataset...

In [6]:
original_df = pd.read_csv('../data/train.csv')

In [7]:
original_df.columns

Index(['Type', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'Description',
       'AdoptionSpeed', 'PID'],
      dtype='object')

In [8]:
original_df.describe()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,AdoptionSpeed,PID
count,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0,10582.0
mean,1.454734,10.520412,265.469854,74.388868,1.779059,2.230675,3.236912,1.856738,1.860518,1.460971,1.72973,1.566528,1.912115,1.036666,1.584011,20.80996,41345.994613,2.5189,7477.025799
std,0.49797,18.374027,60.12149,123.43401,0.684763,1.743985,2.748595,2.974465,0.547535,0.593843,0.670791,0.701482,0.564041,0.198228,1.488348,78.397243,32.409109,1.176018,4310.921553
min,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,41324.0,0.0,0.0
25%,1.0,2.0,265.0,0.0,1.0,1.0,0.0,0.0,2.0,1.0,1.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,3768.25
50%,1.0,3.0,266.0,0.0,2.0,2.0,2.0,0.0,2.0,1.0,2.0,1.0,2.0,1.0,1.0,0.0,41326.0,2.0,7473.5
75%,2.0,12.0,307.0,188.0,2.0,3.0,6.0,5.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,0.0,41401.0,4.0,11200.75
max,2.0,255.0,307.0,307.0,3.0,7.0,7.0,7.0,4.0,3.0,3.0,3.0,3.0,3.0,20.0,3000.0,41415.0,4.0,14992.0


In [9]:
original_df.head()

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,Description,AdoptionSpeed,PID
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,Nibble is a 3+ month old ball of cuteness. He ...,2,0
1,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,"Good guard dog, very alert, active, obedience ...",2,3
2,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,This handsome yet cute boy is up for adoption....,2,4
3,2,3,266,0,2,5,6,0,2,1,2,2,2,1,1,0,41326,This is a stray kitten that came to my house. ...,2,5
4,2,12,264,264,1,1,0,0,2,3,2,2,3,1,1,300,41326,anyone within the area of ipoh or taiping who ...,1,6


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [10]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

Load the data...

In [11]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

Create the model and evaluate it

In [12]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it couldn be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

results = pd.DataFrame(columns=('clf', 'best_acc'))

from sklearn.tree import DecisionTreeClassifier as DT
tree_param = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(1, 2, 5),
              'min_samples_split':(2, 3, 5, 10, 50, 100)}
tree = DT(random_state=42)
tree_clf = GridSearchCV(tree, tree_param, scoring='accuracy', cv=3, iid=False)
tree_clf.fit(X_train.drop(["PID"], axis=1), y_train)
best_tree_clf = tree_clf.best_estimator_
print('Best Decision Tree accuracy: ', tree_clf.best_score_)
print(best_tree_clf)
results = results.append({'clf': best_tree_clf, 'best_acc': tree_clf.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Best Decision Tree accuracy:  0.3519622095560508
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')
The best classifier so far is: 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=100,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best')


**And finally**, we predict the unknown label for the testing set

In [13]:
X.shape, XX.shape

((10582, 360), (4411, 360))

In [14]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

The last thing we do is generating a file that should be *submitted* on kaggle

In [15]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])

In [16]:
submission.to_csv("../data/submission_tree.csv", header=True, index=False)

## SGDClassifier

In [17]:
from sklearn.linear_model import SGDClassifier
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
results = pd.DataFrame(columns=('clf', 'best_acc'))
param_grid = [{'loss': ['hinge', 'log', 'perceptron'], 'learning_rate': ['constant','optimal','invscaling','adaptive'],\
            'eta0': [0.1, 0.01, 0.001],  'alpha': [0.1, 0.01, 0.001]}]
clf = SGDClassifier(random_state=42,tol=1e-3)
grid_search = GridSearchCV(clf, param_grid, scoring='accuracy', cv=5, iid=False,n_jobs=4,verbose = 5)
grid_search.fit(X_train.drop(["PID"], axis=1), y_train)
best_sgd_clf = grid_search.best_estimator_
print('Best SGDClassifier accuracy: ', grid_search.best_score_)
print(best_sgd_clf)
results = results.append({'clf': best_sgd_clf, 'best_acc': grid_search.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.5s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   39.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  2.7min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  4.2min
[Parallel(n_jobs=4)]: Done 540 out of 540 | elapsed:  5.2min finished


Best SGDClassifier accuracy:  0.27881212583567705
SGDClassifier(alpha=0.01, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.1, fit_intercept=True,
       l1_ratio=0.15, learning_rate='adaptive', loss='hinge',
       max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)
The best classifier so far is: 
SGDClassifier(alpha=0.01, average=False, class_weight=None,
       early_stopping=False, epsilon=0.1, eta0=0.1, fit_intercept=True,
       l1_ratio=0.15, learning_rate='adaptive', loss='hinge',
       max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
       penalty='l2', power_t=0.5, random_state=42, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False)


In [18]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [19]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_sgd.csv", header=True, index=False)

## LinearSVC

In [20]:
from sklearn.svm import LinearSVC
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
results = pd.DataFrame(columns=('clf', 'best_acc'))
param_grid = [{'C': [0.001, 0.01, 0.1, 1, 10], 'loss': ['hinge', 'squared_hinge'] }]
linear_svc = LinearSVC(max_iter=10000)
grid_search = GridSearchCV(linear_svc, param_grid, scoring='accuracy', cv=3, iid=False,n_jobs=4,verbose = 5)
grid_search.fit(X_train.drop(["PID"], axis=1), y_train)
best_lin_clf = grid_search.best_estimator_
print('Best LinearSVC accuracy: ', grid_search.best_score_)
print(best_lin_clf)
results = results.append({'clf': best_lin_clf, 'best_acc': grid_search.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  1.3min
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.3min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:  3.3min finished


Best LinearSVC accuracy:  0.3191625641658338
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)
The best classifier so far is: 
LinearSVC(C=0.01, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)




In [21]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [22]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_svc.csv", header=True, index=False)

## RandomForest

In [16]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier()
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
results = pd.DataFrame(columns=('clf', 'best_acc'))
param_grid = [{'n_estimators': [100,200,300,400,500,600,700,800,900,1000], \
               'max_depth': [1,2,3,4,5,6,7,8,9,None], 'min_samples_split': [2,3,4,5,6,7,8,9,10], \
               'bootstrap': [True, False],'criterion': ["gini", "entropy"]}]
grid_search = GridSearchCV(rnd_clf, param_grid, cv=3, scoring='accuracy',n_jobs=4,verbose = 5)
grid_search.fit(X_train.drop(["PID"], axis=1), y_train)
best_rnd_clf = grid_search.best_estimator_
print('Best RandomForest accuracy: ', grid_search.best_score_)
print(best_rnd_clf)
results = results.append({'clf': best_rnd_clf, 'best_acc': grid_search.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 3 folds for each of 3600 candidates, totalling 10800 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:   16.1s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:   38.1s
[Parallel(n_jobs=4)]: Done 280 tasks      | elapsed:  1.2min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 640 tasks      | elapsed:  2.9min
[Parallel(n_jobs=4)]: Done 874 tasks      | elapsed:  4.4min
[Parallel(n_jobs=4)]: Done 1144 tasks      | elapsed:  6.4min
[Parallel(n_jobs=4)]: Done 1450 tasks      | elapsed:  9.0min
[Parallel(n_jobs=4)]: Done 1792 tasks      | elapsed: 12.5min
[Parallel(n_jobs=4)]: Done 2170 tasks      | elapsed: 16.8min
[Parallel(n_jobs=4)]: Done 2584 tasks      | elapsed: 26.2min
[Parallel(n_jobs=4)]: Done 3034 tasks      | elapsed: 31.5min
[Parallel(n_jobs=4)]: Done 3520 tasks      | elapsed: 34.4min
[Parallel(n_jobs=4)]: Done 4042 tasks      | elapsed: 38.7min
[P

Best RandomForest accuracy:  0.37964088024841364
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
The best classifier so far is: 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=10,
            min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)


In [17]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [18]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_rnd.csv", header=True, index=False)

## Multilayer Perceptron

In [26]:
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
results = pd.DataFrame(columns=('clf', 'best_acc'))
param_grid = [{'hidden_layer_sizes': [(10,),(5,5), (2,2)]}]
grid_search = GridSearchCV(mlp_clf, param_grid, cv=3, scoring='accuracy', return_train_score=True, n_jobs=4,verbose = 5)
grid_search.fit(X_train.drop(["PID"], axis=1), y_train)
best_mlp_clf = grid_search.best_estimator_
print('Best MultiLayer Perceptron accuracy: ', grid_search.best_score_)
print(best_mlp_clf)
results = results.append({'clf': best_mlp_clf, 'best_acc': grid_search.best_score_}, ignore_index=True)

print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed:    1.5s remaining:    1.9s
[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed:    2.2s remaining:    1.0s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:    5.2s finished


Best MultiLayer Perceptron accuracy:  0.2770352369380316
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)
The best classifier so far is: 
MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(5, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)


In [27]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [28]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_mlp.csv", header=True, index=False)

## AdaBoostClassifier

In [29]:
from sklearn.ensemble import AdaBoostClassifier
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
results = pd.DataFrame(columns=('clf', 'best_acc'))
ada_clf = AdaBoostClassifier(random_state=42)
param_grid = [{'n_estimators': [50, 100, 500]}]
grid_search = GridSearchCV(ada_clf, param_grid, cv=3, scoring='accuracy', return_train_score=True, n_jobs=4,verbose = 5)
grid_search.fit(X_train.drop(["PID"], axis=1), y_train)
best_ada_clf = grid_search.best_estimator_
print('Best AdaBoostClassifier accuracy: ', grid_search.best_score_)
print(best_ada_clf)
results = results.append({'clf': best_ada_clf, 'best_acc': grid_search.best_score_}, ignore_index=True)
print('The best classifier so far is: ')
print(results.loc[results['best_acc'].idxmax()]['clf'])

Fitting 3 folds for each of 3 candidates, totalling 9 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   4 out of   9 | elapsed:    1.6s remaining:    2.0s
[Parallel(n_jobs=4)]: Done   6 out of   9 | elapsed:    2.4s remaining:    1.1s
[Parallel(n_jobs=4)]: Done   9 out of   9 | elapsed:    8.7s finished


Best AdaBoostClassifier accuracy:  0.36978533819360065
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=42)
The best classifier so far is: 
AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=42)


In [30]:
yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [31]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_ada.csv", header=True, index=False)

## VotingClassifier

In [35]:
from sklearn.ensemble import VotingClassifier
voting_clf = VotingClassifier(estimators=[('tre', best_tree_clf), ('sgd', best_sgd_clf), \
                                          ('lin', best_lin_clf), ('rnd', best_rnd_clf), \
                                          ('mlp', best_mlp_clf), ('ada', best_ada_clf)],voting='hard')
voting_clf.fit(X_train.drop(["PID"], axis=1), y_train)
yy = voting_clf.predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)



In [36]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission_vote.csv", header=True, index=False)

In [43]:
from sklearn.metrics import accuracy_score
y_pred = voting_clf.predict(X_train)
accuracy_score(y_train, yy)

ValueError: Number of features of the model must match the input. Model n_features is 359 and input n_features is 360 

In [49]:
#! pip install tensorflow
#! pip install keras
from keras.layers import Dense 
from keras.models import Sequential 
from keras.utils import to_categorical 

predictors = X.as_matrix() 
target = to_categorical(y)
n_cols = predictors.shape[1] 
model = Sequential() 
model.add(Dense(100, activation='relu', input_shape = (n_cols,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(2, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 
model.fit(predictors, target)

  import sys
W0710 04:26:48.629817  6304 deprecation_wrapper.py:119] From C:\Users\Matías\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0710 04:26:48.647821  6304 deprecation_wrapper.py:119] From C:\Users\Matías\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0710 04:26:48.650806  6304 deprecation_wrapper.py:119] From C:\Users\Matías\Anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0710 04:26:48.679796  6304 deprecation_wrapper.py:119] From C:\Users\Matías\Anaconda3\lib\site-packages\keras\optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0710 04:26:48.694795  6304 deprecation_wrapper.py:119] From C:

ValueError: Error when checking target: expected dense_3 to have shape (2,) but got array with shape (5,)