# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#from ml.visualization import plot_confusion_matrix, plot_learning_curve
from sklearn.datasets import load_wine
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(1234) # Para mayor determinismo


In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

In [3]:
original_df = pd.read_csv('../data/train.csv')


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [4]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = df_test['AdoptionSpeed']
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

In [5]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

In [6]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

### RandomForestClassifier

In [7]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 2, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt','log2']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 8]
# Method of selecting samples for training each tree
bootstrap = [True, False]

criterion = ['gini','entropy']
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap,
               'criterion': criterion
              }
print(random_grid)

{'n_estimators': [2, 224, 446, 668, 890, 1112, 1334, 1556, 1778, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10, 15], 'min_samples_leaf': [1, 2, 4, 8], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']}


In [8]:
X_train.shape, X_valid.shape, y_train.shape, y_valid.shape

((7407, 360), (3175, 360), (7407,), (3175,))

In [9]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 5, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train.drop(["PID"], axis=1), y_train)

RandomizedSearchCV(cv=5, error_score='raise-deprecating',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid='warn', n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [2, 224, 446, 668, 890, 1112, 1334, 1556, 1778, 2000], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10, 15], 'min_samples_leaf': [1, 2, 4, 8], 'bootstrap': [True, False], 'criterion': ['gini', 'entropy']},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train

In [10]:
rf_random.best_params_

{'n_estimators': 1112,
 'min_samples_split': 15,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 30,
 'criterion': 'gini',
 'bootstrap': True}

In [11]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)
    print('Model Performance')
    print('Accuracy = ')
    print(accuracy)
    
    return accuracy

In [12]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train.drop(["PID"], axis=1), y_train)
base_accuracy = evaluate(base_model, X_valid.drop(["PID"], axis=1), y_valid)

Model Performance
Accuracy = 
0.328503937007874


In [13]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_valid.drop(["PID"], axis=1), y_valid)

Model Performance
Accuracy = 
0.3826771653543307


In [14]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 16.49%.


In [31]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [20,25,30,35,40],
    'max_features': ['auto'],
    'min_samples_leaf': [1,2,4,5],
    'min_samples_split': [10,12,14,15,17,19,21],
    'n_estimators': [1112,1000,1050,1150],
    'criterion':['gini']
}

In [32]:
from sklearn.ensemble import RandomForestClassifier

plt.figure(figsize=(14, 4), dpi= 80, facecolor='w', edgecolor='k')

results = pd.DataFrame(columns=('clf', 'best_acc'))
results = []

m = RandomForestClassifier(n_jobs=-1, random_state=42)
model = GridSearchCV(m, param_grid, cv=5, scoring='accuracy')
   
#entrenar los datos de entrenamiento eliminando los PID para entrenar
model.fit(X_train.drop(["PID"], axis=1), y_train)

  
print("Mejor conjunto de parámetros:")
print(model.best_params_, end="\n\n")
    
print("Mejor Estimador:")
print(model.best_estimator_, end="\n\n")
    
print("Mejor Accuracy:")
print(model.best_score_, end="\n\n")



Mejor conjunto de parámetros:
{'bootstrap': True, 'criterion': 'gini', 'max_depth': 30, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 14, 'n_estimators': 1050}

Mejor Estimador:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=30, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=14,
            min_weight_fraction_leaf=0.0, n_estimators=1050, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

Mejor Accuracy:
0.3888213851761847



<Figure size 1120x320 with 0 Axes>

In [33]:
results.append({'clf': model.best_estimator_, 'best_acc': model.best_score_}) #sacar el ignore index para results como lista
results = pd.DataFrame(results)

In [34]:
results

Unnamed: 0,best_acc,clf
0,0.388821,"(DecisionTreeClassifier(class_weight=None, cri..."


In [35]:
X_valid.shape, X_train.shape

((3175, 360), (7407, 360))

In [36]:
y_pred = results.clf.iloc[0].predict(X_valid.drop(["PID"], axis=1))
y_pred = y_pred.astype(np.int)

In [37]:
y_pred

array([4, 4, 4, ..., 1, 2, 2])

In [38]:
y_pred.shape

(3175,)

In [39]:
accuracy_score(y_pred, y_valid)

0.38708661417322837

In [40]:
y_pred_t = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
y_pred_t = y_pred_t.astype(np.int)

In [41]:
submission = pd.DataFrame(list(zip(XX.PID, y_pred_t)), columns=["PID", "AdoptionSpeed"])
submission

Unnamed: 0,PID,AdoptionSpeed
0,1,2
1,2,3
2,7,4
3,9,4
4,11,2
5,12,2
6,25,4
7,33,4
8,34,4
9,35,1


In [42]:
submission.to_csv("../data/submission_rdmf1.csv", header=True, index=False)