# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


#from ml.visualization import plot_confusion_matrix, plot_learning_curve
from sklearn.datasets import load_wine
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(1234) # Para mayor determinismo


In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

In [3]:
original_df = pd.read_csv('../data/train.csv')


Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [4]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

In [5]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

### DecisionTreeRegressor

In [9]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

plt.figure(figsize=(14, 4), dpi= 80, facecolor='w', edgecolor='k')

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

results = pd.DataFrame(columns=('clf', 'best_acc'))
results = []


#HIPERPARAMETROS

exploring_params = {
        'criterion': ['mse','mae'],
        'min_samples_split': range(2, 10)
        }

m = DecisionTreeRegressor(random_state=0)
model = GridSearchCV(m, exploring_params, cv=5, scoring='accuracy')
   
#entrenar los datos de entrenamiento eliminando los PID para entrenar
model.fit(X_train.drop(["PID"], axis=1), y_train)

print("Mejor conjunto de parámetros:")
print(model.best_params_, end="\n\n")
   
print("Mejor Estimador:")
print(model.best_estimator_, end="\n\n")
    
print("Mejor Accuracy:")
print(model.best_score_, end="\n\n")

    


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

<Figure size 1120x320 with 0 Axes>

In [None]:
results.append({'clf': model.best_estimator_, 'best_acc': model.best_score_}) #sacar el ignore index para results como lista
    results = pd.DataFrame(results)
    
    print('The best classifier so far is: ')
    print(results.loc[results['best_acc'].idxmax()]['clf'])


In [None]:
results

In [None]:
X.shape, XX.shape

In [None]:
## prediccion

print("Reporte de clasificación para el mejor clasificador (sobre conjunto de evaluación):", end="\n\n")
yy = results.model.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

In [None]:
submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission-SGDClassifier.csv", header=True, index=False)