# Diplodatos Kaggle Competition

We present this peace of code to create the baseline for the competition, and as an example of how to deal with these kind of problems. The main goals are that you:

1. Learn
1. Try different models and see which one fits the best the given data
1. Get a higher score than the given one in the current baseline example
1. Try to get the highest score in the class :)

In [1]:
# Import the required packages
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# load the given labels
breed = pd.read_csv('../data/breed_labels.csv')
color = pd.read_csv('../data/color_labels.csv')
state = pd.read_csv('../data/state_labels.csv')

Now we take a look at the labels, just to understand what these are

In [3]:
#breed.head()

In [4]:
#breed.shape

In [5]:
#color

In [6]:
#state

And now we are ready to deal with the *original* dataset...

In [7]:
#original_df = pd.read_csv('../data/train.csv')

In [8]:
#original_df.columns

In [9]:
#original_df.describe()

In [10]:
#original_df.shape

In [11]:
#original_df.head()

Create a function to transform the datasets. This is done by means of a function so that the transformations are the same for the training and testing datasets... We replace the encodings just to make it easy to "visualize" the data

In [12]:
def transform_data(train_data_fname, test_data_fname):
    def transform_columns(df):
        df = df.drop(["Description"], axis=1)
        df.Type = df.Type.replace({1: 'Dog', 2: 'Cat'})
        df.Gender = df.Gender.replace({1:'Male', 2:'Female', 3:'Mixed'})
        df.MaturitySize = df.MaturitySize.replace({1:'S', 2:'M', 3:'L', 4:'XL', 0:'N/A'})
        df.FurLength = df.FurLength.replace({1:'S', 2:'M', 3:'L', 0:'N/A'})
        df.Vaccinated = df.Vaccinated.replace({1:'T', 2:'N', 3:'N/A'})
        df.Dewormed = df.Dewormed.replace({1:'T', 2:'F', 3:'N/A'})
        df.Sterilized = df.Sterilized.replace({1:'T', 2:'F', 3:'N/A'})
        df.Health = df.Health.replace({1:'Healthy', 2: 'MinorInjury', 3:'SeriousInjury', 0: 'N/A'})
        df.Color1 = df.Color1.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color2 = df.Color2.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Color3 = df.Color3.replace(dict(list(zip(color.ColorID, color.ColorName)) + [(0, "N/A")]))
        df.Breed1 = df.Breed1.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        df.Breed2 = df.Breed2.replace(dict(list(zip(breed.BreedID, breed.BreedName)) + [(0, "N/A")]))
        return df
    
    df_train = pd.read_csv(train_data_fname)
    df_train = transform_columns(df_train)
    df_test = pd.read_csv(test_data_fname)
    df_test = transform_columns(df_test)
    
    df = pd.concat([df_train, df_test], sort=True)

    # set dummy variables for everything
    # except from Age, Quantity, Fee
    df = pd.get_dummies(df)
    # get train and test back
    n = len(df_train)
    df_train = df.iloc[:n]
    df_test = df.iloc[n:]
    
    y = df_train['AdoptionSpeed']
    X = df_train.drop('AdoptionSpeed', axis=1)
    yy = None
    XX = df_test.drop('AdoptionSpeed', axis=1)

    return X, y, XX, yy

In [13]:
#pd.get_dummies?

Load the data...

In [14]:
#X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")

Create the model and evaluate it

In [15]:
# split training dataset into train and "validation" 
# (we won't be using validation set in this example, because of the cross-validation;
# but it couldn be useful for you depending on your approach)
from sklearn.model_selection import train_test_split
#
#X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)
#results = pd.DataFrame(columns=('clf', 'best_acc'))

In [16]:
#X_train["PID"].isnull().sum()

In [17]:
def predict_petts(classifier, exploring_params, X_train, X_valid, y_train, y_valid, results):
    
    model = GridSearchCV(classifier, exploring_params, scoring='accuracy', cv=3, iid=False , n_jobs=-1)
    model.fit(X_train, y_train)
    best_model_clf = model.best_estimator_
    print("="*100)
    print('Best clasifier accuracy: ', model.best_score_)
    print(best_model_clf)
    results = results.append({'clf': best_model_clf, 'best_acc': model.best_score_}, ignore_index=True)

    print('The best classifier so far is: ')
    print(results.loc[results['best_acc'].idxmax()]['clf'])
    return results

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
#from sklearn.tree import DecisionTreeClassifier
#exploring_params = {'criterion':('gini', 'entropy'), 'min_samples_leaf':(1, 2, 5),
#              'min_samples_split':(2, 3, 5, 10, 50, 100)}
#classifier = DecisionTreeClassifier(random_state=42)
#X_train = X_train.drop(["PID"], axis=1)
#results = predict_petts(classifier, exploring_params, X_train, X_valid, y_train, y_valid, results)


**And finally**, we predict the unknown label for the testing set

In [19]:
#X.shape, XX.shape

In [20]:
#yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
#yy = yy.astype(np.int)

The last thing we do is generating a file that should be *submitted* on kaggle

In [21]:
#submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])

In [22]:
#submission.to_csv("../data/submission.csv", header=True, index=False)

In [23]:
#submission.shape

In [24]:
#submission.head()

### GRADIENT DESCENT : LOGISTIC REGRESSION / PERCEPTRON / SVM

In [25]:
#from sklearn.linear_model import SGDClassifier
#for idx, loss in enumerate(('hinge', 'log', 'perceptron'), start=1):
#    exploring_params = {
#        'learning_rate': ['constant'],
#        'eta0': [0.1, 0.01, 0.001] ,  # Tasa de entrenamiento
#        'alpha': [0.1, 0.01, 0.001]  # Tasa de regularización
#    }
#    classifier = SGDClassifier(loss=loss, tol=1e-3, penalty='l2')
#    results = predict_petts(classifier, exploring_params, X_train, X_valid, y_train, y_valid, results)


### Logistic regression 


In [87]:
X2, y2, XX2, yy2 = transform_data("../data/train.csv", "../data/test.csv")
random_state=0
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=random_state) 
results2 = pd.DataFrame(columns=('clf', 'best_acc'))

# Fitting Logistic Regression To the training set 
from sklearn.linear_model import LogisticRegression   
  
classifier2 = LogisticRegression(random_state=random_state, fit_intercept=True) 

exploring_params2 = {
        #'penalty': ['l1'],
        'penalty': ['l1', 'l2'],
        #'tol': [1e-3],
        'tol': [1e-3, 1e-2, 1e-1],
        #'C': [0.1],
        'C': [1, 0.1, 0.01, 0.0001]
    }
X_train2 = X_train2.drop(["PID"], axis=1)
results2 = predict_petts(classifier2, exploring_params2, X_train2, X_test2, y_train2, y_test2, results2)

yy2 = results2.clf.iloc[0].predict(XX2.drop(["PID"], axis=1))
yy2 = yy2.astype(np.int)

submission2 = pd.DataFrame(list(zip(XX2.PID, yy2)), columns=["PID", "AdoptionSpeed"])
submission2.to_csv("../data/submission4.csv", header=True, index=False)

Best clasifier accuracy:  0.3699181706307913
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False)
The best classifier so far is: 
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False)




### PERCEPTRON

In [72]:
X2, y2, XX2, yy2 = transform_data("../data/train.csv", "../data/test.csv")
random_state=0
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=random_state) 
results2 = pd.DataFrame(columns=('clf', 'best_acc'))

# Fitting Logistic Regression To the training set 
from sklearn.linear_model import Perceptron   
  
classifier2 = Perceptron(random_state=random_state, fit_intercept=True) 

exploring_params2 = {
        #'penalty': ['l1'],
        #'penalty': ['l1', 'l2'],
        #'tol': [1e-3],
        #'tol': [1e-3, 1e-2, 1e-1],
        #'C': [0.1],
        #'C': [1, 0.1, 0.01, 0.0001]
    }
X_train2 = X_train2.drop(["PID"], axis=1)
results2 = predict_petts(classifier2, exploring_params2, X_train2, X_test2, y_train2, y_test2, results2)

yy2 = results2.clf.iloc[0].predict(XX2.drop(["PID"], axis=1))
yy2 = yy2.astype(np.int)

submission2 = pd.DataFrame(list(zip(XX2.PID, yy2)), columns=["PID", "AdoptionSpeed"])
submission2.to_csv("../data/submission3.csv", header=True, index=False)

Best clasifier accuracy:  0.27582207522190993
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=None, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False)
The best classifier so far is: 
Perceptron(alpha=0.0001, class_weight=None, early_stopping=False, eta0=1.0,
      fit_intercept=True, max_iter=None, n_iter=None, n_iter_no_change=5,
      n_jobs=None, penalty=None, random_state=0, shuffle=True, tol=None,
      validation_fraction=0.1, verbose=0, warm_start=False)




### SVM

In [82]:
X2, y2, XX2, yy2 = transform_data("../data/train.csv", "../data/test.csv")
random_state=0
X_train2, X_test2, y_train2, y_test2 = train_test_split(X2, y2, test_size=0.3, random_state=random_state) 
results2 = pd.DataFrame(columns=('clf', 'best_acc'))

# Fitting Logistic Regression To the training set 
from sklearn.svm import LinearSVC
  
classifier2 = LinearSVC() 

exploring_params2 = {
        #'penalty': ['l1'],
        'penalty': ['l2'],
        #'tol': [1e-3],
        'tol': [1e-3, 1e-2, 1e-1, 10],
        #'C': [0.4],
        'C': [1, 0.4, 0.1, 0.01,]
    }
X_train2 = X_train2.drop(["PID"], axis=1)
results2 = predict_petts(classifier2, exploring_params2, X_train2, X_test2, y_train2, y_test2, results2)

yy2 = results2.clf.iloc[0].predict(XX2.drop(["PID"], axis=1))
yy2 = yy2.astype(np.int)

submission2 = pd.DataFrame(list(zip(XX2.PID, yy2)), columns=["PID", "AdoptionSpeed"])
submission2.to_csv("../data/submission3.csv", header=True, index=False)

Best clasifier accuracy:  0.3226493567464566
LinearSVC(C=0.4, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)
The best classifier so far is: 
LinearSVC(C=0.4, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)




### Logistic regression  + PCA

In [86]:
X, y, XX, yy = transform_data("../data/train.csv", "../data/test.csv")
random_state=0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=random_state) 
results = pd.DataFrame(columns=('clf', 'best_acc'))

# Fitting Logistic Regression To the training set 
from sklearn.linear_model import LogisticRegression   
  
    
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler() 
  
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test) 

# performing preprocessing part 
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()
  
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)  


# Applying PCA function on training 
# and testing set of X component 
from sklearn.decomposition import PCA 
  
pca = PCA(n_components=2) 
  
X_train = pca.fit_transform(X_train) 
X_test = pca.transform(X_test) 
  
explained_variance = pca.explained_variance_ratio_ 

    
classifier = LogisticRegression(random_state=random_state, fit_intercept=True) 

exploring_params = {
        #'penalty': ['l1'],
        'penalty': ['l1', 'l2'],
        #'tol': [1e-3],
        'tol': [1e-3, 1e-2, 1e-1],
        #'C': [0.1],
        'C': [1, 0.1, 0.01, 0.0001]
    }



X_train = pd.DataFrame({'PC1':X_train[:,0],'PC2':X_train[:,1]})
results = predict_petts(classifier, exploring_params, X_train, X_test, y_train, y_test, results)

yy = results.clf.iloc[0].predict(XX.drop(["PID"], axis=1))
yy = yy.astype(np.int)

submission = pd.DataFrame(list(zip(XX.PID, yy)), columns=["PID", "AdoptionSpeed"])
submission.to_csv("../data/submission3.csv", header=True, index=False)

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)
  


Best clasifier accuracy:  0.2980936180308626
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False)
The best classifier so far is: 
LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l1', random_state=0, solver='warn',
          tol=0.001, verbose=0, warm_start=False)




ValueError: X has 359 features per sample; expecting 2

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0) 


In [None]:
X_set, y_set = X_train, y_train
X_set.shape


In [None]:
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler() 
  
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test) 

In [None]:
# performing preprocessing part 
from sklearn.preprocessing import StandardScaler 
sc = StandardScaler()
  
X_train = sc.fit_transform(X_train) 
X_test = sc.transform(X_test)

In [None]:
# Applying PCA function on training 
# and testing set of X component 
from sklearn.decomposition import PCA 
  
pca = PCA(n_components = 2) 
  
X_train = pca.fit_transform(X_train) 
X_test = pca.transform(X_test) 
  
explained_variance = pca.explained_variance_ratio_ 


In [None]:
# Fitting Logistic Regression To the training set 
from sklearn.linear_model import LogisticRegression   
  
classifier = LogisticRegression(random_state = 0) 


In [None]:
exploring_params = {
        'penalty': ['l1','l2'],
        'tol': [1e-3, 1e-2, 1e-1],
        'C': [170 ,150, 140, 10]
    }
results = predict_petts(classifier, exploring_params, X_train, X_test, y_train, y_test, results)


In [None]:
classifier.fit(X_train, y_train) 


In [None]:
# Predicting the test set result using  
# predict function under LogisticRegression  
y_pred = classifier.predict(X_test) 

In [None]:
# making confusion matrix between 
#  test set of Y and predicted value. 
from sklearn.metrics import confusion_matrix 
  
cm = confusion_matrix(y_test, y_pred) 

In [None]:
X_set, y_set = X_train, y_train

In [None]:
y_set.shape

In [None]:
X_set.shape

In [None]:
X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, 
                     stop = X_set[:, 0].max() + 1, step = 0.01), 
                     np.arange(start = X_set[:, 1].min() - 1, 
                     stop = X_set[:, 1].max() + 1, step = 0.01)) 


In [None]:


# Predicting the training set 
# result through scatter plot  
from matplotlib.colors import ListedColormap 
plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), 
             X2.ravel()]).T).reshape(X1.shape), alpha = 0.75, 
             cmap = ListedColormap(('yellow', 'white', 'aquamarine'))) 
  
plt.xlim(X1.min(), X1.max()) 
plt.ylim(X2.min(), X2.max()) 
  
for i, j in enumerate(np.unique(y_set)): 
    plt.scatter(X_set[y_set == j, 0], X_set[y_set == j, 1], 
                c = ListedColormap(('red', 'green', 'blue'))(i), label = j) 
  
plt.title('Logistic Regression (Training set)') 
plt.xlabel('PC1') # for Xlabel 
plt.ylabel('PC2') # for Ylabel 
plt.legend() # to show legend 
  
# show scatter plot 
plt.show() 