## Place helper functions on top

In [51]:
def one_of_k_encoding(cols, df):
    """ One-of-K (one-hot) encoding of the given categorical columns.

        Each listed column that exists in `df` is replaced by the indicator
        columns produced by `pd.get_dummies(df[col], prefix=col)`, which are
        appended at the end of the frame. Listed columns that are not
        present in `df` are silently skipped.

        Input:    cols [list or tuple of column names]
                  df   [pd.DataFrame] (not modified)
        Output:   updated df; if none of the listed columns is present,
                  `df` itself is returned
        Raises:   TypeError if cols is not a list or tuple (e.g. a bare
                  string, which would otherwise be iterated per character)
    """
    # Explicit check instead of `assert`: asserts vanish under `python -O`,
    # and isinstance also admits list subclasses and tuples.
    if not isinstance(cols, (list, tuple)):
        raise TypeError("Expecting a list.")
    for col in cols:
        if col not in df.columns:  # Silently skip if not present
            continue
        dummies = pd.get_dummies(df[col], prefix=col)
        # Rebind to a freshly built frame rather than dropping in place, so
        # the caller's DataFrame is never mutated.
        df = pd.concat([df.drop(col, axis=1), dummies], axis=1)
    return df


def load_titanic(file):
    """ Load a CSV file into a DataFrame.

        Input:    file [str] path to the CSV file
        Output:   pd.DataFrame with the parsed file contents, or an empty
                  pd.DataFrame (after a message on stderr) if `file` is
                  not an existing regular file
    """
    if not os.path.isfile(file):
        # No exception is active at this point, so sys.exc_info()[1] would
        # only be None; report the actual problem instead.
        print("File not found: {0}".format(file), file=sys.stderr)
        return pd.DataFrame()
    return pd.read_csv(file)

## Place derived classes here

In [None]:
class RandomStackedEnsembleClassifier():
    """ Placeholder class; no behaviour is implemented yet. """

    def __init__(self):
        # `def __init__:` without a parameter list is a syntax error;
        # the constructor must at least accept `self`.
        pass
    
    
class RandomBlendingClassifier():
    """ Placeholder class; no behaviour is implemented yet. """

    def __init__(self):
        # `def __init__:` without a parameter list is a syntax error;
        # the constructor must at least accept `self`.
        pass
    

class TunedStackedEnsembleClassifier():
    """ Skeleton of an ensemble classifier; the step that chooses the
        classifiers is not implemented yet.

        Input:    n_classifier [int]   number of clfs to draw from pool
                  use_proba    [bool]  use probabilities
    """

    def __init__(self, n_classifier=5, use_proba=False):
        # The options belong on the constructor, not in the class header
        # (where they would be passed to the metaclass as keyword arguments).
        self.n_classifier = n_classifier # Choose number of clfs to draw from pool
        self.use_proba = use_proba # Use probabilities
        # Draw 'n_classifier' classifiers from space of available classifiers
        n_sampled = 0
        # Bounded loop: the counter advances once per (future) draw, so the
        # constructor always terminates, also for n_classifier <= 0.
        while n_sampled < n_classifier:
            # TODO: Implement choosing here.
            n_sampled += 1
        

## For now this cell contains all variables that govern program flow

In [3]:
# Switches that govern program flow.
classification = True
# Both task-type flags start out disabled.
binary_classification = False
multiclass_classification = False

In [14]:
import os
import sys
import sklearn
import numpy as np
import pandas as pd
# Show the scikit-learn and pandas versions in use.
print(sklearn.__version__, pd.__version__)

0.17.1 0.16.2


In [63]:
# Load the training data and stop early if nothing could be read.
data = load_titanic('data/train.csv')
if data.shape[0] == 0:
    # No exception is active here, so sys.exc_info()[1] would be None and
    # sys.exit(None) would terminate silently with status 0. Exit with a
    # message (and therefore a non-zero status) instead.
    sys.exit("No training data could be loaded from 'data/train.csv'.")

# Remove the target from the frame *before* binding the features, so that
# 'Survived' can never remain among the feature columns (whether
# `data.loc[:, :]` hands back the same object or a new one depends on the
# pandas version).
y_train = np.array(data.pop('Survived'))
X_train = data  # NOTE: same object as `data`; the drop below affects both

# Derive the kind of classification task from the number of distinct labels.
n_classes = len(np.unique(y_train))
if n_classes == 2:
    binary_classification = True
elif n_classes > 2:
    multiclass_classification = True

uninformative = ['PassengerId', 'Ticket']
X_train.drop(uninformative, axis=1, inplace=True)

# (column, dtype) pairs; not used anywhere in this cell -- presumably the
# intended dtype per remaining column, TODO confirm.
# `float` replaces `np.float`, a plain alias of the builtin that newer NumPy
# releases no longer provide.
information = [('Pclass', 'category'), ('Name', str), ('Sex', 'category'),
               ('Age', np.int8), ('SibSp', 'category'), ('Parch', 'category'),
               ('Fare', float), ('Cabin', str), ('Embarked', 'category')
]

In [64]:
# Replace the 'Sex' and 'Embarked' columns by their one-of-K encoded columns.
X_train = one_of_k_encoding(['Sex', 'Embarked'], X_train)

In [66]:
# Display the feature-frame shape next to the number of target values.
X_train.shape, len(y_train.ravel())

((891, 12), 891)

# Things to consider

We have different types of learning:
* Classification (binary, multiclass)
* Regression
* Unsupervised learning

Some methods calculate probabilities, others do not (but output might be coerced into the correct form)

In [4]:
# Start by importing supervised learning algorithms for classification
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier,  \
GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier

In [8]:
# Pool of candidate classifiers; all use default settings except SVC.
classifiers = [
    LogisticRegression(),
    BernoulliNB(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    AdaBoostClassifier(),
    ExtraTreesClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
    LinearSVC(),
    SVC(kernel="rbf", C=0.025, probability=True),
    KNeighborsClassifier(),
    MLPClassifier(),
]

# List the class name of every classifier in the pool.
for clf in classifiers:
    # Fitting is currently commented out:
    ##clf.fit(X_train, y_train)
    name = type(clf).__name__
    print(name)
    
# To do:
# * Plot accuracy (or some other comparative measure) as a comparative bar chart
# * Predict out-of-fold and compute the Pearson correlation between different classifier predictions:
#   p(x,y) = cov(x,y) / (sigma_x * sigma_y)

# * Choose an ensemble consisting of the N least correlated classifiers (best effect on overall performance)
# * try different methods of ensembling: boosting, bagging, model stacking/blending
#   See this post: https://mlwave.com/kaggle-ensembling-guide/
#
# * Consider model compression (Caruana et al paper): NN 

LogisticRegression
BernoulliNB
GaussianNB
LinearDiscriminantAnalysis
QuadraticDiscriminantAnalysis
AdaBoostClassifier
ExtraTreesClassifier
DecisionTreeClassifier
RandomForestClassifier
GradientBoostingClassifier
LinearSVC
SVC
KNeighborsClassifier
MLPClassifier


# Important points to remember:

* Multi-layer Perceptron is sensitive to feature scaling, so it is highly recommended to scale your data. For example, scale each attribute on the input vector X to [0, 1] or [-1, +1], or standardize it to have mean 0 and variance 1. Note that you must apply the same scaling to the test set for meaningful results. You can use StandardScaler for standardization.

Attempt to train a single algorithm (perhaps one of the most expensive to train, such as an SVM) on the full dataset. If this takes too long, start by down-sampling the data until it becomes manageable.