In [10]:
import sys
import pandas as pd
import numpy as np
from time import time
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, SGDRegressor, Perceptron
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.svm import SVC, SVR
from sklearn.dummy import DummyClassifier, DummyRegressor
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from persistent_cache import memo, PersistentDict as Perd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from ml_metrics import quadratic_weighted_kappa

def eval_wrapper(yhat, y):
    """Score predictions against the true labels with quadratic weighted kappa.

    The predictions are rounded to the nearest integer and clipped to the
    range spanned by the true labels before they are scored.

    :param yhat: predicted values (array-like, may be continuous)
    :param y: true labels (array-like, cast to int)
    :return: whatever ``quadratic_weighted_kappa(predictions, labels)`` returns
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    predictions = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(predictions, labels)

# TODO: check which of these are really worth encoding and which could even be
# dropped (some may contain non-overlapping values between the test and
# training sets).
# Names of the columns treated as categorical, grouped by feature family.
categorical = {
    # Product information
    'Product_Info_1', 'Product_Info_2', 'Product_Info_3',
    'Product_Info_5', 'Product_Info_6', 'Product_Info_7',
    # Employment information
    'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5',
    # Insured information
    'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4',
    'InsuredInfo_5', 'InsuredInfo_6', 'InsuredInfo_7',
    # Insurance history
    'Insurance_History_1', 'Insurance_History_2', 'Insurance_History_3',
    'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8',
    'Insurance_History_9',
    # Family history
    'Family_Hist_1',
    # Medical history
    'Medical_History_1', 'Medical_History_2', 'Medical_History_3',
    'Medical_History_4', 'Medical_History_5', 'Medical_History_6',
    'Medical_History_7', 'Medical_History_8', 'Medical_History_9',
    'Medical_History_10', 'Medical_History_11', 'Medical_History_12',
    'Medical_History_13', 'Medical_History_14', 'Medical_History_15',
    'Medical_History_16', 'Medical_History_17', 'Medical_History_18',
    'Medical_History_19', 'Medical_History_20', 'Medical_History_21',
    'Medical_History_22', 'Medical_History_23', 'Medical_History_24',
    'Medical_History_25', 'Medical_History_26', 'Medical_History_27',
    'Medical_History_28', 'Medical_History_29', 'Medical_History_30',
    'Medical_History_31', 'Medical_History_32', 'Medical_History_33',
    'Medical_History_34', 'Medical_History_35', 'Medical_History_36',
    'Medical_History_37',
    # 'Medical_History_38',
    'Medical_History_39', 'Medical_History_40', 'Medical_History_41',
}

In [11]:
import logging

# Notebook-wide logger; DEBUG on the logger itself filters nothing out.
logger = logging.getLogger("logging_tryout2")
logger.setLevel(logging.DEBUG)

# Record layout "timestamp;LEVEL;message" with second-resolution timestamps.
# formatter = logging.Formatter("%(asctime)s;%(levelname)s;%(message)s")
formatter = logging.Formatter(
    fmt="%(asctime)s;%(levelname)s;%(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Console handler that carries the formatter above.
ch = logging.StreamHandler()
# ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)

# The handler is configured but not attached to the logger: the line below is
# commented out, so `ch` itself emits nothing until it is enabled.
# logger.addHandler(ch)
def info(msg):
    """Log ``msg`` at INFO level on the notebook logger as a single line.

    Every newline in ``msg`` is replaced by two spaces before logging.

    :param msg: the message string to log
    """
    single_line = "  ".join(msg.split("\n"))
    logger.info(single_line)

In [20]:
# Load the raw train/test tables from the working directory.
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# Column medians are computed over train and test together so both frames are
# imputed with the same values. numeric_only=True makes it explicit that
# non-numeric columns do not take part in the median.
total = pd.concat([train, test])
median = total.median(numeric_only=True)

# fillna(..., inplace=True) returns None, so assigning its result would rebind
# the frame to None. Use the value-returning form for both frames.
train = train.fillna(median)
test = test.fillna(median)

# Label-encode every categorical column. The encoder is fitted on the imputed
# train+test values so that each value later passed to transform() is a class
# the encoder has seen.
encoder = LabelEncoder()
for f in categorical:
    encoder.fit(pd.concat([train[f], test[f]]))
    train[f] = encoder.transform(train[f])
    test[f] = encoder.transform(test[f])

In [4]:
# Every test column except the first is used as a feature
# (the first column is presumably the row Id — TODO confirm).
feature_cols = test.columns[1:]

# Positions, within feature_cols, of the columns listed in `categorical`;
# these are the ones the one-hot encoder expands.
categorical_inds = [pos for pos, name in enumerate(feature_cols) if name in categorical]
oh_encoder = OneHotEncoder(categorical_features=categorical_inds)

# Labels and raw feature matrices as plain arrays.
y = np.array(train.Response)
X = np.array(train[feature_cols])
X_actual_test = np.array(test[feature_cols])

# Fit the one-hot encoder on the training features only, then expand both
# matrices and convert the sparse results to dense.
oh_encoder.fit(X)
X = oh_encoder.transform(X).todense()
X_actual_test = oh_encoder.transform(X_actual_test).todense()

# Materialise the 6 stratified folds once so every model sees the same splits.
train_test_folds = list(StratifiedKFold(y, n_folds=6, random_state=0))

In [5]:
@memo(Perd("memo/train_predictions"))
def train_predictions(model):
    """Return out-of-fold predictions of ``model`` for every training row.

    For each (train, test) index pair in ``train_test_folds`` the model is
    refitted on the train rows of ``X``/``y`` and predicts the held-out rows.
    The function is wrapped by ``memo(Perd("memo/train_predictions"))``
    (presumably persistent memoisation — confirm against persistent_cache).

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :return: array of predictions ordered by row index 0..len(y)-1
    """
    out_of_fold = {}
    for fold_no, (fit_rows, held_out_rows) in enumerate(train_test_folds, 1):
        info("fitting fold   " + str(fold_no) + str(model)[:100])
        model.fit(X[fit_rows], y[fit_rows])
        info("fold fitted    " + str(fold_no) + str(model)[:100])
        # Remember each held-out row's prediction under its row index.
        out_of_fold.update(zip(held_out_rows, model.predict(X[held_out_rows])))

    return np.array([out_of_fold[row] for row in range(len(y))])

@memo(Perd("memo/test_predictions"))
def test_predictions(model):
    """Fit ``model`` on the whole training set and predict the actual test set.

    :param model: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :return: the model's predictions for ``X_actual_test``
    """
    model.fit(X, y)
    actual_test_preds = model.predict(X_actual_test)
    return actual_test_preds


def _stacker_fold_predictions(stacker, stacked_X):
    """Return out-of-fold predictions of ``stacker`` on ``stacked_X``.

    Shared by both stacker variants: for each (train, test) index pair in
    ``train_test_folds`` the stacker is refitted on the train rows and predicts
    the held-out rows.

    :param stacker: estimator exposing ``fit(X, y)`` and ``predict(X)``
    :param stacked_X: feature matrix with one row per entry of ``y``
    :return: array of predictions ordered by row index 0..len(y)-1
    """
    ind2pred = {}
    for fold, (fit_rows, held_out_rows) in enumerate(train_test_folds):
        info("fitting stacker fold %s   %s" % (fold, str(stacker)))

        stacker.fit(stacked_X[fit_rows], y[fit_rows])
        info("stacker fitted fold %s    %s " % (fold, str(stacker)))
        ind2pred.update(zip(held_out_rows, stacker.predict(stacked_X[held_out_rows])))
    return np.array([ind2pred[row] for row in range(len(y))])


@memo(Perd("memo/stacker_train_predictions"))
def stacker_train_predictions(stacker, base_clfs):
    """Out-of-fold stacker predictions using ``X`` plus base-model predictions.

    The stacker's input is ``X`` with one extra column per base model, holding
    that model's ``train_predictions``.

    :param stacker: second-level estimator
    :param base_clfs: iterable of first-level estimators
    :return: array of stacker predictions ordered by training row index
    """
    info("start stacker --------------------------")
    n = len(y)
    stacked_X = np.hstack([X] + [train_predictions(clf).reshape(n, 1) for clf in base_clfs])
    info("base regressors done")
    preds = _stacker_fold_predictions(stacker, stacked_X)
    info("stacker done =========================")
    return preds

@memo(Perd("memo/lazy_stacker_train_predictions"))
def lazy_stacker_train_predictions(stacker, base_clfs):
    """Out-of-fold stacker predictions using only the base-model predictions.

    Unlike ``stacker_train_predictions`` the original features ``X`` are not
    part of the stacker's input; it sees one column per base model.

    :param stacker: second-level estimator
    :param base_clfs: iterable of first-level estimators
    :return: array of stacker predictions ordered by training row index
    """
    info("start stacker --------------------------")
    n = len(y)
    stacked_X = np.hstack([train_predictions(clf).reshape(n, 1) for clf in base_clfs])
    info("base regressors done")
    preds = _stacker_fold_predictions(stacker, stacked_X)
    info("stacker done =========================")
    return preds


@memo(Perd("memo/stacker_test_predictions"))
def stacker_test_predictions(stacker, base_clfs):
    """Fit the stacker on all training rows and predict the actual test set.

    Training input is ``X`` plus one column of ``train_predictions`` per base
    model; test input is ``X_actual_test`` plus one column of
    ``test_predictions`` per base model.

    :param stacker: second-level estimator
    :param base_clfs: iterable of first-level estimators
    :return: the stacker's predictions for the actual test rows
    """
    n_train = len(y)
    train_columns = [train_predictions(clf).reshape(n_train, 1) for clf in base_clfs]
    stacker.fit(np.hstack([X] + train_columns), y)

    n_test = X_actual_test.shape[0]
    test_columns = [test_predictions(clf).reshape(n_test, 1) for clf in base_clfs]
    return stacker.predict(np.hstack([X_actual_test] + test_columns))

In [6]:
def benchmark(model):
    """Return the ``eval_wrapper`` score of ``model``'s ``train_predictions`` against ``y``."""
    return eval_wrapper(train_predictions(model), y)

def make_predictions(model):
    """Fit ``model`` on all of ``X``/``y`` and predict ``X_actual_test``.

    Same steps as ``test_predictions`` but without the ``memo`` decorator.
    """
    model.fit(X, y)
    actual_test_preds = model.predict(X_actual_test)
    return actual_test_preds

def benchmark_stacker(model, base_clfs):
    """Return the ``eval_wrapper`` score of ``stacker_train_predictions`` against ``y``."""
    return eval_wrapper(stacker_train_predictions(model, base_clfs), y)


def benchmark_lazy_stacker(model, base_clfs):
    """Return the ``eval_wrapper`` score of ``lazy_stacker_train_predictions`` against ``y``."""
    return eval_wrapper(lazy_stacker_train_predictions(model, base_clfs), y)


In [7]:
# Zero-argument factories: each call returns a fresh, unfitted estimator.

def xgbr():
    """Return a new XGBRegressor with this notebook's fixed hyper-parameters."""
    return XGBRegressor(objective="reg:linear", min_child_weight=80, subsample=0.85,
                        colsample_bytree=0.30, silent=1, max_depth=9)


def xgbc():
    """Return a new XGBClassifier with this notebook's fixed hyper-parameters."""
    # NOTE(review): a regression objective on a classifier looks unintended —
    # confirm before relying on this factory.
    return XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85,
                         colsample_bytree=0.30, silent=1, max_depth=9)


def rfr():
    """Return a new 400-tree RandomForestRegressor."""
    return RandomForestRegressor(n_estimators=400)


def etr():
    """Return a new 400-tree ExtraTreesRegressor."""
    return ExtraTreesRegressor(n_estimators=400)


def etc():
    """Return a new 400-tree ExtraTreesClassifier."""
    return ExtraTreesClassifier(n_estimators=400)


def sgdr():
    """Return a new SGDRegressor with default settings."""
    return SGDRegressor()


def perceptron():
    """Return a new Perceptron with default settings."""
    return Perceptron()


def dream_team():
    """Return a fresh list of base estimators in a reproducible order.

    Estimators define no ordering of their own. A bare ``sorted()`` therefore
    leaves the relative order of same-class estimators (the four SVRs)
    arbitrary on Python 2 and raises TypeError on Python 3. Sorting on the
    estimator's string form gives the same order on every run.
    """
    members = [xgbr(), rfr(), etr(), sgdr(), LinearRegression(), perceptron(),
               SVR(kernel="linear"), SVR(kernel="poly"), SVR(kernel="sigmoid"),
               SVR(kernel="rbf"), LogisticRegression()]
    return sorted(members, key=str)

In [9]:
# Run the cross-validated prediction routine with a DummyClassifier baseline;
# the bare expression makes the resulting array the cell output.
train_predictions(DummyClassifier())

array([6, 1, 1, ..., 4, 8, 6])

In [32]:
print("Load the data using pandas")
xtrain = pd.read_csv("train.csv")
xtest = pd.read_csv("test.csv")

# combine train and test
# Use the xtest frame loaded just above. The name `test` belongs to the other
# preprocessing pipeline in this notebook and is not the raw test table.
xall_data = pd.concat([xtrain, xtest])

# factorize categorical variables
xall_data['Product_Info_2'] = pd.factorize(xall_data['Product_Info_2'])[0]

print('Eliminate missing values')
# Use -1 for any others
# This also turns the missing Response of the test rows into -1, which the
# split below relies on.
xall_data = xall_data.fillna(-1)

# fix the dtype on the label column
xall_data['Response'] = xall_data['Response'].astype(int)

# Provide split column
# all_data['Split'] = np.random.randint(5, size=all_data.shape[0])

# split train and test
# Rows with a positive Response go to train; rows with Response < 1 (the -1
# filled in above) go to test.
xtrain = xall_data[xall_data['Response']>0].copy()
xtest = xall_data[xall_data['Response']<1].copy()

Load the data using pandas
Eliminate missing values


In [38]:
# Feature matrices for the second pipeline: every column except Id and the label.
xftrain = np.array(xtrain.drop(["Id", "Response"], axis=1))
# The test matrix is built from xtest, not from xtrain.
xftest = np.array(xtest.drop(["Id", "Response"], axis=1))

In [39]:
# Alias for the label array `y` (built from train.Response in the first pipeline).
xy = y

In [59]:
# Display the Response labels of the second pipeline's training frame.
np.array(xtrain.Response)

array([8, 4, 8, ..., 8, 8, 7])

In [41]:
# Quick experiment: fit a 400-tree random forest regressor on the first n rows.
n = 1000
model = RandomForestRegressor(n_estimators=400)
# xftrain comes from the second loading pipeline while y comes from
# train.Response; assumes both keep the same row order — TODO confirm.
model.fit(xftrain[:n, :], y[:n])

RandomForestRegressor(bootstrap=True, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=400, n_jobs=1,
           oob_score=False, random_state=None, verbose=0)

In [42]:
# Quick experiment: fit an ordinary least-squares model on the first n rows.
# Only the LinearRegression is fitted here; no other estimator is constructed.
n = 1000
model = LinearRegression()

# The bare fit(...) expression makes the fitted model's repr the cell output.
model.fit(xftrain[:n, :], y[:n])


LinearRegression(copy_X=True, fit_intercept=True, normalize=False)

In [63]:
# Mean squared error of each listed model on the (up to) first n rows, measured
# on the same rows the model was fitted on.
# The recorded run of this cell reported a floating-point under-/overflow from
# SGD and suggested scaling the inputs (StandardScaler / MinMaxScaler).
n = 100000
for model in [sgdr()]:
    model.fit(xftrain[:n, :], y[:n])
    preds = model.predict(xftrain[:n, :])
    # print(...) as a single call gives the same output on Python 2 and 3. The
    # statement form `print (x).mean()` fails on Python 3.
    print(((preds - y[:n]) ** 2).mean())

ValueError: Floating-point under-/overflow occurred at epoch #1. Scaling input data with StandardScaler or MinMaxScaler might help.

3.48495861181


In [49]:
# Mean squared error of a default random forest classifier, measured on the
# same rows it was fitted on (so this is training error, not validation error).
model = RandomForestClassifier()
model.fit(xftrain, y)
preds = model.predict(xftrain)
# print(...) as a single call gives the same output on Python 2 and 3. The
# statement form `print (x).mean()` fails on Python 3.
print(((preds - y) ** 2).mean())

0.112510735757


In [None]:
@staticmethod
    def decider(all_scores):
        """
        Each bit from left to right represents the list below,
        1. director different from company name
        2. director same as company name x
        3. url (primary|secondary)
        4. infoboxcompany (proof of being a company)
        5. company_name > X similarity
        6. company_name_match

        The way the regex is put together represents what different combinations can pass,

        A. Director different from company name
        B. Director same as company name and (url (primary or secondary) or proof of it being a company)
        C. url(primary or secondary) and company name > X similarity with titles/redirect titles
        D. company_name_match and proof of it being a company
        """

        acceptance_criteria = [
            re.compile(ur'1.....'),
            re.compile(ur'.1(10|01|11)..'),
            re.compile(ur'..1.1.'),
            # re.compile(ur'...1.1')
        ]

        ranking_titles = [
            'Distinct director name(different from company name)',
            'Director name equal to company name and (company url or proof of being a company)',
            'Url match and company name similar to wikipedia titles/redirect titles or company parent from infobox',
            'Company name matches wikipedia titles/redirect titles and the page can be a company page, and incorporation date is in infobox'
        ]