In [7]:
MEMO_PATH = "memo_fe/"

#=========================================LOGGING
import logging
# create logger
logging.basicConfig(filename='best_ensemble.log',level=logging.DEBUG, format="%(asctime)s; %(levelname)s;  %(message)s")
logger = logging.getLogger("trainlo")
logger.setLevel(logging.DEBUG)

def info(msg):
    """Write `msg` to the module logger at INFO level as a single line.

    Every newline in `msg` is replaced by two spaces before logging.
    """
    single_line = "  ".join(msg.split("\n"))
    logger.info(single_line)
#=========================================LOGGING

import sys
import pandas as pd
import numpy as np
from time import time
from scipy.optimize import fmin_powell
from sklearn.cross_validation import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.dummy import DummyClassifier, DummyRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression, SGDClassifier, \
    SGDRegressor, Perceptron, PassiveAggressiveRegressor, BayesianRidge, Lasso
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC, SVR
from xgboost.sklearn import XGBClassifier, XGBRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from persistent_cache import memo, PersistentDict as Perd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from ml_metrics import quadratic_weighted_kappa
from feature_engineering import train_test_sets

def eval_wrapper(yhat, y):
    """Score predictions `yhat` against labels `y` with quadratic weighted kappa.

    Labels are cast to int. Predictions are rounded, clipped to the
    [min, max] range of the labels and cast to int before scoring.
    """
    labels = np.array(y).astype(int)
    lowest, highest = np.min(labels), np.max(labels)
    rounded = np.round(np.array(yhat))
    discrete = np.clip(rounded, lowest, highest).astype(int)
    return quadratic_weighted_kappa(discrete, labels)


num_classes = 8
# print("Load the data using pandas")
# train = pd.read_csv("train.csv")
# test = pd.read_csv("test.csv")

# # combine train and test
# all_data = train.append(test)

# # factorize categorical variables    
# all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]

# # FEATURE ENGINEERING
# all_data['bmi_ins_age'] = all_data.BMI * all_data.Ins_Age
# all_data['nan_count'] = all_data.isnull().sum(axis=1)
# #all_data['emp_inf_4_sq'] = all_data.Employment_Info_4 ** 2
# #all_data['fam_hist_4_sq'] = all_data.Family_Hist_4 ** 2
# #all_data['fam_hist_2_sq'] = all_data.Family_Hist_2 ** 2

# mk = [col for col in train.columns if col.startswith("Medical_K")]
# all_data['sum_keywords'] = sum(train[col] for col in mk)

# all_data.drop('Medical_History_24')
# all_data.drop('Medical_History_10')



# print('Eliminate missing values')    
# # Use -1 for any others
# all_data.fillna(-1, inplace=True)

# # fix the dtype on the label column
# all_data['Response'] = all_data['Response'].astype(int)

# # Provide split column
# # all_data['Split'] = np.random.randint(5, size=all_data.shape[0])

# # split train and test
# train = all_data[all_data['Response']>0].copy()
# test = all_data[all_data['Response']<1].copy()


# X = np.array(train.drop(["Id", "Response"], axis=1))
# X_actual_test = np.array(test.drop(["Id", "Response"], axis=1))
# y = np.array(train.Response)






# Training labels: the Response column of train.csv, read once at import time.
y = np.array(pd.read_csv("train.csv").Response)
# The 4 stratified folds are materialised into a list so that every function
# below iterates over the same (train, test) index pairs.
train_test_folds = list(StratifiedKFold(y, n_folds=4, random_state=0))
#================================================================================================
@memo(Perd(MEMO_PATH + "_train_predictions"))
def train_predictions(model, fe):
    """Out-of-fold predictions of `model` for every training row.

    For each (train, test) pair in `train_test_folds` the model is fitted on
    the train indices of feature set `fe` and used to predict the test
    indices. Returns an array with one prediction per entry of `y`, in row order.
    """
    features, _ = train_test_sets(fe)
    held_out = {}
    for fold_no, (fit_idx, eval_idx) in enumerate(train_test_folds, 1):
        info("fitting fold   " + str(fold_no) + str(model)[:100])
        model.fit(features[fit_idx], y[fit_idx])
        info("fold fitted    " + str(fold_no) + str(model)[:100])
        fold_preds = model.predict(features[eval_idx])
        held_out.update(zip(eval_idx, fold_preds))

    return np.array([held_out[row] for row in range(len(y))])

@memo(Perd(MEMO_PATH + "_test_predictions"))
def test_predictions(model, fe):
    """Fit `model` on the whole training set of feature set `fe` and predict the test set."""
    train_X, holdout_X = train_test_sets(fe)
    info("fitting (on full train set) %s" % model)
    model.fit(train_X, y)
    info("done fitting for %s" % model)
    holdout_preds = model.predict(holdout_X)
    return holdout_preds


@memo(Perd(MEMO_PATH + "_stacker_train_predictions"))
def stacker_train_predictions(stacker, base_clfs, fe):
    """Out-of-fold predictions of `stacker` trained on features plus base-model predictions.

    The stacker input is the feature matrix of `fe` with one extra column per
    model in `base_clfs`, holding that model's `train_predictions`. The stacker
    is then fitted and evaluated fold by fold over `train_test_folds`.
    """
    features, _ = train_test_sets(fe)
    info("start stacker --------------------------")
    n_rows = len(y)
    base_columns = [train_predictions(clf, fe).reshape(n_rows, 1) for clf in base_clfs]
    stacked = np.hstack([features] + base_columns)
    info("base regressors done")
    held_out = {}
    for fold_no, (fit_idx, eval_idx) in enumerate(train_test_folds):
        info("fitting stacker fold %s   %s" % (fold_no, str(stacker)))

        stacker.fit(stacked[fit_idx], y[fit_idx])
        info("stacker fitted fold %s    %s " % (fold_no, str(stacker)))
        held_out.update(zip(eval_idx, stacker.predict(stacked[eval_idx])))
    info("stacker done =========================")
    return np.array([held_out[row] for row in range(len(y))])

@memo(Perd(MEMO_PATH + "_lazy_stacker_train_predictions"))
def lazy_stacker_train_predictions(stacker, base_clfs, fe):
    """Out-of-fold predictions of `stacker` trained on base-model predictions only.

    Unlike `stacker_train_predictions`, the stacker input has no raw feature
    columns: it is just one column of `train_predictions` per model in `base_clfs`.
    """
    info("start stacker --------------------------")
    n_rows = len(y)
    base_columns = [train_predictions(clf, fe).reshape(n_rows, 1) for clf in base_clfs]
    stacked = np.hstack(base_columns)
    info("base regressors done")
    held_out = {}
    for fold_no, (fit_idx, eval_idx) in enumerate(train_test_folds):
        info("fitting stacker fold %s   %s" % (fold_no, str(stacker)))

        stacker.fit(stacked[fit_idx], y[fit_idx])
        info("stacker fitted fold %s    %s " % (fold_no, str(stacker)))
        held_out.update(zip(eval_idx, stacker.predict(stacked[eval_idx])))
    info("stacker done =========================")
    return np.array([held_out[row] for row in range(len(y))])


@memo(Perd(MEMO_PATH + "_stacker_test_predictions"))
def stacker_test_predictions(stacker, base_clfs, fe):
    """Fit `stacker` on the full training set and predict the test set.

    The training input is the feature matrix of `fe` plus one column of
    out-of-fold `train_predictions` per model in `base_clfs`. The test input is
    the test feature matrix plus one column of `test_predictions` per model.
    Returns the stacker's predictions for the test rows.
    """
    # Load the matrices here: the function used to read module-level
    # X / X_actual_test, which are not defined anywhere in this file, so every
    # call raised NameError.
    X, X_actual_test = train_test_sets(fe)
    n = len(y)
    stacked_X = np.hstack([X] + [train_predictions(clf, fe).reshape(n, 1) for clf in base_clfs])
    stacker.fit(stacked_X, y)
    nn = X_actual_test.shape[0]
    stacked_test_X = np.hstack([X_actual_test] + [test_predictions(clf, fe).reshape(nn, 1) for clf in base_clfs])
    return stacker.predict(stacked_test_X)
#============================================================================
def benchmark(model, fe):
    """Return the `eval_wrapper` score of `model`'s out-of-fold predictions on feature set `fe`."""
    return eval_wrapper(train_predictions(model, fe), y)

def make_predictions(model, fe):
    """Fit `model` on all training rows of feature set `fe` and return its test-set predictions."""
    train_X, holdout_X = train_test_sets(fe)
    fitted = model.fit(train_X, y)
    # Predict through `model` itself, not the value returned by fit().
    del fitted
    return model.predict(holdout_X)

def benchmark_stacker(model, base_clfs, fe):
    """Score the out-of-fold predictions of stacker `model`, log the score and return it."""
    score = eval_wrapper(stacker_train_predictions(model, base_clfs, fe), y)
    message = "stacker %.4f   %s, %s, feats = %s" % (score, model, base_clfs, fe)
    info(message)
    return score

def benchmark_lazy_stacker(model, base_clfs, fe):
    """Score the out-of-fold predictions of the lazy stacker `model`, log the score and return it."""
    score = eval_wrapper(lazy_stacker_train_predictions(model, base_clfs, fe), y)
    message = "lazy stacker %.4f   %s, %s  feats = %s" % (score, model, base_clfs, fe)
    info(message)
    return score
#==============================================================================
# OPTIMISING OFFSETS
# -----------------------------------------------------------------------------
def apply_offset(data, bin_offset, sv, scorer=eval_wrapper):
    """Shift the predictions that fall in bin `sv` by `bin_offset` and score the result.

    `data` has three rows: raw predictions (0), offset predictions (1) and
    labels (2). Row 1 is updated in place for the columns whose raw prediction
    truncates to `sv`. Returns `scorer(row 1, row 2)`.
    """
    in_bin = data[0].astype(int) == sv
    data[1, in_bin] = data[0, in_bin] + bin_offset
    return scorer(data[1], data[2])


def optimize_offsets(predictions, y):
    """Find one additive offset per prediction bin with Powell's method.

    Starts every offset at -0.5. For each bin in range(num_classes) it
    minimises the negated `apply_offset` score and stores the result.
    Returns the array of `num_classes` offsets.
    """
    info("optimising offsets %s" % len(y))
    offsets = np.zeros(num_classes) - 0.5
    # Rows: raw predictions, working copy that receives the offsets, labels.
    workspace = np.vstack((predictions, predictions, y))

    for bin_id in range(num_classes):
        def negated_score(shift):
            return -apply_offset(workspace, shift, bin_id)
        offsets[bin_id] = fmin_powell(negated_score, offsets[bin_id], disp=False)
    info("done optimising offsets %s" % len(y))
    return offsets

def actually_apply_offsets(predictions, offsets):
    """Apply per-bin `offsets` to `predictions` and return integer classes.

    Each prediction is shifted by the offset of the bin its raw value
    truncates to. The result is clipped to [1, 8], rounded and cast to int.
    """
    placeholder_labels = -np.ones(len(predictions))
    table = np.vstack((predictions, predictions, placeholder_labels))
    for bin_id in range(num_classes):
        in_bin = table[0].astype(int) == bin_id
        table[1, in_bin] = table[0, in_bin] + offsets[bin_id]

    clipped = np.clip(table[1], 1, 8)
    return np.round(clipped).astype(int)

def optimized_train_predictions(raw_train_predictions):
    """Offset-corrected out-of-fold predictions for the training rows.

    For each (train, test) pair in `train_test_folds`, offsets are optimised on
    the raw predictions of the train indices and then applied to the raw
    predictions of the test indices. Returns one integer prediction per entry
    of `y`, in row order.
    """
    # The unused `n` and the enumerate index (which the inner loop variable
    # shadowed) have been removed; the results are unchanged.
    ind2pred = {}
    for train, test in train_test_folds:
        offsets = optimize_offsets(raw_train_predictions[train], y[train])
        test_preds = actually_apply_offsets(raw_train_predictions[test], offsets)
        for row, pred in zip(test, test_preds):
            ind2pred[row] = pred
    return np.array([ind2pred[row] for row in range(len(y))])

def benchmark_model_optimized(model, fe):
    """Score `model`'s offset-corrected out-of-fold predictions, log the score and return it."""
    raw = train_predictions(model, fe)
    score = eval_wrapper(optimized_train_predictions(raw), y)
    info("optimized %.4f   %s  %s" % (score, model, fe))
    return score

def benchmark_optimized_stacker(stacker, base_clfs, fe):
    """Score the stacker's offset-corrected out-of-fold predictions, log the score and return it."""
    raw = stacker_train_predictions(stacker, base_clfs, fe)
    corrected = optimized_train_predictions(raw)
    score = eval_wrapper(corrected, y)
    message = "optimized stacker %.4f   %s, %s,  feats=%s" % (score, stacker, base_clfs, fe)
    info(message)
    return score

def benchmark_optimized_lazy_stacker(stacker, base_clfs, fe):
    """Score the lazy stacker's offset-corrected out-of-fold predictions.

    Uses `lazy_stacker_train_predictions` (base-model predictions only), applies
    `optimized_train_predictions`, logs the score and returns it.
    """
    preds = lazy_stacker_train_predictions(stacker, base_clfs, fe)
    opreds = optimized_train_predictions(preds)
    result = eval_wrapper(opreds, y)
    # The label says "lazy" so these log lines can be told apart from those of
    # benchmark_optimized_stacker; the label used to be a copy of that one.
    info("optimized lazy stacker %.4f   %s, %s   feats=%s" % (result, stacker, base_clfs, fe))
    return result

def optimized_test_predictions(stacker, base_clfs, fe):
    """Offset-corrected test-set predictions of the stacker.

    Offsets are optimised on the stacker's out-of-fold training predictions
    against `y`, then applied to the stacker's test-set predictions.
    """
    oof_preds = stacker_train_predictions(stacker, base_clfs, fe)
    learned_offsets = optimize_offsets(oof_preds, y)
    raw_test_preds = stacker_test_predictions(stacker, base_clfs, fe)
    corrected = actually_apply_offsets(raw_test_preds, learned_offsets)
    info("made optimized predictions for stacker %s with features %s and base regressors %s" % (stacker, fe, base_clfs))
    return corrected

def make_sub_optimized(stacker, base_clfs, fe, filename):
    """Write a submission CSV (columns Id, Response) of offset-corrected stacker predictions.

    Predictions come from `optimized_test_predictions`; the file is written to
    `filename` without the index column.
    """
    preds = optimized_test_predictions(stacker, base_clfs, fe)
    # The ids are read from test.csv because no `test` frame exists at module
    # level; the old `test.Id` lookup raised NameError.
    # NOTE(review): assumes the prediction rows follow test.csv row order - confirm
    # against the feature extractors.
    test_ids = pd.read_csv("test.csv").Id.values
    df = pd.DataFrame()
    df['Id'] = test_ids
    df['Response'] = preds
    df.to_csv(filename, index=False)
    # Logged after the write so the message is only emitted once the file exists.
    info("made submission to file %s. stacker %s, features %s" % (filename, stacker, fe))
    
def make_sub(stacker, base_clfs, fe, filename):
    """Write a submission CSV (columns Id, Response) of raw stacker test predictions.

    Predictions come from `stacker_test_predictions` with no offset correction;
    the file is written to `filename` without the index column.
    """
    preds = stacker_test_predictions(stacker, base_clfs, fe)

    # The ids are read from test.csv because no `test` frame exists at module
    # level; the old `test.Id` lookup raised NameError.
    # NOTE(review): assumes the prediction rows follow test.csv row order - confirm
    # against the feature extractors.
    test_ids = pd.read_csv("test.csv").Id.values
    df = pd.DataFrame()
    df['Id'] = test_ids
    df['Response'] = preds
    info("making stacker %s, nonoptimized submission to file %s " % (stacker, filename))
    df.to_csv(filename, index=False)
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

# Zero-argument factories: each call builds a brand-new, unfitted estimator.

def xgbr():
    """New XGBRegressor with this file's fixed hyper-parameters."""
    return XGBRegressor(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9)

def xgbc():
    """New XGBClassifier with the same hyper-parameters as `xgbr`."""
    return XGBClassifier(objective="reg:linear", min_child_weight=80, subsample=0.85, colsample_bytree=0.30, silent=1, max_depth=9)

def rfr():
    """New RandomForestRegressor with 400 trees."""
    return RandomForestRegressor(n_estimators=400)

def etr():
    """New ExtraTreesRegressor with 400 trees."""
    return ExtraTreesRegressor(n_estimators=400)

def etc():
    """New ExtraTreesClassifier with 400 trees."""
    return ExtraTreesClassifier(n_estimators=400)

def sgdr():
    """New SGDRegressor with default settings."""
    return SGDRegressor()

def xgbr_poly():
    """New pipeline: degree-2 polynomial features followed by `xgbr()`."""
    return Pipeline([("poly", PolynomialFeatures(degree=2)), ("xgbr", xgbr())])

def linreg_poly():
    """New pipeline: degree-2 polynomial features followed by LinearRegression."""
    return Pipeline([("poly", PolynomialFeatures(degree=2)), ("linreg", LinearRegression())])

def linreg():
    """New LinearRegression with default settings."""
    return LinearRegression()

def bayes_ridge():
    """New BayesianRidge with default settings."""
    return BayesianRidge()

def lasso():
    """New Lasso with default settings."""
    return Lasso()

def svrsig():
    """New SVR with a sigmoid kernel."""
    return SVR(kernel="sigmoid")

def svrrbf():
    """New SVR with an RBF kernel."""
    return SVR(kernel="rbf")

def perc():
    """New Perceptron with default settings."""
    return Perceptron()

def dream_team():
    """Return a fresh list of base estimators in a deterministic order.

    The list is sorted by each estimator's repr. A bare sorted() on estimator
    objects depends on interpreter-specific object ordering on Python 2 and
    raises TypeError on Python 3; sorting by repr avoids both.
    NOTE(review): the order presumably matters for the memo cache keys of the
    stacker functions - confirm against persistent_cache.
    """
    members = [
        xgbr(), rfr(), etr(), LinearRegression(),
        #xgbr_poly(), linreg_poly(),
        bayes_ridge(),
        lasso(),
        svrsig(),
        #svrrbf(),
        perc(),
    ]
    return sorted(members, key=repr)




In [13]:
info("=-=-=-=-=-=-=-=-=-=-=-=-=- mic test, disregard  =-=-=-=-=-=-=-=-=-=-=-=-=-")
benchmark(BayesianRidge(), "basic")

INFO:trainlo:=-=-=-=-=-=-=-=-=-=-=-=-=- mic test, disregard  =-=-=-=-=-=-=-=-=-=-=-=-=-
INFO:trainlo:fitting fold   1BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fold fitted    1BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fitting fold   2BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fold fitted    2BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fitting fold   3BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fold fitted    3BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
INFO:trainlo:fitting fold   4BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_interce

0.5241169466704168

In [96]:
%%bash
screen -d -m -S traininho python train.py

In [9]:
%%bash
screen -ls

There is a screen on:
	2424..ip-172-31-3-184	(01/17/2016 03:03:41 PM)	(Detached)
1 Socket in /var/run/screen/S-ubuntu.



In [75]:
%%bash
python train.py

Load the data using pandas
Eliminate missing values


Traceback (most recent call last):
  File "train.py", line 29, in <module>
    bos(LinearRegression(), dream_team())
  File "/home/ubuntu/nadbor/kaggle/prudential/train_utils.py", line 232, in benchmark_optimized_stacker
    preds = stacker_train_predictions(stacker, base_clfs)
  File "<decorator-gen-3>", line 2, in stacker_train_predictions
  File "/home/ubuntu/nadbor/kaggle/prudential/persistent_cache.py", line 103, in wrapper
    cache[key] = result = f(*args, **kwargs)
  File "/home/ubuntu/nadbor/kaggle/prudential/train_utils.py", line 120, in stacker_train_predictions
    stacked_X = np.hstack([X] + [train_predictions(clf).reshape(n, 1) for clf in base_clfs])
  File "<decorator-gen-1>", line 2, in train_predictions
  File "/home/ubuntu/nadbor/kaggle/prudential/persistent_cache.py", line 103, in wrapper
    cache[key] = result = f(*args, **kwargs)
  File "/home/ubuntu/nadbor/kaggle/prudential/train_utils.py", line 100, in train_predictions
    model.fit(X[train], y[train])
  File "

In [88]:
%%bash
cat best_ensemble.log

2016-01-24 22:37:44,913; INFO;  start train.py $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$
2016-01-24 22:37:44,915; INFO;  start stacker --------------------------
2016-01-24 22:37:44,916; INFO;  fitting fold   1BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
2016-01-24 22:37:45,803; INFO;  fold fitted    1BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
2016-01-24 22:37:45,825; INFO;  fitting fold   2BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
2016-01-24 22:37:46,730; INFO;  fold fitted    2BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
2016-01-24 22:37:46,747; INFO;  fitting fold   3BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=T
2016-01-24 22:37:47,668; INFO;  fold fitted    3BayesianRidge(alpha_1=1e

In [73]:
%%bash
free -m

             total       used       free     shared    buffers     cached
Mem:          7984       2395       5588          0        151        459
-/+ buffers/cache:       1784       6199
Swap:            0          0          0


In [95]:

def dream_team():
    """Return a fresh list of base estimators (both SVR kernels included) in a deterministic order.

    The list is sorted by each estimator's repr. A bare sorted() on estimator
    objects orders the two SVR instances by an interpreter-specific rule on
    Python 2 and raises TypeError on Python 3; sorting by repr avoids both.
    """
    members = [
        xgbr(), rfr(), etr(), LinearRegression(),
        #xgbr_poly(), linreg_poly(),
        bayes_ridge(),
        lasso(),
        svrsig(),
        svrrbf(),
        perc(),
    ]
    return sorted(members, key=repr)
dream_team()

[BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
        fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
        normalize=False, tol=0.001, verbose=False),
 ExtraTreesRegressor(bootstrap=False, compute_importances=None,
           criterion='mse', max_depth=None, max_features='auto',
           max_leaf_nodes=None, min_density=None, min_samples_leaf=1,
           min_samples_split=2, n_estimators=400, n_jobs=1, oob_score=False,
           random_state=None, verbose=0),
 Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute='auto', tol=0.0001,
    warm_start=False),
 LinearRegression(copy_X=True, fit_intercept=True, normalize=False),
 Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
       n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=False,
       verbose=0, warm_start=False),
 RandomForestRegressor(bootstrap=True, compute_importances=None,
    

In [None]:
2016-01-28 00:37:09,485; INFO;  optimized stacker 0.663023467567   
        LinearRegression(copy_X=True, fit_intercept=True, normalize=False), 
        [
            BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,         fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,         normalize=False, tol=0.001, verbose=False), 
            ExtraTreesRegressor(bootstrap=False, compute_importances=None,            criterion='mse', max_depth=None, max_features='auto',            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,            min_samples_split=2, n_estimators=400, n_jobs=1, oob_score=False,            random_state=None, verbose=0), 
            Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,     normalize=False, positive=False, precompute='auto', tol=0.0001,     warm_start=False), 
            LinearRegression(copy_X=True, fit_intercept=True, normalize=False), 
            Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,        n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=False,        verbose=0, warm_start=False), 
            RandomForestRegressor(bootstrap=True, compute_importances=None,             criterion='mse', max_depth=None, max_features='auto',             max_leaf_nodes=None, min_density=None, min_samples_leaf=1,             min_samples_split=2, n_estimators=400, n_jobs=1,             oob_score=False, random_state=None, verbose=0), 
            SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,    shrinking=True, tol=0.001, verbose=False), 
            SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,    kernel='rbf', max_iter=-1, probability=False, random_state=None,    shrinking=True, tol=0.001, verbose=False), 
            XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,         gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,         min_child_weight=80, missing=None, n_estimators=100, nthread=-1,         objective='reg:linear', reg_alpha=0, reg_lambda=1,         scale_pos_weight=1, seed=0, silent=1, subsample=0.85)
        ]

In [None]:
2016-01-28 00:38:12,708; INFO;  optimized stacker 0.663071898749   
        LinearRegression(copy_X=True, fit_intercept=True, normalize=False), 
        [
            ExtraTreesRegressor(bootstrap=False, compute_importances=None,            criterion='mse', max_depth=None, max_features='auto',            max_leaf_nodes=None, min_density=None, min_samples_leaf=1,            min_samples_split=2, n_estimators=400, n_jobs=1, oob_score=False,            random_state=None, verbose=0), 
            Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,     normalize=False, positive=False, precompute='auto', tol=0.0001,     warm_start=False), 
            LinearRegression(copy_X=True, fit_intercept=True, normalize=False), 
            Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,        n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=False,        verbose=0, warm_start=False), 
            RandomForestRegressor(bootstrap=True, compute_importances=None,             criterion='mse', max_depth=None, max_features='auto',             max_leaf_nodes=None, min_density=None, min_samples_leaf=1,             min_samples_split=2, n_estimators=400, n_jobs=1,             oob_score=False, random_state=None, verbose=0), 
            SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,    kernel='sigmoid', max_iter=-1, probability=False, random_state=None,    shrinking=True, tol=0.001, verbose=False), 
            SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.0,    kernel='rbf', max_iter=-1, probability=False, random_state=None,    shrinking=True, tol=0.001, verbose=False), 
            XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,         gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,         min_child_weight=80, missing=None, n_estimators=100, nthread=-1,         objective='reg:linear', reg_alpha=0, reg_lambda=1,         scale_pos_weight=1, seed=0, silent=1, subsample=0.85)]

In [None]:
-bayes ridge  0.663071898749
-extra trees  0.662692889992
-lasso        0.663020255307
-linear       0.662786333124
-perceptron   0.663053416635
-randomforest 0.662830924589
-svr sigmoid  0.663023467567
-svr rbf      0.66318712225
-xgb          0.656280329093

    

In [151]:
def fake_data():
    """Generate a random linear toy problem.

    Draws a random 5-element weight vector and a random (1000, 5) matrix X,
    rescales X.dot(weights) linearly onto [1, num_classes] and rounds it to
    get y. Returns (X, y, -X), with -X serving as the test matrix.
    """
    weights = np.random.random(5)
    X = np.random.random((1000, 5))
    raw = X.dot(weights)
    top = max(raw)
    bottom = min(raw)
    scale = (num_classes - 1) * (1/(top - bottom))
    y = np.round(scale * (raw - bottom) + 1)
    return X, y, -X

iks, igrek, testiks = fake_data()

[ 0.5093228   0.44824519  0.65131449  0.83269357  0.8128499 ]


In [152]:
mol = LinearRegression().fit(iks, igrek)

In [153]:
mol.coef_

array([ 1.3660634 ,  1.22204675,  1.80184071,  2.30295306,  2.22929648])

In [119]:
theta

array([ 0.68756706,  0.37507146,  0.19799844,  0.83495337,  0.23041633])

In [154]:
from persistent_cache import memo, PersistentDict as Perd
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer

def read_all_data():
    """Read train.csv and test.csv and return them stacked into one DataFrame.

    Train rows come first, then test rows; the original row indices are kept.
    Product_Info_2 is replaced by integer codes from pd.factorize, computed
    over the combined frame.
    """
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    # combine train and test (pd.concat, because DataFrame.append is deprecated
    # and no longer exists in pandas 2.0)
    all_data = pd.concat([train, test])

    # factorize categorical variables
    all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
    return all_data

def basic_extractor():
    """Baseline features: the combined data with missing values set to -1.

    Rows with Response > 0 are the training set and rows with Response < 1 are
    the test set. Returns (X, y, X_actual_test), where the matrices exclude the
    Id and Response columns and y is the training Response.
    """
    frame = read_all_data()

    print('Eliminate missing values')
    frame.fillna(-1, inplace=True)

    # Response must be an integer column before it is used to split the rows.
    frame['Response'] = frame['Response'].astype(int)

    train_rows = frame[frame['Response']>0].copy()
    test_rows = frame[frame['Response']<1].copy()

    non_features = ["Id", "Response"]
    X = np.array(train_rows.drop(non_features, axis=1))
    X_actual_test = np.array(test_rows.drop(non_features, axis=1))
    labels = np.array(train_rows.Response)
    return X, labels, X_actual_test


def fe1():
    """Engineered features, not one-hot encoded.

    Adds bmi_ins_age (BMI * Ins_Age), nan_count (missing feature values per
    row) and sum_keywords (row sum of the Medical_K* columns), drops
    Medical_History_24 and Medical_History_10, and fills remaining gaps with -1.
    Returns (X, y, X_actual_test): rows with Response > 0 are train, rows with
    Response < 1 are test, and the matrices exclude Id and Response.
    """
    all_data = read_all_data()

    # FEATURE ENGINEERING
    all_data['bmi_ins_age'] = all_data.BMI * all_data.Ins_Age
    # Count missing values over the features only. Response is missing for the
    # rows that end up in the test split (they are selected below by their -1
    # fill value), so counting it would inflate nan_count for test rows only.
    all_data['nan_count'] = all_data.drop(['Id', 'Response'], axis=1).isnull().sum(axis=1)
    #all_data['emp_inf_4_sq'] = all_data.Employment_Info_4 ** 2
    #all_data['fam_hist_4_sq'] = all_data.Family_Hist_4 ** 2
    #all_data['fam_hist_2_sq'] = all_data.Family_Hist_2 ** 2

    # Use all_data here: the local `train` is only assigned further down, so
    # reading it at this point raised UnboundLocalError.
    mk = [col for col in all_data.columns if col.startswith("Medical_K")]
    all_data['sum_keywords'] = all_data[mk].sum(axis=1)

    # drop() needs axis=1 to remove columns, and it returns a new frame.
    all_data = all_data.drop(['Medical_History_24', 'Medical_History_10'], axis=1)

    # Use -1 for any others
    all_data.fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()

    X = np.array(train.drop(["Id", "Response"], axis=1))
    X_actual_test = np.array(test.drop(["Id", "Response"], axis=1))
    y = np.array(train.Response)
    return X, y, X_actual_test


# Registry mapping a feature-set name to the zero-argument function that builds it.
# train_test_sets() looks extractors up here by name.
extractors = {
    'basic': basic_extractor,
    'fake_data': fake_data,
    'feats1': fe1
}

@memo(Perd(MEMO_PATH + "feature_extraction"))
def train_test_sets(extractor_name):
    """Run the extractor registered under `extractor_name` in `extractors` and return its result.

    An unknown name raises KeyError.
    """
    build_features = extractors[extractor_name]
    return build_features()

In [45]:
from persistent_cache import memo, PersistentDict as Perd
import pandas as pd
import numpy as np
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer
MEMO_PATH = "memo_fe/"

def oh_med():
    """Median-imputed, one-hot encoded feature matrices.

    Missing values in train.csv and test.csv are filled with the per-column
    median of the combined data. The columns listed in `categorical` are
    label-encoded and then one-hot encoded. Features are every test.csv column
    after the first. Returns (X, X_actual_test) as dense matrices.
    """
    # Imported locally because this cell's import list does not include the encoders.
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    categorical = {'Product_Info_1', 'Product_Info_2', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 
                   'Product_Info_7', 'Employment_Info_2', 'Employment_Info_3', 'Employment_Info_5', 
                   'InsuredInfo_1', 'InsuredInfo_2', 'InsuredInfo_3', 'InsuredInfo_4', 'InsuredInfo_5', 
                   'InsuredInfo_6', 'InsuredInfo_7', 'Insurance_History_1', 'Insurance_History_2', 
                   'Insurance_History_3', 'Insurance_History_4', 'Insurance_History_7', 'Insurance_History_8', 
                   'Insurance_History_9', 'Family_Hist_1', 'Medical_History_2', 'Medical_History_3', 
                   'Medical_History_4', 'Medical_History_5', 'Medical_History_6', 'Medical_History_7', 
                   'Medical_History_8', 'Medical_History_9', 'Medical_History_10', 'Medical_History_11', 
                   'Medical_History_12', 'Medical_History_13', 'Medical_History_14', 'Medical_History_16', 
                   'Medical_History_17', 'Medical_History_18', 'Medical_History_19', 'Medical_History_20', 
                   'Medical_History_21', 'Medical_History_22', 'Medical_History_23', 'Medical_History_25', 
                   'Medical_History_26', 'Medical_History_27', 'Medical_History_28', 'Medical_History_29', 
                   'Medical_History_30', 'Medical_History_31', 'Medical_History_33', 'Medical_History_34', 
                   'Medical_History_35', 'Medical_History_36', 'Medical_History_37', 
    #                'Medical_History_38', 
                   'Medical_History_39', 'Medical_History_40', 'Medical_History_41','Medical_History_1', 
                   'Medical_History_15', 'Medical_History_24', 'Medical_History_32'}

    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")
    total = pd.concat([train, test])
    # numeric_only: the median is only defined for the numeric columns.
    median = total.median(numeric_only=True)
    train = train.fillna(median)
    # fillna(..., inplace=True) returns None, so the old
    # `test = test.fillna(median, inplace=True)` left `test` as None.
    test = test.fillna(median)
    # Fill the combined frame too, so the label encoder is fitted on exactly the
    # values (imputed medians included) that it is later asked to transform.
    total = total.fillna(median)
    encoder = LabelEncoder()
    for f in categorical:
        encoder.fit(total[f])
        train[f] = encoder.transform(train[f])
        test[f] = encoder.transform(test[f])

    # NOTE(review): skips the first test.csv column, presumably Id - confirm.
    feature_cols = test.columns[1:]
    categorical_inds = [i for i, col in enumerate(feature_cols) if col in categorical]
    # NOTE(review): `categorical_features` is the legacy scikit-learn API; keep
    # this in step with the scikit-learn version the project pins.
    oh_encoder = OneHotEncoder(categorical_features=categorical_inds)

    X = np.array(train[feature_cols])
    X_actual_test = np.array(test[feature_cols])

    oh_encoder.fit(X)
    X = oh_encoder.transform(X).todense()
    X_actual_test = oh_encoder.transform(X_actual_test).todense()
    return X, X_actual_test

def read_all_data():
    """Read train.csv and test.csv and return them stacked into one DataFrame.

    Train rows come first, then test rows; the original row indices are kept.
    Product_Info_2 is replaced by integer codes from pd.factorize, computed
    over the combined frame.
    """
    train = pd.read_csv("train.csv")
    test = pd.read_csv("test.csv")

    # combine train and test (pd.concat, because DataFrame.append is deprecated
    # and no longer exists in pandas 2.0)
    all_data = pd.concat([train, test])

    # factorize categorical variables
    all_data['Product_Info_2'] = pd.factorize(all_data['Product_Info_2'])[0]
    return all_data

def basic_extractor():
    """Baseline features: the combined data with missing values set to -1.

    Rows with Response > 0 are the training set and rows with Response < 1 are
    the test set. Returns (X, X_actual_test); both matrices exclude the Id and
    Response columns.
    """
    all_data = read_all_data()

    print('Eliminate missing values')
    # Use -1 for any others
    all_data.fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()

    # The labels are not part of the return value, so the unused `y` array that
    # used to be built here has been removed.
    X = np.array(train.drop(["Id", "Response"], axis=1))
    X_actual_test = np.array(test.drop(["Id", "Response"], axis=1))
    return X, X_actual_test


def fe1():
    """Engineered features, not one-hot encoded.

    Adds bmi_ins_age (BMI * Ins_Age), nan_count (missing feature values per
    row) and sum_keywords (row sum of the Medical_K* columns), drops
    Medical_History_24 and Medical_History_10, and fills remaining gaps with -1.
    Returns (X, X_actual_test): rows with Response > 0 are train, rows with
    Response < 1 are test, and the matrices exclude Id and Response.
    """
    all_data = read_all_data()

    # FEATURE ENGINEERING
    all_data['bmi_ins_age'] = all_data.BMI * all_data.Ins_Age
    # Count missing values over the features only. Response is missing for the
    # rows that end up in the test split (they are selected below by their -1
    # fill value), so counting it would inflate nan_count for test rows only.
    all_data['nan_count'] = all_data.drop(['Id', 'Response'], axis=1).isnull().sum(axis=1)
    #all_data['emp_inf_4_sq'] = all_data.Employment_Info_4 ** 2
    #all_data['fam_hist_4_sq'] = all_data.Family_Hist_4 ** 2
    #all_data['fam_hist_2_sq'] = all_data.Family_Hist_2 ** 2

    # Use all_data here: the local `train` is only assigned further down, so
    # reading it at this point raised UnboundLocalError.
    mk = [col for col in all_data.columns if col.startswith("Medical_K")]
    all_data['sum_keywords'] = all_data[mk].sum(axis=1)

    # drop() needs axis=1 to remove columns, and it returns a new frame.
    all_data = all_data.drop(['Medical_History_24', 'Medical_History_10'], axis=1)

    # Use -1 for any others
    all_data.fillna(-1, inplace=True)

    # fix the dtype on the label column
    all_data['Response'] = all_data['Response'].astype(int)

    # split train and test
    train = all_data[all_data['Response']>0].copy()
    test = all_data[all_data['Response']<1].copy()

    # The labels are not part of the return value, so no `y` is built here.
    X = np.array(train.drop(["Id", "Response"], axis=1))
    X_actual_test = np.array(test.drop(["Id", "Response"], axis=1))
    return X, X_actual_test


def kmeans_feats(X, clusters=10):
    """Normalise the rows of `X`, fit a MiniBatchKMeans with `clusters` clusters
    (random_state=0) and return its fit_transform output."""
    normalised = Normalizer().transform(X)
    model = MiniBatchKMeans(n_clusters=clusters, random_state=0)
    cluster_space = model.fit_transform(normalised)
    return cluster_space
    
def kmeans_10():
    """The "ohmed" features with the default (10-cluster) `kmeans_feats` columns appended.

    K-means is fitted on train and test rows together. Returns (train, test) matrices.
    """
    train_part, test_part = train_test_sets("ohmed")
    n_train = train_part.shape[0]
    everything = np.vstack([train_part, test_part])
    augmented = np.hstack([everything, kmeans_feats(everything)])
    return augmented[:n_train, :], augmented[n_train:, :]
    
def kmeans_20():
    """The "ohmed" features with 20-cluster `kmeans_feats` columns appended.

    K-means is fitted on train and test rows together. Returns (train, test) matrices.
    """
    train_part, test_part = train_test_sets("ohmed")
    n_train = train_part.shape[0]
    everything = np.vstack([train_part, test_part])
    augmented = np.hstack([everything, kmeans_feats(everything, clusters=20)])
    return augmented[:n_train, :], augmented[n_train:, :]

def combine(fextractors):
    """Join several feature sets column-wise.

    `fextractors` is a list of extractor names. Each is loaded through
    `train_test_sets`; the train matrices are joined with np.hstack, and so
    are the test matrices. Returns (train, test).
    """
    pairs = [train_test_sets(name) for name in fextractors]
    train_parts = [train_part for train_part, _ in pairs]
    test_parts = [test_part for _, test_part in pairs]
    return np.hstack(train_parts), np.hstack(test_parts)
    
def oh_km10():
    """Feature set made of the "ohmed" and "kmeans10" sets joined by `combine`."""
    parts = ["ohmed", "kmeans10"]
    return combine(parts)
    
def oh_km20():
    """Feature set made of the "ohmed" and "kmeans20" sets joined by `combine`."""
    parts = ["ohmed", "kmeans20"]
    return combine(parts)

def fe1_km10():
    """Feature set made of the "feats1" and "kmeans10" sets joined by `combine`."""
    parts = ["feats1", "kmeans10"]
    return combine(parts)

def fe1_km20():
    """Feature set made of the "feats1" and "kmeans20" sets joined by `combine`."""
    parts = ["feats1", "kmeans20"]
    return combine(parts)


# Registry mapping a feature-set name to the zero-argument function that builds it.
# train_test_sets() looks extractors up here by name; the kmeans* and *_km*
# extractors themselves call train_test_sets() for the sets they build on.
extractors = {
    'basic': basic_extractor,
    'feats1': fe1,
    'ohmed': oh_med,
    'kmeans10': kmeans_10,
    'kmeans20': kmeans_20,
    'oh_km10': oh_km10,
    'oh_km20': oh_km20,
    'fe1_km10': fe1_km10,
    'fe1_km20': fe1_km20
}

@memo(Perd(MEMO_PATH + "feature_extraction"))
def train_test_sets(extractor_name):
    """Build the feature set registered under `extractor_name`.

    `extractor_name` must be a key of the module-level `extractors` dict
    (for example 'basic' or 'feats1'); an unknown name raises KeyError.
    Returns whatever the extractor returns; the extractors in this cell
    each return an (X_train, X_test) pair.
    """
    build_features = extractors[extractor_name]
    return build_features()

In [20]:
X, X_test = train_test_sets("ohmed")

In [27]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.preprocessing import Normalizer

In [6]:
from feature_engineering import train_test_sets, oh_km10, oh_km20, kmeans_10

In [8]:
x, xx = train_test_sets('ohmed')    #train_test_sets("oh_km20")

MemoryError: 

In [38]:
clusters_maybe = clusterer.fit_transform(tiny_X)

In [44]:
np.vstack([X, X_test]).shape

(79146, 1726)

In [16]:
%%bash
git status

On branch master
Your branch is up-to-date with 'origin/master'.

Changes to be committed:
  (use "git reset HEAD <file>..." to unstage)

	modified:   train_utils.py
	modified:   training.ipynb

Changes not staged for commit:
  (use "git add/rm <file>..." to update what will be committed)
  (use "git checkout -- <file>..." to discard changes in working directory)

	modified:   train.py
	deleted:    train1.py

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	.ipynb_checkpoints/
	cache/
	cachetest/
	feature_engineering.py
	feature_engineering.pyc
	memo/
	memo_fe/
	memo_fe_lazy_stacker_train_predictions
	memo_fe_stacker_test_predictions
	memo_fe_stacker_train_predictions
	memo_fe_test_predictions
	memo_fe_train_predictions
	persistent_cache.pyc
	train_utils-Copy1.py
	train_utils.pyc



In [17]:
git commit -m "asfs"

SyntaxError: invalid syntax (<ipython-input-17-6ba9a3609c49>, line 1)