In [1]:
import pandas as pd
import sklearn

from matplotlib import pyplot as plt

%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score

In [3]:
# Load the labelled features/targets and the unlabelled X_test.csv frame
# (kept as `x_out`); every file is indexed by its `id` column.
x_all = pd.read_csv('X_train.csv', index_col='id')
# squeeze=True makes read_csv return a Series when only one data column remains.
y_all = pd.read_csv('y_train.csv', index_col='id', squeeze=True)
x_out = pd.read_csv('X_test.csv', index_col='id')

# Hold out part of the labelled data for local validation. The fixed seed keeps
# the split -- and every score computed from it -- identical across kernel
# restarts, so results further down the notebook are comparable between runs.
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, random_state=0)

---

### Add value-count features and one-hot encoding

In [166]:
def vc_map(x):
    """Replace every element of ``x`` with the number of times it occurs in ``x``.

    The counts come from ``x.value_counts()`` and are looked up per element
    with ``x.map``, so the result has one entry per entry of ``x``.
    """
    return x.map(x.value_counts())

In [67]:
from sklearn.base import TransformerMixin

class AddVc(TransformerMixin):
    """Transformer that replaces each value with its count in the fitted data.

    ``fit`` records how often every value occurs; ``transform`` maps values to
    those counts, and values that were not present at fit time map to 0.
    """

    def __init__(self):
        # value -> occurrence count; populated by fit()
        self.vc_map = {}

    def fit(self, x, y=None):
        """Store the value counts of ``x`` (``y`` is ignored) and return ``self``."""
        counts = x.value_counts()
        self.vc_map = counts.to_dict()
        return self

    def transform(self, x):
        """Return ``x`` with each value mapped to its fitted count (0 if unseen)."""
        # Start from the fitted counts, then give every value of `x` that has
        # no fitted count an explicit 0 so that map() never leaves a gap.
        lookup = dict(self.vc_map)
        for val in x.unique():
            lookup.setdefault(val, 0)
        return x.map(lookup)

In [232]:
class NonFreqStrip(TransformerMixin):
    """Collapse the infrequent values of a column into a single sentinel.

    ``fit`` orders the rows of ``x`` by how often their value occurs
    (ascending) and remembers the values held by the rows from position
    ``int(valid_alpha * len(x))`` onward in that ordering. ``transform``
    overwrites every value that is not in the remembered set with
    ``strip_val``.

    Parameters
    ----------
    valid_alpha : float-like
        Fraction of rows, taken from the low-count end of the ordering, whose
        position is skipped when collecting the values to keep. Converted
        with ``float()``.
    strip_val : optional
        Sentinel written in place of every non-kept value. Defaults to
        ``10 ** 6``; pass another value if that could collide with a real
        value of the column.
    """

    def __init__(self, valid_alpha, strip_val=10 ** 6):
        self.valid_alpha = float(valid_alpha)
        self.strip_val = strip_val

    def fit(self, x, y=None):
        """Collect the set of values to keep from ``x``; ``y`` is ignored."""
        # Per-row occurrence count of that row's value, sorted ascending.
        vc = vc_map(x).sort_values()
        start_ind = int(self.valid_alpha * len(x))
        good_ind = vc.index[start_ind:]
        # good_ind holds index *labels* of x, so select with .loc explicitly.
        self.freqs = set(x.loc[good_ind])
        return self

    def transform(self, x):
        """Return a copy of ``x`` with every non-kept value set to ``strip_val``."""
        xx = x.copy()
        xx[~xx.isin(self.freqs)] = self.strip_val
        return xx

In [49]:
from sklearn.pipeline import Parallel, Pipeline, FeatureUnion
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import OneHotEncoder

In [320]:
def make_trans(x_train, alpha=0.5):
    """Build the feature-engineering step placed in front of the model.

    Two feature sets are built for every column of ``x_train`` and joined
    side by side with a ``FeatureUnion``:

    * ``vcounts``  -- each value replaced by its fitted count (``AddVc``);
    * ``hot_pipe`` -- infrequent values collapsed by ``NonFreqStrip`` and the
      result one-hot encoded.

    Parameters
    ----------
    x_train : DataFrame
        Only iterated for its column labels; nothing is fitted here.
    alpha : float, optional
        ``valid_alpha`` handed to every ``NonFreqStrip``. Defaults to 0.5.

    Returns
    -------
    tuple
        ``('final_trans', FeatureUnion)`` -- a (name, transformer) pair that
        can be used directly as a ``Pipeline`` step.
    """
    columns = list(x_train)

    ffs = [(col, AddVc()) for col in columns]
    vcounts = ('vcounts', DataFrameMapper(ffs, input_df=True))

    strip_ffs = [(col, NonFreqStrip(alpha)) for col in columns]
    non_freq_strip = ('strips', DataFrameMapper(strip_ffs, input_df=True))

    # handle_unknown='ignore': a code the encoder did not see during fit (for
    # example the strip sentinel showing up in a column only at transform
    # time) produces all-zero indicator columns instead of raising.
    one_hot = ('one_hot', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    one_hot_pipe = ('hot_pipe', Pipeline([non_freq_strip, one_hot]))

    final_trans = FeatureUnion([vcounts, one_hot_pipe])

    return 'final_trans', final_trans

In [321]:
# Build the ('final_trans', FeatureUnion) step; make_trans only reads the
# column labels of x_train, nothing is fitted yet.
trans = make_trans(x_train)

In [322]:
# Sanity check: fit the transformer (trans[1]; trans[0] is the step name) on
# the training frame and display the shape of the resulting feature matrix.
trans[1].fit_transform(x_train).shape

(14745L, 1138L)

# Fit the model and evaluate prediction quality

In [323]:
from sklearn.ensemble import RandomForestClassifier as ForestClf
from sklearn.ensemble import RandomForestRegressor as ForestReg

from sklearn.metrics.scorer import make_scorer, accuracy_score
from sklearn.metrics import roc_auc_score


def scorer(est, x=None, y=None):
    """Return the ROC AUC of ``est`` on ``(x, y)``.

    Parameters
    ----------
    est : fitted estimator
        If it exposes ``predict_proba``, column 1 of its output is scored;
        otherwise the output of ``predict`` is scored.
    x, y : optional
        Features and true labels. When omitted, the notebook's current
        ``x_test`` / ``y_test`` are used.

    Returns
    -------
    The value of ``roc_auc_score(y, scores)``.
    """
    # Resolve the defaults at call time. Binding x_test/y_test in the signature
    # would freeze whatever objects existed when this cell ran, so re-running
    # the split cell would silently leave the scorer on the old hold-out set.
    if x is None:
        x = x_test
    if y is None:
        y = y_test

    if getattr(est, 'predict_proba', None):
        yy = est.predict_proba(x)[:, 1]
    else:
        yy = est.predict(x)
    return roc_auc_score(y, yy)

In [324]:
# Regressor variant of the forest. NOTE: `forest` is rebound to a classifier
# in the next code cell, so when the cells run top to bottom this object is
# not the one that ends up inside `reg`.
forest = ForestReg(
    n_estimators=200,
    min_samples_leaf=2,
    min_samples_split=8,
    max_features='sqrt',
    n_jobs=-1,
)

In [327]:
# Classifier used as the final model step. random_state pins the bootstrap
# samples and feature sub-sampling so repeated fits give the same forest.
forest = ForestClf(
    n_estimators=1000,
    min_samples_leaf=2,
    min_samples_split=8,
    max_features=0.4,
    n_jobs=4,
    random_state=0,
)

# Chain the feature-engineering step (`trans` is already a (name, transformer)
# pair) with the forest. `reg` is bound once, directly to the Pipeline.
reg = Pipeline([trans, ('forest', forest)])

In [328]:
# Fit the whole pipeline (feature transforms + forest) on the training split.
# NOTE: the output recorded below shows this run ended with KeyboardInterrupt.
reg.fit(x_train, y_train)

KeyboardInterrupt: 

In [None]:
# Accuracy of the fitted pipeline's predictions on the hold-out split.
# accuracy_score needs (y_true, y_pred); calling it with no arguments raises TypeError.
accuracy_score(y_test, reg.predict(x_test))

In [305]:
# ROC AUC of the pipeline on the hold-out split (scorer's default x / y).
scorer(reg)

0.80576751745359898

In [663]:
# Column 1 of predict_proba for two feature sets.
# NOTE(review): `xx` and `xxx` are not defined in any visible cell; judging by
# the scoring cells that follow they presumably align with y_train and y_test
# respectively -- confirm before relying on a fresh-kernel run.
yy = reg.predict_proba(xx)[:, 1]
yyy = reg.predict_proba(xxx)[:, 1]

In [664]:
# ROC AUC of the `yy` scores against the training labels.
roc_auc_score(y_train, yy)

0.99532162526825374

In [665]:
# ROC AUC of the `yyy` scores against the hold-out labels.
roc_auc_score(y_test, yyy)

0.83701511499257497

In [574]:
# Outputs of reg.predict (rather than predict_proba); this overwrites `yy` / `yyy`.
# NOTE(review): `xx` / `xxx` are not defined in any visible cell -- confirm
# where they come from.
yy = reg.predict(xx)
yyy = reg.predict(xxx)