In [3]:
%run preamble.ipynb


## The preamble handles a few imports.
## It also loads common functions and dicts: 

## Bosch challenge specific:
* **getMCC**(tp,tn,fp,fn)

## ML/Analytics functions:
* **compare_train_test**(clf, ds_train, label_train, ds_test, label_test, mva='MVA', bins=50, use_vote=None, log=False)
* **plot_classifier_output**( pred_train, pred_test, y_train, y_test, multipagepdf=None, bins = None, normalised = True )
* **plot_correlations**(data,label='', \*\*kwds)
* **optimisePars**(mva, points, data , classes, fraction=0.7, score = 'log_loss', cvs=5)

---

## Various
* **showUniques**(df)
* **ensure_dir**(directory)
* **printBumper**(text, c='=', n=-1)
* **intersec**(d1, d2)
* **union**(d1, d2)

---

## Color dictionaries:
* **Tableau10**
* **Tableau10_Light**
* **Tableau10_Medium**
* **Tableau_20**
* **ColorBlind10**


# Strategy

## Group data into stations
This way we get logical packages of data without too many NaNs.

## Unsupervised dimensionality reduction using PCA
This should result in better and, most importantly, fewer variables.
Then choose the best six with an ANOVA filter.

## Supervised predictions using a BDT 
Use a BDT (boosted decision tree) on 6 (or fewer) variables.


In [6]:
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [4]:
# Load the numeric training table from CSV. Its "Id" and "Response" columns are
# moved into the row index by a later cell.
train = pd.read_csv("files/train_numeric.csv")

In [5]:
def addLineStation(ds):
    """Build a (Line, Station, Feature) column MultiIndex from flat names.

    Every name obtained by iterating ``ds`` is split on "_". The first token
    becomes the "Line" level, the first two tokens joined by "_" become the
    "Station" level, and the unmodified name becomes the "Feature" level.

    Parameters
    ----------
    ds : iterable of str
        Anything whose iteration yields the column names, e.g. a DataFrame
        (iterating it yields its column labels) or a plain list of names.

    Returns
    -------
    pandas.MultiIndex
        Three levels named "Line", "Station" and "Feature", one entry per name.

    Raises
    ------
    IndexError
        If a name contains no "_" (there is no second token to build the
        station label from).
    """
    features = list(ds)
    tokens = [name.split("_") for name in features]
    lines = [parts[0] for parts in tokens]
    stations = [parts[0] + "_" + parts[1] for parts in tokens]
    return pd.MultiIndex.from_arrays(
        [lines, stations, features],
        names=["Line", "Station", "Feature"],
    )

In [8]:
# A little bit of index magic to be able to group by station:
# the bookkeeping columns go into the row index so that only feature columns
# remain, and the flat column names are replaced by a
# (Line, Station, Feature) MultiIndex.
# Run this cell once per kernel: afterwards "Id" and "Response" are no longer
# columns, so a second set_index on them fails.
traindrop = ["Id", "Response"]
train = train.set_index(traindrop)
train.columns = addLineStation(train.columns)

# Column-wise grouping: one group per station.
gb = train.groupby(axis=1, level="Station")

In [12]:
def trainMVA(grps):
    mvas = {}
    for i,g in enumerate(grps.groups):
        s=time.time()
        if i%10==0 and i>0:
            print("pickle intermediate results at i=",i)
            joblib.dump(mvas,"files/mvas_per_station{}.pkl".format(i))
            mvas={}
        # Dataset magic
        d = gb.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()
    
        n_features = len(d.columns)-2
        
        n_pca = 10 if n_features>10 else n_features
        n_annova = 6 if n_pca>6 else n_pca
        n_max_feat_bdt = 4 if n_annova==6 else n_annova
    
        # filter
        # feature selection
        anova_filter = SelectKBest(f_classif, k=n_annova)
    
        # classifier
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_features=n_max_feat_bdt
                                                        , max_depth=3)
                        , n_estimators = 600
                        , learning_rate = 0.01)
        # pipeline
        clf = Pipeline([("fillVals", Imputer(strategy="mean",verbose=1))
                ,("dimReduction", PCA(n_components=n_pca))
                ,('feature_selection', anova_filter)
                ,('mva', bdt)])
    
        clf.fit(d.drop(traindrop,axis=1), d.Response)
        print("classifier pipeline fit for goup: "
              , g, "(",i+1,"/",len(gb.groups),")")
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        mvas[g] = clf
        del d
    return 

In [None]:
# do all the heavy lifting!
%time trainMVA(gb)

classifier pipeline fit for goup:  L3_S32 ( 1 / 50 )
done in 0.21  minutes.


In [None]:
# save the hard work!
# Expects a notebook-level `mvas` holding the fitted pipelines and writes it
# with joblib using compression level 3.
joblib.dump(mvas, "files/mvas_per_station.pkl",compress=3)

In [None]:
# Reload the pipelines from the file written by the save cell above.
# joblib.load unpickles the file, so only load files you created or trust.
mvas=joblib.load("files/mvas_per_station.pkl")