In [2]:
%run preamble.ipynb


## The preamble handles a few imports.
## It also loads common functions and dicts: 

## Bosch challenge specific:
* **getMCC**(tp,tn,fp,fn)

## ML/Analytics functions:
* **compare_train_test**(clf, ds_train, label_train, ds_test, label_test, mva='MVA', bins=50, use_vote=None, log=False)
* **plot_classifier_output**( pred_train, pred_test, y_train, y_test, multipagepdf=None, bins = None, normalised = True )
* **plot_correlations**(data,label='', \*\*kwds)
* **optimisePars**(mva, points, data , classes, fraction=0.7, score = 'log_loss', cvs=5)

---

## Various
* **showUniques**(df)
* **ensure_dir**(directory)
* **printBumper**(text, c='=', n=-1)
* **intersec**(d1, d2)
* **union**(d1, d2)

---

## Color dictionaries:
* **Tableau10**
* **Tableau10_Light**
* **Tableau10_Medium**
* **Tableau_20**
* **ColorBlind10**


# Strategy

## Group data into stations
In this way we have logical packages of data w/o too many NaNs.

## Unsupervised dimensionalty reduction using PCA
This should result in better and most importantly less variables.
Then choose best six with an ANOVA filter.

## Supervised predictions using a BDT 
Use BDT on 6 (or less) variables.


In [3]:
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [4]:
train = pd.read_csv("files/train_numeric.csv")

In [5]:
def addLineStation(ds):
    l0=[]
    l1=[]
    l2=[]
    for c in ds:
        s = c.split(sep="_")
        l0.append(s[0])
        l1.append(s[0]+"_"+s[1])
        l2.append(c)
    return pd.MultiIndex.from_arrays([l0,l1,l2],names=["Line","Station","Feature"])

In [6]:
# a little bit of index magic to be able to group by station
train = train.set_index(["Id","Response"])
traindrop = ["Id","Response"]
train.columns = addLineStation(train)

gb = train.groupby(level="Station",axis=1)

In [73]:
def trainMVA(grps, processed=[], save=False):
    mvas = {}
    for i,g in enumerate(grps.groups):
        s=time.time()
        if g in processed:
            print(g, " already trained. Continue.")
            continue
        if i%10==0 and i>0 and save:
            print("pickle intermediate results at i=",i)
            joblib.dump(mvas,"files/mvas_per_station{}.pkl".format(i)
                        , compress = 3)
            mvas={}
        # Dataset magic
        d = gb.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()
    
        n_features = len(d.columns)-2
        
        n_pca = 10 if n_features>10 else n_features
        n_annova = 6 if n_pca>6 else n_pca
        n_max_feat_bdt = 4 if n_annova==6 else n_annova
    
        # filter
        # feature selection
        anova_filter = SelectKBest(f_classif, k=n_annova)
    
        # classifier
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_features=n_max_feat_bdt
                                                        , max_depth=3)
                        , n_estimators = 600
                        , learning_rate = 0.01)
        # pipeline
        print("classifier pipeline fit for goup: "
              , g, "(",i+1,"/",len(gb.groups),")")
        clf = Pipeline([("fillVals", Imputer(strategy="mean",verbose=1))
                ,("dimReduction", PCA(n_components=n_pca))
                ,('feature_selection', anova_filter)
                ,('mva', bdt)])
    
        clf.fit(d.drop(traindrop,axis=1), d.Response)
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        mvas[g] = clf
        del d
    joblib.dump(mvas,"files/mvas_per_station{}.pkl".format(len(grps.groups)+1)
                , compress = 3)        
    return 

In [74]:
# do all the heavy lifting!
%time trainMVA(gb,trained)

L0_S23  already trained. Continue.
L3_S50  already trained. Continue.
L3_S41  already trained. Continue.
L3_S35  already trained. Continue.
L0_S11  already trained. Continue.
L2_S27  already trained. Continue.
L0_S9  already trained. Continue.
L3_S45  already trained. Continue.
L0_S8  already trained. Continue.
L3_S33  already trained. Continue.
L3_S37  already trained. Continue.
L3_S43  already trained. Continue.
L0_S16  already trained. Continue.
L3_S39  already trained. Continue.
L0_S14  already trained. Continue.
L0_S15  already trained. Continue.
L3_S47  already trained. Continue.
L0_S17  already trained. Continue.
L3_S32  already trained. Continue.
L0_S18  already trained. Continue.
L0_S0  already trained. Continue.
L3_S48  already trained. Continue.
L0_S22  already trained. Continue.
L0_S6  already trained. Continue.
L0_S5  already trained. Continue.
L3_S30  already trained. Continue.
L0_S19  already trained. Continue.
L2_S28  already trained. Continue.
L3_S49  already trained. 

In [71]:
trained = joblib.load("files/mvas_per_station.pkl")
trained.update(joblib.load("files/mvas_per_station10.pkl"))
trained.update(joblib.load("files/mvas_per_station20.pkl"))
trained.update(joblib.load("files/mvas_per_station30.pkl"))
trained.update(joblib.load("files/mvas_per_station40.pkl"))
trained.update(joblib.load("files/mvas_per_station50.pkl"))
trained.update(joblib.load("files/mvas_per_station51.pkl"))

# run again to get all (small coding bug)

In [24]:
trainMVA(gb, list(trained.keys()))

L0_S23  already trained. Continue.
L3_S50  already trained. Continue.
L3_S41  already trained. Continue.
L3_S35  already trained. Continue.
L0_S11  already trained. Continue.
L2_S27  already trained. Continue.
L0_S9  already trained. Continue.
L3_S45  already trained. Continue.
L0_S8  already trained. Continue.
L3_S33  already trained. Continue.
L3_S37  already trained. Continue.
L3_S43  already trained. Continue.
L0_S16  already trained. Continue.
L3_S39  already trained. Continue.
L0_S14  already trained. Continue.
L0_S15  already trained. Continue.
L3_S47  already trained. Continue.
L0_S17  already trained. Continue.
L3_S32  already trained. Continue.
L0_S18  already trained. Continue.
L0_S0  already trained. Continue.
L3_S48  already trained. Continue.
L0_S22  already trained. Continue.
L0_S6  already trained. Continue.
L0_S5  already trained. Continue.
L3_S30  already trained. Continue.
L0_S19  already trained. Continue.
L2_S28  already trained. Continue.
L3_S49  already trained. 

In [75]:
def predictStations(grps, mvas, results, processed=[], save=False):
    for i,g in enumerate(grps.groups):
        s=time.time()
        if g in processed:
            print(g, " already predicted. Continue.")
            continue
        # Dataset magic
        d = gb.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()

        # pipeline
        print("classifier pipeline prediction for goup: "
              , g, "(",i+1,"/",len(gb.groups),")")
        clf = mvas[g]
    
        pred_func = pd.DataFrame({"Id" : d.Id
                            ,"pred_"+g :clf.decision_function(d.drop(traindrop,axis=1))})
        results = results.merge(pred_func, on="Id", how="left")
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        del d
    return results

In [76]:
result = pd.DataFrame({"Id": train.reset_index()["Id"],
                       "Response": train.reset_index()["Response"]})

In [64]:
pre_functs = predictStations(gb, trained, result)

classifier pipeline prediction for goup:  L0_S23 ( 1 / 50 )
done in 0.19  minutes.
classifier pipeline prediction for goup:  L3_S50 ( 2 / 50 )
done in 0.07  minutes.
classifier pipeline prediction for goup:  L3_S41 ( 3 / 50 )
done in 0.14  minutes.
classifier pipeline prediction for goup:  L3_S35 ( 4 / 50 )
done in 1.31  minutes.
classifier pipeline prediction for goup:  L0_S11 ( 5 / 50 )
done in 0.53  minutes.
classifier pipeline prediction for goup:  L2_S27 ( 6 / 50 )
done in 0.29  minutes.
classifier pipeline prediction for goup:  L0_S9 ( 7 / 50 )
done in 0.53  minutes.
classifier pipeline prediction for goup:  L3_S45 ( 8 / 50 )
done in 0.13  minutes.
classifier pipeline prediction for goup:  L0_S8 ( 9 / 50 )
done in 1.58  minutes.
classifier pipeline prediction for goup:  L3_S33 ( 10 / 50 )
done in 2.76  minutes.
classifier pipeline prediction for goup:  L3_S37 ( 11 / 50 )
done in 2.57  minutes.
classifier pipeline prediction for goup:  L3_S43 ( 12 / 50 )
done in 0.07  minutes.
cla

KeyError: 'L3_S31'

In [63]:
pre_functs[~pre_functs.pred_L0_S23.isnull()].head()

Unnamed: 0,Id,pred_L0_S23,pred_L3_S50
16,41,-1.004551,
18,47,-1.067508,
21,55,-0.972351,
45,97,-1.30668,
60,124,-1.062109,


In [None]:
mvas=joblib.load("files/mvas_per_station.pkl")