In [1]:
%run preamble.ipynb


## The preamble handles a few imports.
## It also loads common functions and dicts: 

## Bosch challenge specific:
* **getMCC**(tp,tn,fp,fn)

## ML/Analytics functions:
* **compare_train_test**(clf, ds_train, label_train, ds_test, label_test, mva='MVA', bins=50, use_vote=None, log=False)
* **plot_classifier_output**( pred_train, pred_test, y_train, y_test, multipagepdf=None, bins = None, normalised = True )
* **plot_correlations**(data,label='', \*\*kwds)
* **optimisePars**(mva, points, data , classes, fraction=0.7, score = 'log_loss', cvs=5)

---

## Various
* **showUniques**(df)
* **ensure_dir**(directory)
* **printBumper**(text, c='=', n=-1)
* **intersec**(d1, d2)
* **union**(d1, d2)

---

## Color dictionaries:
* **Tableau10**
* **Tableau10_Light**
* **Tableau10_Medium**
* **Tableau_20**
* **ColorBlind10**


# Strategy

## Group data into stations
In this way we have logical packages of data w/o too many NaNs.

## Unsupervised dimensionalty reduction using PCA
This should result in better and most importantly less variables.
Then choose best six with an ANOVA filter.

## Supervised predictions using a BDT 
Use BDT on 6 (or less) variables.


In [2]:
from sklearn.preprocessing import Imputer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

In [17]:
train = pd.read_csv("files/train_numeric.csv")

In [10]:
def addLineStation(ds):
    """
    Creates a MultiIndex object with levels:
    Line, Station, Feature.
    
    Arguments:
    ds - (DataFrame) Dataset from which to extract the MultiIndex
    """
    l0=[];l1=[];l2=[]
    for c in ds:
        s = c.split(sep="_")
        l0.append(s[0])
        l1.append(s[0]+"_"+s[1])
        l2.append(c)
    return pd.MultiIndex.from_arrays([l0,l1,l2],names=["Line","Station","Feature"])

In [126]:
# a little bit of index magic to be able to group by station
train = train.set_index(["Id","Response"])
train.columns = addLineStation(train)

gb = train.groupby(level="Station",axis=1)

In [132]:
traindrop = ["Id","Response"]

In [131]:
def trainMVA(grps, processed=[], save=False):
    mvas = {}
    for i,g in enumerate(grps.groups):
        s=time.time()
        if g in processed:
            print(g, " already trained. Continue.")
            continue
        if i%10==0 and i>0 and save:
            print("pickle intermediate results at i=",i)
            joblib.dump(mvas,"files/mvas_per_station{}.pkl".format(i)
                        , compress = 3)
            mvas={}
        # Dataset magic
        d = gb.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()
    
        n_features = len(d.columns)-2
        
        n_pca = 10 if n_features>10 else n_features
        n_annova = 6 if n_pca>6 else n_pca
        n_max_feat_bdt = 4 if n_annova==6 else n_annova
    
        # filter
        # feature selection
        anova_filter = SelectKBest(f_classif, k=n_annova)
    
        # classifier
        bdt = AdaBoostClassifier(DecisionTreeClassifier(max_features=n_max_feat_bdt
                                                        , max_depth=3)
                        , n_estimators = 600
                        , learning_rate = 0.01)
        # pipeline
        print("classifier pipeline fit for goup: "
              , g, "(",i+1,"/",len(gb.groups),")")
       
        clf = Pipeline([ ("fillVals", Imputer(strategy="mean",verbose=1))
                       , ("dimReduction", PCA(n_components=n_pca))
                       , ('feature_selection', anova_filter)
                       , ('mva', bdt)])
    
        clf.fit(d.drop(traindrop,axis=1), d.Response)
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        mvas[g] = clf
        del d
    joblib.dump(mvas,"files/mvas_per_station{}.pkl".format(len(grps.groups)+2)
                , compress = 3)        
    return 

In [137]:
# do all the heavy lifting!
%time trainMVA(gb,trained)

L1_S24  already trained. Continue.
L0_S14  already trained. Continue.
L2_S26  already trained. Continue.
L3_S39  already trained. Continue.
L3_S33  already trained. Continue.
L3_S31  already trained. Continue.
L0_S4  already trained. Continue.
L3_S30  already trained. Continue.
L3_S38  already trained. Continue.
L3_S29  already trained. Continue.
L0_S9  already trained. Continue.
L2_S27  already trained. Continue.
L0_S21  already trained. Continue.
L3_S43  already trained. Continue.
L3_S49  already trained. Continue.
L0_S17  already trained. Continue.
L0_S19  already trained. Continue.
L3_S47  already trained. Continue.
L3_S45  already trained. Continue.
L0_S23  already trained. Continue.
L0_S6  already trained. Continue.
L3_S40  already trained. Continue.
L0_S18  already trained. Continue.
L3_S34  already trained. Continue.
L0_S20  already trained. Continue.
L0_S8  already trained. Continue.
L0_S0  already trained. Continue.
L0_S10  already trained. Continue.
L0_S2  already trained. C

In [19]:
trained = joblib.load("files/mvas_per_station.pkl")
trained.update(joblib.load("files/mvas_per_station10.pkl"))
trained.update(joblib.load("files/mvas_per_station20.pkl"))
trained.update(joblib.load("files/mvas_per_station30.pkl"))
trained.update(joblib.load("files/mvas_per_station40.pkl"))
trained.update(joblib.load("files/mvas_per_station50.pkl"))
trained.update(joblib.load("files/mvas_per_station51.pkl"))
trained.update(joblib.load("files/mvas_per_station52.pkl"))

# run again to get all (small coding bug)

In [5]:
def predictStations(grps, mvas, results, processed=[], save=False):
    for i,g in enumerate(grps.groups):
        s=time.time()
        if g in processed:
            print(g, " already predicted. Continue.")
            continue
        # Dataset magic
        d = grps.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()
        if len(d)==0:
            continue
        # pipeline
        print("classifier pipeline prediction for goup: "
              , g, "(",i+1,"/",len(grps.groups),")")
        clf = mvas[g]
    
        pred_func = pd.DataFrame({"Id" : d.Id
                            ,"pred_"+g :clf.decision_function(d.drop(traindrop,axis=1))})
        results = results.merge(pred_func, on="Id", how="left")
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        del d
    return results

In [143]:
result = pd.DataFrame({"Id": train.reset_index()["Id"],
                       "Response": train.reset_index()["Response"]})

In [144]:
pre_functs = predictStations(gb, trained, result,pre_functs.columns)

classifier pipeline prediction for goup:  L1_S24 ( 1 / 50 )
done in 0.59  minutes.
classifier pipeline prediction for goup:  L0_S14 ( 2 / 50 )
done in 0.39  minutes.
classifier pipeline prediction for goup:  L2_S26 ( 3 / 50 )
done in 0.64  minutes.
classifier pipeline prediction for goup:  L3_S39 ( 4 / 50 )
done in 0.17  minutes.
classifier pipeline prediction for goup:  L3_S33 ( 5 / 50 )
done in 2.85  minutes.
classifier pipeline prediction for goup:  L3_S31 ( 6 / 50 )
done in 0.10  minutes.
classifier pipeline prediction for goup:  L0_S4 ( 7 / 50 )
done in 0.84  minutes.
classifier pipeline prediction for goup:  L3_S30 ( 8 / 50 )
done in 2.93  minutes.
classifier pipeline prediction for goup:  L3_S38 ( 9 / 50 )
done in 0.07  minutes.
classifier pipeline prediction for goup:  L3_S29 ( 10 / 50 )
done in 2.82  minutes.
classifier pipeline prediction for goup:  L0_S9 ( 11 / 50 )
done in 0.55  minutes.
classifier pipeline prediction for goup:  L2_S27 ( 12 / 50 )
done in 0.32  minutes.
cla

In [145]:
def plotPreds(df):
    for i,c in enumerate(df.columns.values):
        if c in ["Id","Response"]:
            continue
        a = sns.distplot(df[df.Response==0][c].dropna(),norm_hist=True,
                label="good parts", color=Tableau10_Medium['green'])
        a = sns.distplot(df[df.Response==1][c].dropna(),norm_hist=True
                , label="bad parts", color=Tableau10_Medium['red'])
        a.legend(loc=0)
        a.figure.savefig("plots/PredictionFunctions/predFunc"+c+".png")
        plt.clf()
        if i%10==9:
            print(i+1,"/",len(df.columns.values)-2, " plots made.")
    return

In [146]:
plotPreds(pre_functs)

  y = X[:m/2+1] + np.r_[0,X[m/2+1:],0]*1j


10 / 50  plots made.
20 / 50  plots made.
30 / 50  plots made.
40 / 50  plots made.
50 / 50  plots made.


<matplotlib.figure.Figure at 0x1539c97c278>

In [148]:
def optPrediction(df):
    from sklearn.metrics import matthews_corrcoef
    opt = []
    for i,c in enumerate(df.columns.values):
        if c in ["Id","Response"]:
            continue
        subset = df[["Id","Response",c]].dropna()
        # find threshold for prediction, use 50 steps
        thresholds = np.linspace(subset[c].min(),subset[c].max())
        tmp = -1
        best = -1
        best_z = subset[c].min()
        for z in thresholds:
            tmp = matthews_corrcoef(subset.Response,
                                    [1 if x>z else 0 for x in subset[c]])
            if tmp>best:
                best=tmp
                best_z = z
        opt.append([c,best_z,best])
    return opt

In [149]:
%time thresholds_mcc_per_station = optPrediction(pre_functs)

Wall time: 8min 56s


In [150]:
stations = pd.DataFrame({
        "Station" : [x[0] for x in thresholds_mcc_per_station],
        "Threshold" : [x[1] for x in thresholds_mcc_per_station],
        "MCC" : [x[2] for x in thresholds_mcc_per_station]
    })

In [151]:
joblib.dump(stations, "files/station_mcc_threshold.pkl", compress=3)

['files/station_mcc_threshold.pkl']

In [152]:
joblib.dump(pre_functs, "files/train_pred_functs.pkl", compress=3)

['files/train_pred_functs.pkl',
 'files/train_pred_functs.pkl_01.npy.z',
 'files/train_pred_functs.pkl_02.npy.z']

In [2]:
pre_functs = joblib.load("files/train_pred_functs.pkl")

In [3]:
stations = joblib.load("files/station_mcc_threshold.pkl")

In [153]:
def predictId(pred_f,thresholds, pred_test=False):
    """
    Strategy per Id will be:
    Sort stations classifiers by descending MCC values.
    Then predict Ids once with the best classifier available.
    """
    sorted_stations = thresholds.sort_values(by=["MCC"]
                                             , ascending=False)
    
    results = pred_f[["Id","Response"]] if pred_test else pred_f[["Id"]]
    results["pred_Response"] = 0
    
    for s in sorted_stations["Station"].values:
            
        data = pred_f[~pred_f[s].isnull()][["Id",s]]
        if len(data)==0:
            continue
        thrsd = thresholds[thresholds.Station==s].Threshold.values[0]
        pred_class = pd.DataFrame({"Id" : data.Id
                    ,"pred_Response" : [1 if x>thrsd else 0 for x in data[s].values]})
        results.pred_Response[results.Id.isin(data.Id.values)] = \
                        pred_class["pred_Response"].values
            
        print("Preditions done for station: ", s )
        print("Predicted ", len(data), "parts.")

    return results

In [154]:
%time train_predictions = predictId(pre_functs, stations)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Preditions done for station:  pred_L2_S28
Predicted  9583 parts.
Preditions done for station:  pred_L3_S43
Predicted  30551 parts.
Preditions done for station:  pred_L3_S44
Predicted  29804 parts.
Preditions done for station:  pred_L3_S38
Predicted  27142 parts.
Preditions done for station:  pred_L3_S41
Predicted  59913 parts.
Preditions done for station:  pred_L3_S40
Predicted  59914 parts.
Preditions done for station:  pred_L3_S47
Predicted  59955 parts.
Preditions done for station:  pred_L1_S25
Predicted  83658 parts.
Preditions done for station:  pred_L3_S48
Predicted  59923 parts.
Preditions done for station:  pred_L3_S49
Predicted  29673 parts.
Preditions done for station:  pred_L3_S50
Predicted  30359 parts.
Preditions done for station:  pred_L0_S21
Predicted  81409 parts.
Preditions done for station:  pred_L0_S23
Predicted  80290 parts.
Preditions done for station:  pred_L0_S22
Predicted  80599 parts.
Preditions done for station:  pred_L0_S15
Predicted  121445 parts.
Preditions

In [155]:
train_predictions.pred_Response.value_counts()

0    1124715
1      59032
Name: pred_Response, dtype: int64

In [156]:
59032/1124715

0.052486185389187485

In [157]:
train_predictions.to_csv("files/train_predictions.csv",index=False)

In [162]:
train_predictions = train_predictions.merge(train.reset_index()[["Id","Response"]]
                                            , on="Id"
                                            , how="left")

  self.right = self.right.drop(right_drop, axis=1)


In [91]:
from sklearn.metrics import matthews_corrcoef

In [164]:
train_predictions.columns=["Id","pred_Response","Response"]

In [171]:
train_predictions[train_predictions.Response==1].Id.isin(\
        train_predictions[train_predictions.pred_Response==1].Id).value_counts()

False    6562
True      317
Name: Id, dtype: int64

In [168]:
matthews_corrcoef(train_predictions.Response,train_predictions.pred_Response)

-0.0013299303371982756

In [6]:
def predictClass(grps, mvas, results, processed=[], save=False):
    for i,g in enumerate(grps.groups):
        s=time.time()
        if g in processed:
            print(g, " already predicted. Continue.")
            continue
        # Dataset magic
        d = grps.get_group(g).dropna(how="all", axis=0)
        d.columns = d.columns.droplevel(["Line","Station"])
        d=d.reset_index()
        if len(d)==0:
            continue
        # pipeline
        print("classifier pipeline prediction for goup: "
              , g, "(",i+1,"/",len(grps.groups),")")
        clf = mvas[g]
    
        pred_func = pd.DataFrame({"Id" : d.Id
                            ,"pred_"+g :clf.predict(d.drop(traindrop,axis=1))})
        results = results.merge(pred_func, on="Id", how="left")
        print("done in {:2.2f}".format((time.time()-s)/60.)," minutes.")
        del d
    return results

In [174]:
result_class = pd.DataFrame({"Id": train.reset_index()["Id"],
                       "Response": train.reset_index()["Response"]})

In [175]:
 pred_train_class = predictClass(gb,trained,result_class)

classifier pipeline prediction for goup:  L1_S24 ( 1 / 50 )
done in 0.56  minutes.
classifier pipeline prediction for goup:  L0_S14 ( 2 / 50 )
done in 0.31  minutes.
classifier pipeline prediction for goup:  L2_S26 ( 3 / 50 )
done in 0.56  minutes.
classifier pipeline prediction for goup:  L3_S39 ( 4 / 50 )
done in 0.16  minutes.
classifier pipeline prediction for goup:  L3_S33 ( 5 / 50 )
done in 2.72  minutes.
classifier pipeline prediction for goup:  L3_S31 ( 6 / 50 )
done in 0.09  minutes.
classifier pipeline prediction for goup:  L0_S4 ( 7 / 50 )
done in 0.80  minutes.
classifier pipeline prediction for goup:  L3_S30 ( 8 / 50 )
done in 2.84  minutes.
classifier pipeline prediction for goup:  L3_S38 ( 9 / 50 )
done in 0.07  minutes.
classifier pipeline prediction for goup:  L3_S29 ( 10 / 50 )
done in 2.84  minutes.
classifier pipeline prediction for goup:  L0_S9 ( 11 / 50 )
done in 0.58  minutes.
classifier pipeline prediction for goup:  L2_S27 ( 12 / 50 )
done in 0.32  minutes.
cla

In [213]:
pred_train_class[pred_train_class.Response==0].drop(["Id","Response"],axis=1).apply(pd.Series.sum,axis=1).value_counts(dropna=False)

 0.0    1176288
NaN         580
dtype: int64

In [212]:
len(pred_train_class.pred_Response[pred_train_class.pred_Response.isnull()])

582

In [210]:
pred_train_class[pred_train_class.Response==1].drop(["Id","Response"],axis=1).apply(lambda x: pd.Series.sum(x,numeric_only=False),axis=1).value_counts()

0.0    6826
2.0      51
dtype: int64

In [190]:
pred_train_class["pred_Response"] = pred_train_class.drop(["Id","Response"],axis=1).apply(pd.Series.sum,axis=1)

In [214]:
pred_train_class.pred_Response[pred_train_class.pred_Response.isnull()]=0

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [215]:
pred_train_class.loc[:,"pred_Response"] = pred_train_class.loc[:,"pred_Response"]\
.apply(lambda x: int(x))

In [216]:
pred_train_class.pred_Response.value_counts(dropna=False)

0    1183696
1         51
Name: pred_Response, dtype: int64

In [218]:
pred_train_class.to_csv("files/simple_prediction_train.csv",index=False)

In [217]:
matthews_corrcoef(pred_train_class.Response, pred_train_class.pred_Response)

0.085855124119177187

# Go ahead an predict the test data

In [7]:
test = pd.read_csv("files/test_numeric.csv")

In [8]:
# a little bit of index magic to be able to group by station
ids=test.Id
test = test.set_index(["Id"])
test.columns = addLineStation(test)

gbt = test.groupby(level="Station",axis=1)

In [11]:
test_result = pd.DataFrame({"Id": test.reset_index()["Id"]})

In [12]:
traindrop=["Id"]

In [16]:
"L3_S40" in trained.keys()

False

In [20]:
len(trained.keys())

45

In [13]:
res_t = predictStations(gbt,trained,test_result)

classifier pipeline prediction for goup:  L3_S33 ( 1 / 50 )
done in 2.69  minutes.
classifier pipeline prediction for goup:  L3_S40 ( 2 / 50 )


KeyError: 'L3_S40'