# Data For this notebook:

**combined_sig_df.pkl**
https://storage.googleapis.com/issue_label_bot/notebook_files/combined_sig_df.pkl

**feat_df.csv** https://storage.googleapis.com/issue_label_bot/notebook_files/feat_df.csv

In [337]:
import pandas as pd
from inference import InferenceWrapper, pass_through
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from IPython.display import display

import os
import torch
from torch.cuda import empty_cache
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

combined_sig_df = pd.read_pickle('combined_sig_df.pkl')
feat_df = pd.read_csv('feat_df.csv')

train_mask = combined_sig_df.part != 6
holdout_mask = ~train_mask

In [338]:
# count the labels in the holdout set
from collections import Counter
c = Counter()

for row in combined_sig_df[combined_sig_df.part == 6].labels:
    c.update(row)

In [339]:
X = feat_df[train_mask].values
X.shape

(7236, 1600)

In [340]:
label_columns = [x for x in combined_sig_df.columns if 'sig/' in x]
y = combined_sig_df[label_columns][train_mask].values
y.shape

(7236, 28)

In [341]:
X_holdout = feat_df[holdout_mask].values
y_holdout = combined_sig_df[label_columns][holdout_mask].values


In [342]:
def calculate_auc(predictions):
    auc_scores = []
    counts = []

    for i, l in enumerate(label_columns):
        y_hat = predictions[:, i]
        y = y_holdout[:, i]
        auc = roc_auc_score(y_true=y, y_score=y_hat)
        auc_scores.append(auc)
        counts.append(c[l])
    
    df = pd.DataFrame({'label': label_columns, 'auc': auc_scores, 'count': counts})    
    display(df)
    weightedavg_auc = df.apply(lambda x: x.auc * x['count'], axis=1).sum() / df['count'].sum()
    print(f'Weighted Average AUC: {weightedavg_auc}')
    return df, weightedavg_auc

# Part 1: Feed Embeddings From Language Model To Downstream Algorithims and Do Greedy Training On Top of That

## Keras

In [50]:
def shallow_model(l1=.01, l2=.01):

    inp = Input(shape=(1600,))
    x = Dense(units=30)(inp)
    out = Dense(units=28, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=out)
    model.compile(optimizer=Adam(lr=.001), loss='categorical_crossentropy')
    return model

shallow_model = shallow_model()

In [51]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=50, validation_split=.15)

Train on 6150 samples, validate on 1086 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<tensorflow.python.keras.callbacks.History at 0x7f4e3f7939e8>

In [27]:
shallow_model.fit(x=X, y=y, batch_size=64, epochs=1, validation_split=0)

Epoch 1/1


<tensorflow.python.keras.callbacks.History at 0x7f4ad6885c18>

In [None]:
y_hat_holdout = shallow_model.predict(X_holdout)

In [41]:
auc_scores = []

for i, l in enumerate(label_columns):
    y_hat = y_hat_holdout[:, i]
    y = y_holdout[:, i]
    auc = roc_auc_score(y_true=y, y_score=y_hat)
    auc_scores.append(auc)    

In [42]:
pd.DataFrame({'label': label_columns, 'auc': auc_scores})

Unnamed: 0,label,auc
0,sig/cluster-lifecycle,0.666893
1,sig/node,0.718941
2,sig/api-machinery,0.78774
3,sig/scalability,0.814389
4,sig/cli,0.840884
5,sig/autoscaling,0.849214
6,sig/network,0.786889
7,sig/cloud-provider,0.741232
8,sig/storage,0.880883
9,sig/scheduling,0.813724


wow that sucks, lets try something else

## Sklearn Nueral Network

b/c multi-label is supported naturally

In [11]:
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(early_stopping=True, n_iter_no_change=5, max_iter=500, solver='adam', 
                   random_state=1234)

In [12]:
mlp.fit(X, y)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=5, nesterovs_momentum=True, power_t=0.5,
       random_state=1234, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [15]:
mlp_predictions = mlp.predict_proba(X_holdout)

In [55]:
mlp_df, mlp_auc = calculate_auc(mlp_predictions)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.863932,498
1,sig/node,0.884496,1311
2,sig/api-machinery,0.892453,1090
3,sig/scalability,0.907244,258
4,sig/cli,0.935913,544
5,sig/autoscaling,0.949778,100
6,sig/network,0.945694,923
7,sig/cloud-provider,0.934848,29
8,sig/storage,0.965592,824
9,sig/scheduling,0.926638,397


Weighted Average AUC: 0.9168608333252417


#### Try Tuning the MLP

In [343]:
params = {'hidden_layer_sizes': [(100,), (200,), (400, ), (50, 50), (100, 100), (200, 200)],
              'alpha': [.001, .01, .1, 1, 10],
              'learning_rate': ['constant', 'adaptive'],
              'learning_rate_init': [.001, .01, .1]}
              
mlp_clf = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)

gscvmlp = GridSearchCV(mlp_clf, params, cv=5, n_jobs=-1)

gscvmlp.fit(X, y)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'hidden_layer_sizes': [(100,), (200,), (400,), (50, 50), (100, 100), (200, 200)], 'alpha': [0.001, 0.01, 0.1, 1, 10], 'learning_rate': ['constant', 'adaptive'], 'learning_rate_init': [0.001, 0.01, 0.1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [344]:
print(f'The best model from grid search is:\n=====================================\n{gscvmlp.best_estimator_}')

The best model from grid search is:
MLPClassifier(activation='relu', alpha=0.001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(200, 200), learning_rate='adaptive',
       learning_rate_init=0.001, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False)


In [347]:
mlp_tuned_predictions = gscvmlp.predict_proba(X_holdout)

In [348]:
mlp_tuned_df, mlp_tuned_auc = calculate_auc(mlp_tuned_predictions)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.861357,498
1,sig/node,0.886055,1311
2,sig/api-machinery,0.893178,1090
3,sig/scalability,0.897738,258
4,sig/cli,0.934423,544
5,sig/autoscaling,0.943365,100
6,sig/network,0.946594,923
7,sig/cloud-provider,0.891243,29
8,sig/storage,0.966184,824
9,sig/scheduling,0.925841,397


Weighted Average AUC: 0.9161311029647917


## Sklearn Random Forest

In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import NuSVC

rf = RandomForestClassifier(n_estimators=300,
                             random_state=1234, min_samples_leaf=3, class_weight='balanced')


clf = OneVsRestClassifier(rf, n_jobs=-1)

In [57]:
clf.fit(X, y)

OneVsRestClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=None, max_features='auto',
            max_leaf_nodes=None, min_impurity_decrease=0.0,
            min_impurity_split=None, min_samples_leaf=3,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=None, oob_score=False,
            random_state=1234, verbose=0, warm_start=False),
          n_jobs=-1)

In [58]:
rf_predictions = clf.predict_proba(X_holdout)

In [59]:
rf_df, rf_auc = calculate_auc(rf_predictions)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.837176,498
1,sig/node,0.863413,1311
2,sig/api-machinery,0.869993,1090
3,sig/scalability,0.890226,258
4,sig/cli,0.930573,544
5,sig/autoscaling,0.92856,100
6,sig/network,0.926477,923
7,sig/cloud-provider,0.944092,29
8,sig/storage,0.958161,824
9,sig/scheduling,0.898687,397


Weighted Average AUC: 0.9014719435122518


## Sklearn KNN

In [60]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(weights='distance', n_neighbors=10, n_jobs=-1)

In [61]:
knn.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=-1, n_neighbors=10, p=2,
           weights='distance')

In [62]:
knn_preds = knn.predict_proba(X_holdout)

In [63]:
import numpy as np
knn_preds_stacked = np.stack([x[:, 1] for x in knn_preds], axis=0).T

In [64]:
knn_preds_stacked

array([[0.      , 0.397812, 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.201453, 0.096647, 0.099647, 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.202347, 0.201023, 0.097884, ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.093967, 0.50943 , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       ...,
       [0.095501, 0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.411118, 0.296557, 0.      , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.      , 0.097659, 0.51011 , 0.      , ..., 0.      , 0.      , 0.      , 0.      ],
       [0.09672 , 0.096316, 0.      , 0.404727, ..., 0.      , 0.      , 0.      , 0.      ]])

In [65]:
knn_df, knn_auc = calculate_auc(knn_preds_stacked)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.759911,498
1,sig/node,0.802074,1311
2,sig/api-machinery,0.818665,1090
3,sig/scalability,0.819704,258
4,sig/cli,0.869074,544
5,sig/autoscaling,0.792181,100
6,sig/network,0.884401,923
7,sig/cloud-provider,0.657295,29
8,sig/storage,0.919821,824
9,sig/scheduling,0.826708,397


Weighted Average AUC: 0.8253041170693705


## Sklearn GBM

In [66]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.multiclass import OneVsRestClassifier
gbm = GradientBoostingClassifier(max_depth=5, min_samples_leaf=3, max_features='auto', n_iter_no_change=4)
clf = OneVsRestClassifier(gbm, n_jobs=-1)

In [67]:
clf.fit(X, y)

OneVsRestClassifier(estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=5,
              max_features='auto', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=3, min_sam...      subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
          n_jobs=-1)

In [68]:
gbm_predictions = clf.predict_proba(X_holdout)

In [69]:
gbm_df, gbm_auc = calculate_auc(gbm_predictions)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.800617,498
1,sig/node,0.854387,1311
2,sig/api-machinery,0.872441,1090
3,sig/scalability,0.852579,258
4,sig/cli,0.925437,544
5,sig/autoscaling,0.862118,100
6,sig/network,0.923715,923
7,sig/cloud-provider,0.551281,29
8,sig/storage,0.954871,824
9,sig/scheduling,0.879484,397


Weighted Average AUC: 0.8766387996437228


# Part 2: Fine Tune Original Language Model 

With FastAI & Pytorch

Note: this model was completely trained in another notebook, it is only evaluated here.

### Prepare Inference Wrapper

Model is available for download here:

https://storage.googleapis.com/issue_label_bot/model/multi_class_model/export.pkl

In [171]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

from inference import InferenceWrapper, pass_through
iw = InferenceWrapper(model_path='/ds/multi_class_model/',
                      model_file_name='/ds/multi_class_model/export.pkl')

In [174]:
#because the pre-trained mdoel predicts more than just sig/ labels
pred_mask = [x in label_columns for x in iw.learn.data.classes]

Do pre-processing (such as markdown parsing) to prepare data for model.

In [111]:
parsed_df = iw.process_df(combined_sig_df)

  for d in tqdm(dataframe.to_dict(orient='rows')):


HBox(children=(IntProgress(value=0, max=14390), HTML(value='')))




In [355]:
holdout_text = parsed_df[holdout_mask]

In [356]:
lang_model_predict = np.stack(holdout_text.text.apply(lambda x: tcl.predict(x)[2].numpy()[pred_mask]).values)

In [359]:
lang_model_predict.shape

(7154, 24)

In [362]:
len(iw.learn.data.classes)

45

In [365]:
np.array(iw.learn.data.classes)[pred_mask]

array(['sig/api-machinery', 'sig/apps', 'sig/architecture', 'sig/auth', 'sig/autoscaling', 'sig/aws', 'sig/azure',
       'sig/cli', 'sig/cloud-provider', 'sig/cluster-lifecycle', 'sig/contributor-experience', 'sig/docs', 'sig/gcp',
       'sig/instrumentation', 'sig/multicluster', 'sig/network', 'sig/node', 'sig/openstack', 'sig/release',
       'sig/scalability', 'sig/scheduling', 'sig/storage', 'sig/testing', 'sig/windows'], dtype='<U31')

In [366]:
lang_model_predict_df = pd.DataFrame(lang_model_predict)
lang_model_predict_df.columns = np.array(iw.learn.data.classes)[pred_mask]
lm_df = lang_model_predict_df[[x for x in label_columns if x in lang_model_predict_df.columns]]

There are some columns that the model was not trained on

In [367]:
missing_cols = [x for x in label_columns if x not in lm_df.columns]

In [368]:
for col in missing_cols:
    lm_df[col] = 0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [371]:
lm_df, lm_auc = calculate_auc(lm_df.values)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.85523,498
1,sig/node,0.874241,1311
2,sig/api-machinery,0.888351,1090
3,sig/scalability,0.910308,258
4,sig/cli,0.941818,544
5,sig/autoscaling,0.956396,100
6,sig/network,0.942258,923
7,sig/cloud-provider,0.966819,29
8,sig/storage,0.966369,824
9,sig/scheduling,0.926945,397


Weighted Average AUC: 0.8564939002714045


In [372]:
missing_cols

['sig/federation', 'sig/cluster-ops', 'sig/vmware', 'sig/service-catalog']

# Part 3:  Using Text & Bag of Words Instead of Pre-Trained Embeddings (Classic ML)

**Count Vectorizer w/ Nueral Net**

Choose Nueral Net because supports multi-label classification natively

In [307]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV


train_mask = combined_sig_df.part != 6

parameters = {'hidden_layer_sizes': [(50,), (100,), (200,), (400, ), (50, 50), (100, 100)],
              'alpha': [0.0001, .001, .01, .1, 1, 10],
              'learning_rate': ['constant', 'adaptive'],
              'learning_rate_init': [.001, .01]}
              
mlp = MLPClassifier(early_stopping=True, validation_fraction=.2, n_iter_no_change=4, max_iter=500)

In [308]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.column].values
        except KeyError:
            col_error = list(set(self.column) - set(X.column))
            raise KeyError("The DataFrame does not include the columns: %s" % col_error)

In [309]:
model_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("title", make_pipeline(
            ColumnSelector("title"),
            CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='unicode'),
        )),
        ("body", make_pipeline(
            ColumnSelector("body"),
            CountVectorizer(ngram_range=(1,3), min_df=3, strip_accents='unicode'),
        ))
    ]),
    GridSearchCV(mlp, parameters, cv=5, n_jobs=-1)
)

In [310]:
y_train = combined_sig_df[label_columns][train_mask].values
x_train_df = combined_sig_df[['body', 'title']][train_mask]

#### Fit the model

In [311]:
model_pipeline.fit(x_train_df, y_train)

Pipeline(memory=None,
     steps=[('featureunion', FeatureUnion(n_jobs=None,
       transformer_list=[('title', Pipeline(memory=None,
     steps=[('columnselector', ColumnSelector(column='title')), ('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, en...   pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0))])

In [329]:
best_model = model_pipeline.named_steps['gridsearchcv'].best_estimator_
print(f'The best model from grid search is:\n=====================================\n{best_model}')


The best model from grid search is:
MLPClassifier(activation='relu', alpha=0.01, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=True, epsilon=1e-08,
       hidden_layer_sizes=(100, 100), learning_rate='adaptive',
       learning_rate_init=0.01, max_iter=500, momentum=0.9,
       n_iter_no_change=4, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.2, verbose=False, warm_start=False)


#### Evaluate Model

In [332]:
holdout_mask = combined_sig_df.part == 6
x_holdout_df = combined_sig_df[['body', 'title']][holdout_mask]
y_holdout = combined_sig_df[label_columns][holdout_mask].values

In [335]:
no_deep_mlp_preds = model_pipeline.predict_proba(x_holdout_df)

In [336]:
no_deep_mlp_df, no_deep_mlp_auc = calculate_auc(no_deep_mlp_preds)

Unnamed: 0,label,auc,count
0,sig/cluster-lifecycle,0.778223,498
1,sig/node,0.868424,1311
2,sig/api-machinery,0.869518,1090
3,sig/scalability,0.858837,258
4,sig/cli,0.903395,544
5,sig/autoscaling,0.914378,100
6,sig/network,0.899245,923
7,sig/cloud-provider,0.487758,29
8,sig/storage,0.934296,824
9,sig/scheduling,0.883158,397


Weighted Average AUC: 0.8640413289631089
