In [None]:
# Notebook switches.
# load_docs: when True the spaCy docs are read from the pickles under data/pickles/,
#            otherwise they are rebuilt from the CSV / Excel sources.
# dump_docs: when True the docs currently in memory are written back to those pickles.
# language:  suffix used in the project CSV and pickle file names.
load_docs = True
dump_docs = False
language='en'

In [None]:
import pickle, random, itertools, pandas
import numpy as np
from scipy.spatial.distance import cosine
import spacy
from spacy.pipeline import TextCategorizer
from spacy.tokens import Doc, Span
from matplotlib import pyplot as plt

In [None]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from sklearn.metrics import coverage_error, label_ranking_average_precision_score, label_ranking_loss

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, LSTM, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras import utils
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

In [None]:
%run evaluation.ipynb
%run utils.ipynb
%run nlp_functions.ipynb
%run explanation.ipynb

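Background reading on scaling vs. normalizing features (relevant to the `scale` option of `prepare_data` below): https://www.kaggle.com/rtatman/data-cleaning-challenge-scale-and-normalize-data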

In [None]:
def prepare_data(project_docs, goal_docs, feature_type='goal_similarities', label_format='sparse', scale=True, shuffle=False):
    """Turn project docs into a feature matrix and a label array.

    Parameters
    ----------
    project_docs : iterable of docs exposing `._.goal_labels` (and `._.custom_vector`
        when feature_type is 'embeddings').
    goal_docs : passed through to `compute_goal_scores` for 'goal_similarities' features.
    feature_type : 'goal_similarities' (one `compute_goal_scores` row per doc) or
        'embeddings' (the doc's `._.custom_vector`).
    label_format : 'sparse' produces a 14-position 0/1 indicator vector per doc;
        'dense' keeps `._.goal_labels` unchanged.
    scale : when True the features go through a freshly fitted `RobustScaler`.
    shuffle : when True rows and labels are shuffled together after `random.seed(10)`
        (note: this reseeds the global `random` generator).

    Returns
    -------
    (feature_vectors, labels) as numpy arrays with matching row order.
    """
    assert feature_type in ('goal_similarities', 'embeddings')
    assert label_format in ('sparse', 'dense')

    indicator_labels = label_format == 'sparse'
    similarity_features = feature_type == 'goal_similarities'

    rows, targets = [], []
    for doc in project_docs:
        if indicator_labels:
            target = [int(goal in doc._.goal_labels) for goal in range(14)]
        else:
            target = doc._.goal_labels

        if similarity_features:
            row = compute_goal_scores(doc, goal_docs, similarity='custom')
        else:
            row = doc._.custom_vector

        targets.append(target)
        rows.append(row)

    feature_vectors = np.array(rows)
    labels = np.array(targets)

    if scale:
        feature_vectors = RobustScaler().fit_transform(feature_vectors)

    if shuffle:
        # Shuffle (features, label) pairs so both arrays stay aligned.
        paired = list(zip(feature_vectors, labels))
        random.seed(10)
        random.shuffle(paired)
        shuffled_features, shuffled_labels = zip(*paired)
        feature_vectors = np.array(shuffled_features)
        labels = np.array(shuffled_labels)

    return feature_vectors, labels

In [None]:
def compute_class_weights(label_matrix):
    """Return inverse-frequency weights, one per label column.

    Parameters
    ----------
    label_matrix : 2-D array-like of shape (n_samples, n_labels); any truthy
        (non-zero) entry counts as one occurrence of that label.

    Returns
    -------
    dict mapping column index -> 1.0 / (number of occurrences of that label).
    A label that never occurs gets weight 0.0 instead of raising
    ZeroDivisionError, so small training sets that miss a class still work.
    """
    positives_per_label = np.count_nonzero(np.asarray(label_matrix), axis=0)
    # .tolist() converts to plain Python ints so the dict holds plain int keys / float values.
    return {index: (1.0 / count if count else 0.0)
            for index, count in enumerate(positives_per_label.tolist())}

In [None]:
def assign_weak_labels(training_project_docs, unlabeled_project_docs, goal_docs, algorithm,
                       #next line is the parameters to optimize
                       batch_size=100, label_selection_method='dcg', dcg_min_diff=0.1, threshold=0.3, k=3,
                       feature_type='goal_similarities', scale_features=True, parameters={}, 
                       plot_epochs=True, verbose=True):
    """Iteratively label unlabeled docs with a classifier and add them to the training set.

    The unlabeled docs are consumed in batches of `batch_size`. For every batch a
    classifier is trained on the current training set via `test_classifier`, the
    batch is scored, and labels are picked from the scores with the chosen
    selection method. Docs that received at least one label are appended to the
    training set used for the next batch.

    Parameters
    ----------
    training_project_docs, unlabeled_project_docs : lists of project docs. The
        input lists are not modified, but `._.goal_labels` of every unlabeled doc
        is overwritten.
    goal_docs, algorithm, feature_type, scale_features, plot_epochs, verbose :
        forwarded to `test_classifier`.
    batch_size : positive number of unlabeled docs scored per iteration.
    label_selection_method : 'dcg', 'threshold' or 'k'; selects which of
        `dcg_min_diff`, `threshold`, `k` is used.
    parameters : classifier parameters. A copy is used, and 'random_state' in that
        copy is set to the iteration number on each round.

    Returns
    -------
    list: the original training docs followed by the newly labeled docs.
    """
    assert label_selection_method in ('dcg', 'threshold', 'k')
    if label_selection_method == 'dcg':
        assert dcg_min_diff
    elif label_selection_method == 'threshold':
        assert threshold
    elif label_selection_method == 'k':
        assert k
    # A non-positive batch size would never shrink the unlabeled list (infinite loop).
    assert batch_size > 0

    # Copy so neither the caller's dict nor the shared default `{}` is mutated.
    parameters = dict(parameters)

    n_iter = 0
    while len(unlabeled_project_docs) > 0:
        print('\nTraining set size:', len(training_project_docs))
        print('Unlabeled set size:', len(unlabeled_project_docs))

        parameters['random_state'] = n_iter
        n_iter += 1
        offset = min(batch_size, len(unlabeled_project_docs))
        unlabeled_batch = unlabeled_project_docs[:offset]
        _, u_score = test_classifier(training_project_docs, unlabeled_batch, goal_docs, algorithm=algorithm,
                                     feature_type=feature_type, scale_features=scale_features, parameters=parameters,
                                     plot_epochs=plot_epochs, shuffle=True, verbose=verbose)

        for udoc, us in zip(unlabeled_batch, u_score):
            if label_selection_method == 'dcg':
                assigned_labels = select_labels_by_dcg(us, min_diff=dcg_min_diff)
            elif label_selection_method == 'threshold':
                assigned_labels = select_labels_by_threshold(us, threshold=threshold)
            elif label_selection_method == 'k':
                assigned_labels = select_labels_by_k(us, k=k)
            udoc._.goal_labels = assigned_labels

        # Docs that received no label are dropped rather than added to the training set.
        unlabeled_batch = [udoc for udoc in unlabeled_batch if len(udoc._.goal_labels) > 0]
        training_project_docs = training_project_docs + unlabeled_batch
        unlabeled_project_docs = unlabeled_project_docs[offset:]

    print('\nTraining set size:', len(training_project_docs))
    print('Unlabeled set size:', len(unlabeled_project_docs))
    return training_project_docs

In [None]:
def classify_with_spacy(training_project_docs, test_project_docs, parameters={}, load_model=False, verbose=True):
    """Score test docs with a spaCy `textcat` pipe (multi-label, "ensemble" architecture).

    With `load_model=True` the text categorizer is restored from
    'models/spacy_textcategorizer_sm'. Otherwise a new categorizer with labels
    '0'..'13' is trained on `training_project_docs` and saved to that path.

    Parameters
    ----------
    training_project_docs : docs providing `.text` and `._.goal_labels` (training path only).
    test_project_docs : docs to score; each gets `._.predicted_goal_scores`, a list of
        (label_index, score) pairs sorted by descending score.
    parameters : optional 'n_iter' (default 10) and 'batch_size' (default 8);
        any other key fails an assertion.
    load_model : load the saved categorizer instead of training.
    verbose : print progress messages.

    Returns
    -------
    (y_true, y_score): 0/1 indicator rows built from `._.goal_labels`, and the
    score matrix returned by the categorizer's `predict`.
    """
    for name in parameters:
        assert name in ['n_iter', 'batch_size']

    if verbose:
        print('Algorithm: spacy')

    #nlp = en_trf_bertbaseuncased_lg.load()
    nlp = spacy.load("en_core_web_sm")
    textcat_config = {"exclusive_classes": False, "architecture": "ensemble"}

    if load_model:
        print('Loading the model...')
        text_classifier = nlp.create_pipe("textcat", config=textcat_config)
        text_classifier.from_disk('models/spacy_textcategorizer_sm')
        nlp.add_pipe(text_classifier, last=True)

    else:
        n_iter = parameters.get('n_iter', 10)
        batch_size = parameters.get('batch_size', 8)

        if verbose:
            print('Preparing nlp model...')
        text_classifier = nlp.create_pipe("textcat", config=textcat_config)
        nlp.add_pipe(text_classifier, last=True)
        if verbose:
            print('Pipeline:', nlp.pipe_names)

        for goal_index in range(14):
            text_classifier.add_label(str(goal_index))

        if verbose:
            print('Preparing samples...')

        # One (text, {'cats': {label: bool}}) pair per training doc.
        training_samples = []
        for pdoc in training_project_docs:
            gold_labels = [str(l) for l in pdoc._.goal_labels]
            cats = {label: (label in gold_labels) for label in text_classifier.labels}
            training_samples.append((pdoc.text, {'cats': cats}))

        spacy.util.fix_random_seed()
        frozen_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
        with nlp.disable_pipes(*frozen_pipes):  # only train textcat
            nlp.begin_training()
            if verbose:
                print('Starting training...')
            for itn in range(n_iter):
                if verbose:
                    print('Training loop: ', itn)
                random.shuffle(training_samples)
                # Feed the shuffled samples to the model in mini-batches.
                for batch in spacy.util.minibatch(training_samples, size=batch_size):
                    texts = [sample[0] for sample in batch]
                    annotations = [sample[1] for sample in batch]
                    nlp.update(docs=texts, golds=annotations)

        text_classifier = nlp.get_pipe('textcat')
        print('Saving the model...')
        text_classifier.to_disk('models/spacy_textcategorizer_sm')

    y_score, tensors = text_classifier.predict(test_project_docs)
    for test_pdoc, doc_scores in zip(test_project_docs, y_score):
        test_pdoc._.predicted_goal_scores = sorted(enumerate(list(doc_scores)),
                                                   reverse=True, key=lambda pair: pair[-1])

    y_true = []
    for test_pdoc in test_project_docs:
        y_true.append([1 if int(label) in test_pdoc._.goal_labels else 0
                       for label in text_classifier.labels])

    return y_true, y_score

In [None]:
def plot_keras_epochs(history):
    """Plot training vs. validation curves for loss and categorical accuracy.

    `history.history` must contain the keys 'loss', 'val_loss',
    'categorical_accuracy' and 'val_categorical_accuracy'. Two stacked panels are
    drawn on a new figure; nothing is returned.
    """
    # (history key, panel title) for the upper and lower panel.
    panels = (('loss', 'Loss'),
              ('categorical_accuracy', 'categorical_accuracy'))

    fig, axs = plt.subplots(2, figsize = (8,6))
    for axis, (metric, title) in zip(axs, panels):
        axis.plot(history.history[metric], label='training')
        axis.plot(history.history['val_' + metric], label='validation')
        axis.set_title(title)
        axis.set(xlabel = 'Epochs')
        axis.legend()

    plt.subplots_adjust(left=0.125, bottom=0.1, right=0.9, top=0.9, wspace=0.2, hspace=0.4)

In [None]:
def compile_keras_model(parameters):
    """Build and compile a feed-forward Keras model for multi-label scoring.

    Parameters
    ----------
    parameters : dict with
        'training_vectors_shape' (required) - shape of the training matrix; only
            index 1 (number of features) is used,
        'n_labels' (required) - number of sigmoid output units,
        'hidden_layer_sizes' - units per hidden Dense layer, default (100,),
        'dropout' - rate of the Dropout layer after each hidden layer, default 0.0,
        'learning_rate' - Adam learning rate, default 0.001,
        'print_summary' - call `model.summary()` when True, default False.
        'layer_type' may be present but is not used: only Dense layers are built.

    Returns
    -------
    The compiled `Sequential` model.
    """
    tf.random.set_seed(0)

    training_vectors_shape = parameters['training_vectors_shape']
    n_labels = parameters['n_labels']
    hidden_layer_sizes = parameters.get('hidden_layer_sizes', (100,))
    dropout = parameters.get('dropout', 0.0)
    learning_rate = parameters.get('learning_rate', 0.001)
    print_summary = parameters.get('print_summary', False)

    model = Sequential()
    for layer_index, n_neurons in enumerate(hidden_layer_sizes):
        if layer_index == 0:
            # Only the first layer needs the input size; later layers infer it.
            model.add(Dense(n_neurons, input_dim=training_vectors_shape[1], activation='relu'))
        else:
            model.add(Dense(n_neurons, activation='relu'))
        model.add(Dropout(dropout))

    model.add(Dense(n_labels, activation='sigmoid'))

    # NOTE(review): sigmoid outputs are combined with categorical_crossentropy here;
    # multi-label setups normally use binary_crossentropy. Left unchanged because it
    # alters training results, and plot_keras_epochs reads the 'categorical_accuracy'
    # history keys - confirm the intended loss.
    model.compile(loss='categorical_crossentropy',
                  # `learning_rate` replaces the deprecated `lr` keyword.
                  optimizer=Adam(learning_rate=learning_rate),
                  metrics=['categorical_accuracy'])

    if print_summary:
        model.summary()

    return model

In [None]:
def get_keras_sklearn_wrapper(parameters):
    """Wrap `compile_keras_model` in a scikit-learn style `KerasClassifier`.

    Parameters
    ----------
    parameters : dict. 'training_vectors_shape' and 'n_labels' are required.
        Optional training keys: 'n_epochs' (500), 'batch_size' (64),
        'early_stopping' (False), 'patience' (10), 'class_weights_dict' (None),
        'random_state' (0). Model keys ('layer_type', 'hidden_layer_sizes',
        'dropout', 'learning_rate', 'print_summary') are consumed by
        `compile_keras_model` when the model is built. Any other key, or a
        'layer_type' other than 'Dense', fails an assertion.

    Returns
    -------
    An unfitted `KerasClassifier` that trains with a 20% validation split.
    """
    assert 'training_vectors_shape' in parameters
    assert 'n_labels' in parameters
    if 'layer_type' in parameters:
        # Must be a one-element tuple: `('Dense')` is just the string 'Dense', which
        # turns `in` into a substring test (so 'ens' or '' would be accepted).
        assert parameters['layer_type'] in ('Dense',)
    allowed_names = ('layer_type', 'hidden_layer_sizes', 'training_vectors_shape', 'n_labels', 'dropout',
                     'learning_rate', 'n_epochs', 'batch_size', 'class_weights_dict', 'early_stopping',
                     'patience', 'random_state',
                     # read by compile_keras_model, so it has to be accepted here too
                     'print_summary')
    for p_name in parameters:
        assert p_name in allowed_names

    n_epochs = parameters.get('n_epochs', 500)
    batch_size = parameters.get('batch_size', 64)
    early_stopping = parameters.get('early_stopping', False)
    patience = parameters.get('patience', 10)
    class_weights_dict = parameters.get('class_weights_dict', None)
    random_state = parameters.get('random_state', 0)

    callbacks = [EarlyStopping(patience=patience)] if early_stopping else []
    # NOTE(review): `random_state` is forwarded as a KerasClassifier keyword; verify that
    # the installed wrapper accepts it (the model itself is seeded in compile_keras_model).
    classifier = KerasClassifier(build_fn=lambda:compile_keras_model(parameters),
                                 epochs=n_epochs,
                                 batch_size=batch_size,
                                 callbacks=callbacks,
                                 validation_split=0.2,
                                 class_weight=class_weights_dict,
                                 random_state=random_state,
                                 verbose=False)
    return classifier

In [None]:
def compile_classifier(algorithm, parameters, verbose=True):
    """Create an unfitted multi-label classifier for the requested algorithm.

    Parameters
    ----------
    algorithm : 'RandomForest', 'GradientBoosting', 'SVM', 'MLP' or 'keras'.
    parameters : dict of hyper-parameters. For the scikit-learn algorithms only the
        names listed in the defaults table below are accepted (anything else fails
        an assertion) and missing names take the listed default. For 'keras' the
        dict is validated by `get_keras_sklearn_wrapper`. The dict passed in is
        NOT modified: defaults are filled into a private copy, so one dict can be
        reused across algorithms.
    verbose : print the estimator name and the effective parameters.

    Returns
    -------
    For 'keras' the `KerasClassifier` itself, otherwise the base estimator
    wrapped in `OneVsRestClassifier`.
    """
    assert algorithm in ('RandomForest', 'GradientBoosting', 'SVM', 'MLP', 'keras')

    # Private copy: filling defaults into the caller's dict leaked e.g. 'C' / 'gamma'
    # into later calls for a different algorithm and tripped the name check below.
    parameters = dict(parameters)

    # Accepted names and their defaults, per scikit-learn algorithm.
    default_parameters = {
        'RandomForest': {'n_estimators': 100, 'max_depth': None, 'min_samples_split': 2,
                         'min_samples_leaf': 1, 'random_state': 0},
        'GradientBoosting': {'n_estimators': 100, 'learning_rate': 0.1, 'min_samples_split': 2,
                             'min_samples_leaf': 1, 'max_depth': 3, 'random_state': 0},
        'SVM': {'C': 1.0, 'kernel': 'rbf',
                'gamma': 'scale',  # kernel coefficient for 'rbf', 'poly' and 'sigmoid'
                'random_state': 0},
        'MLP': {'hidden_layer_sizes': (100,), 'alpha': 0.0001, 'batch_size': 'auto',
                'max_iter': 500, 'tol': 0.0001, 'random_state': 0},
    }

    if algorithm == 'keras':
        base_classifier = get_keras_sklearn_wrapper(parameters)
        parameters = base_classifier.get_params()
    else:
        defaults = default_parameters[algorithm]
        for p_name in parameters:
            assert p_name in defaults
        for p_name, p_default in defaults.items():
            parameters.setdefault(p_name, p_default)

        if algorithm == 'RandomForest':
            base_classifier = RandomForestClassifier(class_weight='balanced', n_jobs=-1, **parameters)
        elif algorithm == 'GradientBoosting':
            base_classifier = GradientBoostingClassifier(**parameters)
        elif algorithm == 'SVM':
            # probability=True is needed so the classifier exposes predict_proba.
            base_classifier = SVC(class_weight='balanced', probability=True, **parameters)
        else:  # 'MLP'
            base_classifier = MLPClassifier(validation_fraction=0.2, **parameters)

    if verbose:
        print('\nBase estimator:', algorithm)
        if algorithm != 'keras':
            print('Meta estimator: One vs all')
        print('Parameters:')
        for p in parameters:
            print(p, ':', parameters[p])

    if algorithm == 'keras':
        return base_classifier
    else:
        return OneVsRestClassifier(base_classifier)

In [None]:
def cross_validate(feature_vectors, labels, algorithm='RandomForest', n_folds=10, parameters={}, project_docs=None):
    """Run K-fold cross-validation and collect out-of-fold label scores.

    Parameters
    ----------
    feature_vectors, labels : numpy arrays with aligned rows (as produced by `prepare_data`).
    algorithm, parameters : forwarded to `compile_classifier`. `parameters` is copied,
        so neither the caller's dict nor the shared default is modified.
    n_folds : number of `KFold` splits (unshuffled, consecutive folds).
    project_docs : optional sequence of docs aligned row-by-row with `feature_vectors`.
        When given, each doc gets `._.predicted_goal_scores` = list of
        (label_index, score) sorted by descending score from the fold in which it
        was held out. When omitted, the function falls back to a module-level
        variable named `project_docs` if one exists (the previous behaviour) and
        otherwise skips the annotation. Callers whose features come from a
        different or shuffled list should pass the matching docs explicitly.

    Returns
    -------
    (y_true, y_score): lists of label rows and predicted score rows, in fold order.
    """
    # Copy so the keras bookkeeping keys below never leak into the caller's dict.
    parameters = dict(parameters)

    if algorithm == 'keras':
        parameters['training_vectors_shape'] = feature_vectors.shape
        parameters['n_labels'] = 14
        parameters['class_weights_dict'] = compute_class_weights(labels)

    classifier = compile_classifier(algorithm=algorithm, parameters=parameters, verbose=False)

    # Explicit argument first; legacy fallback to the notebook-level global.
    docs_to_annotate = project_docs if project_docs is not None else globals().get('project_docs')

    y_true = []
    y_score = []
    kf = KFold(n_splits=n_folds)
    for training_index, validation_index in kf.split(feature_vectors):
        training_vectors, validation_vectors = feature_vectors[training_index], feature_vectors[validation_index]
        training_labels, validation_labels = labels[training_index], labels[validation_index]

        classifier.fit(training_vectors, training_labels)
        predicted_scores = classifier.predict_proba(validation_vectors)

        if docs_to_annotate is not None:
            for vindex, ps in zip(validation_index, predicted_scores):
                docs_to_annotate[vindex]._.predicted_goal_scores = sorted(
                    [(i, s) for i, s in enumerate(list(ps))], reverse=True, key=lambda x: x[-1])

        y_true.extend(list(validation_labels))
        y_score.extend(list(predicted_scores))

    return y_true, y_score

In [None]:
def optimize_classifier(project_docs, goal_docs, algorithm, parameter_ranges, feature_type='goal_similarities',
                        scale_features=True, n_folds=10, metric='lrap', shuffle=False, verbose=True):
    """Grid-search classifier parameters with cross-validation.

    Every combination of the values in `parameter_ranges` (dict: name -> iterable of
    candidate values) is evaluated with `cross_validate` on the features and labels
    produced by `prepare_data`, and scored with the chosen ranking metric.

    Parameters
    ----------
    metric : 'lrap' (label ranking average precision, higher is better),
        'lrl' (label ranking loss) or 'cov_err' (coverage error), both lower is better.
    Remaining arguments are forwarded to `prepare_data` / `cross_validate`.

    Returns
    -------
    pandas.DataFrame with one row per configuration, the searched parameter
    columns plus 'score', sorted best-first.
    """
    assert metric in ('lrap', 'lrl', 'cov_err')

    if verbose:
        print('\nN. folds:', n_folds)
        print('Scale features:', scale_features)
        print('Metric:', metric)

    feature_vectors, labels = prepare_data(project_docs, goal_docs, feature_type=feature_type,
                                           label_format='sparse', scale=scale_features, shuffle=shuffle)

    # Cartesian product of all candidate values, one dict per configuration.
    names = list(parameter_ranges)
    value_grid = itertools.product(*(parameter_ranges[name] for name in names))
    param_configurations = [dict(zip(names, values)) for values in value_grid]

    # Remember the searched columns now: the dicts may gain extra keys during evaluation.
    report_columns = list(param_configurations[0].keys()) + ['score']

    print('\nStarting parameter optimization...')
    for configuration in param_configurations:
        print(configuration)
        y_true, y_score = cross_validate(feature_vectors, labels,
                                         algorithm=algorithm, parameters=configuration,
                                         n_folds=n_folds)

        if metric == 'lrap':
            scorer = label_ranking_average_precision_score
        elif metric == 'lrl':
            scorer = label_ranking_loss
        else:
            scorer = coverage_error
        configuration['score'] = scorer(y_true, y_score)

    print('Done.')
    lower_is_better = metric in ('lrl', 'cov_err')
    grid = pandas.DataFrame(data=param_configurations)[report_columns]
    return grid.sort_values(by=['score'], ascending=lower_is_better)

In [None]:
def test_classifier(training_project_docs, test_project_docs, goal_docs, algorithm,
                    feature_type='goal_similarities', scale_features=True, parameters={}, 
                    plot_epochs=True, shuffle=False, load_model=False, verbose=True):
    
    if algorithm == 'spacy':
        return classify_with_spacy(training_project_docs, test_project_docs, parameters=parameters, 
                                   load_model=load_model, verbose=verbose)
    
    if verbose:
        print('\nScale features:', scale_features)
        
    # siamo totalmente fair, perchè lo scaling/normalizzazione viene fatto distintamente per
    # training set e test set
    feature_vectors_train, labels_train = prepare_data(training_project_docs, goal_docs, feature_type=feature_type, 
                                                       label_format='sparse', scale=scale_features, shuffle=shuffle)
    feature_vectors_test, labels_test = prepare_data(test_project_docs, goal_docs, feature_type=feature_type, 
                                                     label_format='sparse', scale=scale_features, shuffle=shuffle)
        
    if algorithm == 'keras':
        parameters['training_vectors_shape'] = feature_vectors_train.shape
        parameters['n_labels'] = 14
        parameters['class_weights_dict'] = compute_class_weights(labels_train)
        
    classifier = compile_classifier(algorithm=algorithm, parameters=parameters, verbose=verbose)
            
    if verbose:
        print('\nFitting classifier...')
    history = classifier.fit(feature_vectors_train, labels_train)
    
    if algorithm == 'keras' and plot_epochs:
        plot_keras_epochs(history)
        
    if verbose:
        print('Predicting test labels...')
    predicted_scores = classifier.predict_proba(feature_vectors_test)
    
    if verbose:
        print('Done.')
    
    for test_pdoc,ps in zip(test_project_docs,predicted_scores):
        test_pdoc._.predicted_goal_scores = sorted([(i,s) for i,s in enumerate(list(ps))], reverse=True, key=lambda x:x[-1])
        
    y_true = list(labels_test)
    y_score = list(predicted_scores)
    return y_true, y_score

In [None]:
# Helper expected to come from one of the notebooks loaded with %run above.
# Presumably registers the custom `._.` doc attributes used throughout this
# notebook (goal_labels, predicted_goal_scores, ...) - TODO confirm.
set_spacy_extensions()

In [None]:
# Obtain the labeled project docs, the unlabeled project docs and the goal docs:
# either rebuilt from the raw CSV / Excel sources or restored from the pickles.
# NOTE: pickle.load can execute arbitrary code - only load pickles written by this project.
if not load_docs:
    projects_df = pandas.read_csv('data/ris3-mcat-projects-cleaned-' + language + '.csv', sep='\t')
    goals_df = pandas.read_excel('data/un-goals.xlsx')

    project_docs, unlabeled_project_docs, goal_docs = generate_project_and_goal_docs(projects_df, goals_df)
else:
    pickle_prefix = 'data/pickles/'
    pickle_suffix = '_optimized_' + language + '.pkl'
    with open(pickle_prefix + 'project_docs_labeled' + pickle_suffix, 'rb') as f:
        project_docs = pickle.load(f)
    with open(pickle_prefix + 'project_docs_unlabeled' + pickle_suffix, 'rb') as f:
        unlabeled_project_docs = pickle.load(f)
    with open(pickle_prefix + 'goal_docs' + pickle_suffix, 'rb') as f:
        goal_docs = pickle.load(f)

In [None]:
# Optionally write the docs currently in memory back to the pickle files
# (same file names the loading cell reads).
if dump_docs:
    _pickle_targets = (
        ('project_docs_labeled', project_docs),
        ('project_docs_unlabeled', unlabeled_project_docs),
        ('goal_docs', goal_docs),
    )
    for _stem, _payload in _pickle_targets:
        with open('data/pickles/' + _stem + '_optimized_' + language + '.pkl', 'wb') as f:
            pickle.dump(_payload, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Count how many labeled project docs carry each of the 14 goal labels.
occurrences = np.zeros(14)
for pdoc in project_docs:
    for l in pdoc._.goal_labels:
        occurrences[l] += 1

# Pair every count with its goal name and sort by descending count (ties fall back
# to the name, also descending). After this, `occurrences` and `label_names` are tuples.
# assumes goal_name_mapping yields its names in label-index order - TODO confirm
label_names = list(goal_name_mapping.values())
occurrences, label_names = zip(*sorted([(o,l) for o,l in zip(occurrences, label_names)], reverse=True))

In [None]:
# Bar chart of label occurrences, most frequent goal first (order set by the previous cell).
plt.rc('axes', labelsize=14)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=12)    # fontsize of the tick labels
x = np.arange(len(occurrences))
fig, ax = plt.subplots(figsize = (12, 3))
ax.bar(x, occurrences)
ax.set_ylabel('occurrences')
ax.set_xticks(x)
# Goal names are long, so the tick labels are drawn vertically.
ax.set_xticklabels(label_names, rotation='vertical')
plt.show()

In [None]:
# Grid-search the Keras classifier over hidden-layer layouts and epoch counts
# (3-fold cross-validation, configurations ranked by LRAP).
# NOTE(review): `training_project_docs` is first assigned in a later cell (the random
# train/test split). Unless one of the %run notebooks defines it, this cell raises
# NameError on a fresh kernel - run the split cell first or move it above this one.
p_ranges = {'hidden_layer_sizes': [(100,), (100,50)], 
            'n_epochs': [5, 10]}
grid = optimize_classifier(training_project_docs, goal_docs, algorithm='keras', 
                           parameter_ranges=p_ranges, scale_features=True,
                           n_folds=3, metric='lrap', verbose=False)

In [None]:
# Show up to 50 configurations; optimize_classifier already sorted them best-first.
grid.head(50)

In [None]:
# Persist the optimisation grid as a spreadsheet.
grid.to_excel('results/optimization.xlsx')

In [None]:
# Hold out 10 randomly chosen labeled docs as the test set (fixed seed, so the split
# is reproducible); every other labeled doc goes to the training set.
random.seed(10)
test_project_docs = random.sample(project_docs, 10)
training_project_docs = list(filter(lambda pdoc: pdoc not in test_project_docs, project_docs))

# Sizes of the two splits, one per line.
print(len(training_project_docs), len(test_project_docs), sep='\n')

In [None]:
# Build one 14-position 0/1 indicator vector per training doc
# (same encoding as prepare_data's 'sparse' label format).
y_true_training = []
for pdoc in training_project_docs:
    yt = [1 if i in pdoc._.goal_labels else 0 for i in range(14)]
    y_true_training.append(yt)

In [None]:
# Label frequencies of the training split; passed to compute_imbalance_bias_metrics in the
# evaluation cell. The helper is defined outside this notebook (presumably in one of the
# %run notebooks), so the exact shape of its result is not visible here.
training_frequencies = calculate_label_frequencies(y_true_training)

In [None]:
# Train an SVM on the weak-label-extended training set and score the held-out test docs.
# NOTE(review): `extended_training_project_docs` is only assigned in the last cell of this
# notebook (assign_weak_labels). On a fresh top-to-bottom run this cell raises NameError
# unless that cell has been executed first.
parameters={'C':512, 'gamma':0.0009765625}
y_true_test, y_score_test = test_classifier(extended_training_project_docs, test_project_docs, goal_docs,
                                            feature_type='goal_similarities', scale_features=True,
                                            algorithm='SVM', parameters=parameters)

In [None]:
# Re-run the evaluation notebook so its latest function definitions are in scope.
%run evaluation.ipynb
ranking_metrics = compute_ranking_metrics(y_true_test, y_score_test)

# NOTE(review): the overall classification metrics use threshold=0.6 while the per-class
# metrics below use threshold=0.5 - confirm the difference is intentional.
classification_metrics = compute_classification_metrics(y_true_test, y_score_test, 
                                                                   label_selection_method='threshold',
                                                                   threshold=0.6)

classification_metrics_per_class = compute_binary_classification_metrics_per_class(y_true_test, y_score_test, 
                                                                                   label_selection_method='threshold',
                                                                                   threshold=0.5)

print('RESULTS ON TEST SET')
print('\nRanking metrics:')
print_metrics(ranking_metrics)
print('\nClassification metrics:')
print_metrics(classification_metrics)
print('\nClassification metrics per class:')
for l in classification_metrics_per_class:
    print('\nLabel:', l)
    print_metrics(classification_metrics_per_class[l])
    
# Relates the per-class results to the training label frequencies computed earlier.
bias_metrics = compute_imbalance_bias_metrics(classification_metrics_per_class, training_frequencies)

print('\nImbalance Bias metrics:')
print_metrics(bias_metrics)

In [None]:
# Render the predictions for the test docs. The helper is defined outside this notebook;
# presumably it reads the `._.predicted_goal_scores` written by test_classifier - TODO confirm.
visualize_output(test_project_docs, goal_docs, percentile_highlighted_words=75, use_colors=True) 

In [None]:
# Extend the training set with weak labels: a random forest scores the unlabeled docs in
# batches of 25, and every label scoring at least 0.5 is assigned (docs that end up with
# no label are dropped).
# NOTE(review): the SVM test cell further up consumes `extended_training_project_docs`,
# so this cell has to run before it.
extended_training_project_docs = assign_weak_labels(training_project_docs, unlabeled_project_docs, goal_docs, 
                                                    algorithm='RandomForest',
                                                    parameters={'max_depth':None, 'min_samples_leaf':1,
                                                                'min_samples_split':4, 'n_estimators':200},
                                                    batch_size=25, label_selection_method='threshold', threshold=0.5,
                                                    feature_type='goal_similarities', scale_features=True,  
                                                    verbose=False)

# Cache the extended training set; the file name records the algorithm and threshold used.
with open('data/pickles/extended_training_project_docs_RF_threshold_0.5.pkl', 'wb') as f:
    pickle.dump(extended_training_project_docs, f, protocol=pickle.HIGHEST_PROTOCOL)