# Try DEEP-COSMIC-UC on your use cases

Train DEEP-COSMIC-UC on the selected projects and use it to approximate the size of your own use cases.

## Imports

In [15]:
import pandas as pd

import itertools

import talos as ta

from gensim.models import KeyedVectors

from paths import input_folder, output_folder, word_embeddings_folder

from helpers import create_embedding_matrix, tokens2index, texts2index_padded, prune_wv

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import gc

from keras.models import Model
from keras.optimizers import Adam, Nadam
from keras.layers import TimeDistributed, GlobalMaxPooling1D, GlobalAveragePooling1D, Activation, Input, LSTM, GRU, Dense, Dropout, Flatten, Embedding, SpatialDropout1D, Bidirectional, CuDNNGRU
from keras.layers.convolutional import Conv1D, MaxPooling1D, AveragePooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers import Flatten, concatenate
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, activations
from keras.layers.recurrent import Recurrent
from keras.engine import InputSpec
from keras.callbacks import ModelCheckpoint, EarlyStopping, LearningRateScheduler, ReduceLROnPlateau, CSVLogger
from keras.wrappers.scikit_learn import KerasRegressor

import spacy
from spacy.symbols import ORTH, LEMMA, POS, TAG, LOWER

from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, mean_absolute_error

import tensorflow as tf
from tensorflow import set_random_seed
from keras import backend as K

# This part required only for GPU
# TensorFlow 1.x session configuration: enable `allow_growth` on the GPU options
# and select the 'BFC' allocator.
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.allocator_type = 'BFC'
# Create a session with that configuration and register it with the Keras backend.
sess = tf.Session(config=config)
K.set_session(sess)

# Seed value; it is passed to NumPy and TensorFlow in the training cell.
random_seed = 102329

## Load training data

Select the projects that are similar to yours; only use cases from the selected projects are used for training.

In [9]:
# IDs of the projects whose use cases are kept for training.
# Remove entries to restrict training to projects similar to yours.
projects = [
    'P01', 'P02', 'P06', 'P09', 'P15', 'P17', 'P18', 'P05', 'P07',
    'P14', 'P13', 'P11', 'P08', 'P10', 'P16', 'P03', 'P04', 'P19',
    'P20', 'P21', 'P22', 'P23', 'P24', 'P25', 'P27', 'P26', 'P12',
]

In [10]:
# Load the use-case table; the first CSV column is used as the index.
# NOTE: the path is built by plain string concatenation, so `input_folder`
# must already end with a path separator.
use_cases_df = pd.read_csv(f"{input_folder}use-cases.csv", index_col=0)
use_cases_df.head(5)

Unnamed: 0,ProjectID,UC,TransTypes,UCType,Cfp,TitleTokens
0,P01,UC2-1-1,C|D|R|U,C|D|R|U,16,manage faculties crud
1,P01,UC2-1-10,DL|L|R,L,27,assign science olympiads major specialty edit ...
2,P01,UC2-1-11,CS|R,CS,7,manage ranking algorithms
3,P01,UC2-1-13,C|D|R|U,C|D|R|U,17,manage exams crud
4,P01,UC2-1-14,DL|L|R,L,27,manage assignments exams majors specialties


In [12]:
# Keep only the rows whose ProjectID appears in `projects`.
use_cases_df = use_cases_df[use_cases_df['ProjectID'].isin(projects)]
use_cases_df.head(5)

Unnamed: 0,ProjectID,UC,TransTypes,UCType,Cfp,TitleTokens
0,P01,UC2-1-1,C|D|R|U,C|D|R|U,16,manage faculties crud
1,P01,UC2-1-10,DL|L|R,L,27,assign science olympiads major specialty edit ...
2,P01,UC2-1-11,CS|R,CS,7,manage ranking algorithms
3,P01,UC2-1-13,C|D|R|U,C|D|R|U,17,manage exams crud
4,P01,UC2-1-14,DL|L|R,L,27,manage assignments exams majors specialties


In [3]:
# Split every space-separated TitleTokens string into a list of tokens.
# NOTE(review): the training inputs are built from this list while the targets are read
# from use_cases_df['Cfp'], so this cell must be (re-)run after the project filter to keep
# both aligned.
tokenized_names = [name.split(" ") for name in use_cases_df['TitleTokens'].tolist()]

In [4]:
# Preview the first five tokenised training titles.
tokenized_names[:5]

[['manage', 'faculties', 'crud'],
 ['assign', 'science', 'olympiads', 'major', 'specialty', 'edit', 'delete'],
 ['manage', 'ranking', 'algorithms'],
 ['manage', 'exams', 'crud'],
 ['manage', 'assignments', 'exams', 'majors', 'specialties']]

In [13]:
# Load the train / validation id tables of the 10-fold split files
# (first CSV column is used as the index).
train_ids = pd.read_csv(f"{input_folder}10-fold-train-full.csv", index_col=0)
val_ids = pd.read_csv(f"{input_folder}10-fold-val-full.csv", index_col=0)

# Largest `run` / `k` value plus one — presumably the number of repetitions and folds,
# assuming both columns are zero-based (TODO confirm).
runs = train_ids.run.max() + 1 
k = train_ids.k.max() + 1

## Prepare your use cases

In [51]:
# add the names of your use cases here
# (one title per list element; each title is tokenised with spaCy before prediction)
use_cases = ["Add a paper.", "Delete a paper."]

In [52]:
# spaCy pipeline used to tokenise the use-case titles.
nlp = spacy.load('en_core_web_lg')

# English stop-word set shipped with spaCy, and the punctuation / tab / newline
# characters that are removed from the titles.
stop_words = spacy.lang.en.stop_words.STOP_WORDS
filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [53]:
# Normalise every use-case title into a single-space separated string of lower-cased
# tokens, dropping whitespace tokens, tokens found in `filters`, and stop words.
titles = []
for text in use_cases:
    doc = nlp(text)
    tokens = " ".join([
        token.lower_
        for token in doc
        if not token.orth_.isspace()
        # `filters` is a string, so this is a substring test on the raw token text.
        and token.orth_ not in filters
        # The stop-word set holds lower-case entries; compare the lower-cased form so that
        # capitalised stop words (e.g. a leading "The") are removed as well.
        and token.lower_ not in stop_words
    ])
    titles.append(tokens)

In [54]:
# Split the normalised titles (joined with single spaces) back into token lists.
tokenized_pred_names = [name.split(" ") for name in titles]
tokenized_pred_names[:5]

[['add', 'paper'], ['delete', 'paper']]

In [55]:
# Token lists of the new use cases followed by those of the training use cases;
# this combined list is what `prune_wv` receives when the word vectors are pruned.
tokenized_all_names = tokenized_pred_names + tokenized_names
tokenized_all_names[:10]

[['add', 'paper'],
 ['delete', 'paper'],
 ['manage', 'faculties', 'crud'],
 ['assign', 'science', 'olympiads', 'major', 'specialty', 'edit', 'delete'],
 ['manage', 'ranking', 'algorithms'],
 ['manage', 'exams', 'crud'],
 ['manage', 'assignments', 'exams', 'majors', 'specialties'],
 ['manage', 'courses', 'crud'],
 ['manage', 'assignments', 'courses', 'majors', 'specialties'],
 ['manage', 'grading', 'scales', 'crud']]

## DEEP-COSMIC-UC model

In [19]:
def get_deep_cosmic_uc_model():
    """Build and compile the DEEP-COSMIC-UC network.

    Architecture: fixed pre-trained embedding -> two (Conv1D + MaxPooling1D)
    stages -> dropout -> global average pooling -> one linear output unit.

    Reads the notebook-level ``MAX_SEQUENCE_LENGTH`` and ``embedding_matrix``,
    which must be defined before this function is called.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, MAE loss; MSE and MAE are
        reported as metrics).
    """
    vocab_size, vector_dim = embedding_matrix.shape

    # The embedding weights come from `embedding_matrix` and are not trained.
    frozen_embedding = Embedding(
        input_dim=vocab_size,
        output_dim=vector_dim,
        mask_zero=False,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )

    token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name="input")
    features = frozen_embedding(token_ids)

    # Two convolution stages (10, then 12 filters), each followed by max pooling
    # with pool size 2.
    for n_filters in (10, 12):
        features = Conv1D(filters=n_filters, kernel_size=2, strides=1,
                          padding='same', activation='relu')(features)
        features = MaxPooling1D(pool_size=2)(features)

    features = Dropout(0.5)(features)
    features = GlobalAveragePooling1D()(features)

    # Single linear unit: the predicted size.
    prediction = Dense(1, activation='linear')(features)

    network = Model([token_ids], prediction)
    network.compile(
        optimizer=Adam(lr=0.001, beta_1=0.95, beta_2=0.999, epsilon=1e-08, decay=0.0),
        loss='mae',
        metrics=['mse', 'mae'],
    )
    return network

In [56]:
# Load the word vectors from a word2vec text-format file (binary=False);
# the file name indicates 300-dimensional GloVe 6B vectors.
wv = KeyedVectors.load_word2vec_format(f"{word_embeddings_folder}w2v_glove.6B.300d.txt", binary=False)

In [57]:
# Prune the word vectors using the combined token lists (see `prune_wv` in helpers).
# NOTE: `wv` is rebound here, so re-running this cell without reloading the vectors
# prunes an already pruned model.
wv = prune_wv(wv, tokenized_all_names)
# Globals read by get_deep_cosmic_uc_model(): fixed input length and the embedding weights.
MAX_SEQUENCE_LENGTH, VECTOR_LENGTH = 16, wv.vector_size
embedding_matrix = create_embedding_matrix(wv, VECTOR_LENGTH)
# Not the last expression of the cell, so this shape is not displayed;
# the cell output is the return value of gc.collect().
embedding_matrix.shape
gc.collect()

0

In [59]:
# Encode the training titles with `texts2index_padded` (helpers) using sequence length
# MAX_SEQUENCE_LENGTH; tokens missing from the vocabulary are reported in the cell output.
x = texts2index_padded(tokenized_names, wv, seq_length=MAX_SEQUENCE_LENGTH)
# Regression target: the Cfp column of the filtered use-case table.
y = use_cases_df['Cfp']

'oek' is not in the vocabulary - 'bind kek oek'
'oeks' is not in the vocabulary - 'report bindings oeks keks'
'keks' is not in the vocabulary - 'report bindings oeks keks'
'keks' is not in the vocabulary - 'report bindings keks modules'
'oek' is not in the vocabulary - 'add oek'
'swdpi' is not in the vocabulary - 'notifications desktop swdpi system user dashboard'


In [60]:
# Build the model once just to print its layer summary (this instance is discarded).
get_deep_cosmic_uc_model().summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 16)                0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 16, 300)           134100    
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 16, 10)            6010      
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 8, 10)             0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 8, 12)             252       
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 4, 12)             0         
_________________________________________________________________
dropout_5 (Dropout)          (None, 4, 12)             0         
__________

In [61]:
# Seed NumPy and TensorFlow before the model is built and trained.
np.random.seed(random_seed)
set_random_seed(random_seed)

# NOTE(review): min_lr equals the optimizer's initial learning rate (Adam lr=0.001 in
# get_deep_cosmic_uc_model), so this callback can never actually lower the rate — the
# training log reports "reducing learning rate to 0.001" every time it fires.
# Confirm whether a smaller floor was intended before changing it.
callbacks_list = [
    ReduceLROnPlateau( 
        monitor='loss',
        min_lr=0.001, 
        factor=0.5,
        verbose=1,
        patience=10) 
]

# Wrap the model builder as a scikit-learn style regressor (400 epochs, batch size 200)
# and place it in a single-step Pipeline.
estimators = []
estimators.append(('cnn', KerasRegressor(build_fn=get_deep_cosmic_uc_model, epochs=400,
                                        batch_size=200, verbose=2, callbacks=callbacks_list))) 

model = Pipeline(estimators)

# Fit on all selected use cases; no validation data is passed here.
model.fit(x, y)


Epoch 1/400
 - 0s - loss: 9.2690 - mean_squared_error: 125.2693 - mean_absolute_error: 9.2690
Epoch 2/400
 - 0s - loss: 9.2506 - mean_squared_error: 124.9250 - mean_absolute_error: 9.2506
Epoch 3/400
 - 0s - loss: 9.2308 - mean_squared_error: 124.5710 - mean_absolute_error: 9.2308
Epoch 4/400
 - 0s - loss: 9.2148 - mean_squared_error: 124.2741 - mean_absolute_error: 9.2148
Epoch 5/400
 - 0s - loss: 9.1968 - mean_squared_error: 123.9451 - mean_absolute_error: 9.1968
Epoch 6/400
 - 0s - loss: 9.1780 - mean_squared_error: 123.5751 - mean_absolute_error: 9.1780
Epoch 7/400
 - 0s - loss: 9.1531 - mean_squared_error: 123.1541 - mean_absolute_error: 9.1531
Epoch 8/400
 - 0s - loss: 9.1338 - mean_squared_error: 122.7814 - mean_absolute_error: 9.1338
Epoch 9/400
 - 0s - loss: 9.1123 - mean_squared_error: 122.3901 - mean_absolute_error: 9.1123
Epoch 10/400
 - 0s - loss: 9.0875 - mean_squared_error: 121.8882 - mean_absolute_error: 9.0875
Epoch 11/400
 - 0s - loss: 9.0524 - mean_squared_error: 121

Epoch 88/400
 - 0s - loss: 4.3288 - mean_squared_error: 40.2552 - mean_absolute_error: 4.3288
Epoch 89/400
 - 0s - loss: 4.2669 - mean_squared_error: 39.9498 - mean_absolute_error: 4.2669
Epoch 90/400
 - 0s - loss: 4.3434 - mean_squared_error: 41.8776 - mean_absolute_error: 4.3434
Epoch 91/400
 - 0s - loss: 4.2436 - mean_squared_error: 40.0707 - mean_absolute_error: 4.2436
Epoch 92/400
 - 0s - loss: 4.4134 - mean_squared_error: 42.2024 - mean_absolute_error: 4.4134

Epoch 00092: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 93/400
 - 0s - loss: 4.2952 - mean_squared_error: 40.6757 - mean_absolute_error: 4.2952
Epoch 94/400
 - 0s - loss: 4.3329 - mean_squared_error: 41.1915 - mean_absolute_error: 4.3329
Epoch 95/400
 - 0s - loss: 4.3446 - mean_squared_error: 42.2833 - mean_absolute_error: 4.3446
Epoch 96/400
 - 0s - loss: 4.2840 - mean_squared_error: 40.8078 - mean_absolute_error: 4.2840
Epoch 97/400
 - 0s - loss: 4.1772 - mean_squared_error: 38.6783 - mean_absolute_error: 4.

Epoch 173/400
 - 0s - loss: 3.6327 - mean_squared_error: 31.0078 - mean_absolute_error: 3.6327
Epoch 174/400
 - 0s - loss: 3.6736 - mean_squared_error: 31.8726 - mean_absolute_error: 3.6736
Epoch 175/400
 - 0s - loss: 3.6339 - mean_squared_error: 31.9462 - mean_absolute_error: 3.6339
Epoch 176/400
 - 0s - loss: 3.7111 - mean_squared_error: 32.0066 - mean_absolute_error: 3.7111
Epoch 177/400
 - 0s - loss: 3.6317 - mean_squared_error: 30.7664 - mean_absolute_error: 3.6317
Epoch 178/400
 - 0s - loss: 3.7189 - mean_squared_error: 33.3736 - mean_absolute_error: 3.7189
Epoch 179/400
 - 0s - loss: 3.6632 - mean_squared_error: 31.1676 - mean_absolute_error: 3.6632
Epoch 180/400
 - 0s - loss: 3.7068 - mean_squared_error: 32.4194 - mean_absolute_error: 3.7068

Epoch 00180: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 181/400
 - 0s - loss: 3.6634 - mean_squared_error: 32.6888 - mean_absolute_error: 3.6634
Epoch 182/400
 - 0s - loss: 3.7135 - mean_squared_error: 33.0153 - mean_absolute

Epoch 256/400
 - 0s - loss: 3.3120 - mean_squared_error: 28.9947 - mean_absolute_error: 3.3120
Epoch 257/400
 - 0s - loss: 3.3008 - mean_squared_error: 28.2269 - mean_absolute_error: 3.3008
Epoch 258/400
 - 0s - loss: 3.3836 - mean_squared_error: 28.2622 - mean_absolute_error: 3.3836
Epoch 259/400
 - 0s - loss: 3.4499 - mean_squared_error: 29.3267 - mean_absolute_error: 3.4499
Epoch 260/400
 - 0s - loss: 3.2785 - mean_squared_error: 25.9453 - mean_absolute_error: 3.2785
Epoch 261/400
 - 0s - loss: 3.2413 - mean_squared_error: 27.2614 - mean_absolute_error: 3.2413
Epoch 262/400
 - 0s - loss: 3.3753 - mean_squared_error: 27.6347 - mean_absolute_error: 3.3753
Epoch 263/400
 - 0s - loss: 3.3565 - mean_squared_error: 27.4701 - mean_absolute_error: 3.3565
Epoch 264/400
 - 0s - loss: 3.3197 - mean_squared_error: 27.3342 - mean_absolute_error: 3.3197

Epoch 00264: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 265/400
 - 0s - loss: 3.2147 - mean_squared_error: 27.2686 - mean_absolute

Epoch 339/400
 - 0s - loss: 3.1537 - mean_squared_error: 25.6258 - mean_absolute_error: 3.1537
Epoch 340/400
 - 0s - loss: 2.9804 - mean_squared_error: 24.3230 - mean_absolute_error: 2.9804
Epoch 341/400
 - 0s - loss: 3.0826 - mean_squared_error: 25.2882 - mean_absolute_error: 3.0826
Epoch 342/400
 - 0s - loss: 3.0245 - mean_squared_error: 25.4349 - mean_absolute_error: 3.0245
Epoch 343/400
 - 0s - loss: 2.9893 - mean_squared_error: 23.4456 - mean_absolute_error: 2.9893

Epoch 00343: ReduceLROnPlateau reducing learning rate to 0.001.
Epoch 344/400
 - 0s - loss: 3.1414 - mean_squared_error: 26.1211 - mean_absolute_error: 3.1414
Epoch 345/400
 - 0s - loss: 3.1080 - mean_squared_error: 25.4939 - mean_absolute_error: 3.1080
Epoch 346/400
 - 0s - loss: 3.1478 - mean_squared_error: 24.4498 - mean_absolute_error: 3.1478
Epoch 347/400
 - 0s - loss: 3.0781 - mean_squared_error: 24.7206 - mean_absolute_error: 3.0781
Epoch 348/400
 - 0s - loss: 3.0239 - mean_squared_error: 24.3202 - mean_absolute

Pipeline(memory=None,
     steps=[('cnn', <keras.wrappers.scikit_learn.KerasRegressor object at 0x000001464C5AA358>)])

## Predict size of use cases

In [62]:
# Encode the new use-case titles exactly like the training titles (same vectors, same length).
x_pred = texts2index_padded(tokenized_pred_names, wv, seq_length=MAX_SEQUENCE_LENGTH)

In [63]:
# Predict the size of each new use case with the fitted pipeline.
y_pred = model.predict(x_pred)

In [64]:
# Tabulate the raw predictions next to their rounded values
# (np.round keeps the float dtype, so 'Cfp-int' shows values such as 6.0).
predictions = pd.DataFrame({'use case':use_cases, 'Cfp-pred':y_pred, 'Cfp-int':np.round(y_pred)})

In [65]:
# Display the prediction table.
predictions

Unnamed: 0,use case,Cfp-pred,Cfp-int
0,Add a paper.,5.589661,6.0
1,Delete a paper.,6.42612,6.0
