In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('input1.csv')

In [4]:
# Show a few examples and structure of data
df.head()

Unnamed: 0,document_id,page_no,features,tech_debt,security,arch
0,1122,8,Network Protocol: HTTP,0,1,0
1,1122,8,OS: Windows 8 / 8.1 / 10 (64-bit OS required),0,0,0
2,1122,8,OS: Windows 7 / 8 / 8.1 / 10 (64-bit OS required),1,0,0
3,1122,8,Memory: 4 GB RAM,0,0,1
4,1122,9,Graphics: NVIDIA GeForce GTX 750Ti(2 GB),0,0,0


The data here was obtained from Steam for specific game requirements. This is meant to mimic the nature of specifications data extracted from design documents. Two main (reasonable) assumption:
- Certain products/specifications will have multiple instances across our dataset
- A certain category will have various specifications corresponding to it. Risk categories chosen: Technology debt, Security, Architecture

In [5]:
import re
import string

# Function to to do some basic cleanup
def clean_all(text):
    text = text.lower()  # make lowercase
    text = text.encode('ascii', 'ignore').decode('ascii')  # remove emoji characters
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)  # remove punctuation
    return text

In [6]:
# New column for the cleaned text
df['cleaned_feat'] = df['features'].apply(clean_all)

In [7]:
df.head()

Unnamed: 0,document_id,page_no,features,tech_debt,security,arch,cleaned_feat
0,1122,8,Network Protocol: HTTP,0,1,0,network protocol http
1,1122,8,OS: Windows 8 / 8.1 / 10 (64-bit OS required),0,0,0,os windows 8 8 1 10 64 bit os required
2,1122,8,OS: Windows 7 / 8 / 8.1 / 10 (64-bit OS required),1,0,0,os windows 7 8 8 1 10 64 bit os required
3,1122,8,Memory: 4 GB RAM,0,0,1,memory 4 gb ram
4,1122,9,Graphics: NVIDIA GeForce GTX 750Ti(2 GB),0,0,0,graphics nvidia geforce gtx 750ti 2 gb


### Assign words to vectors here.

Note: Due to the small dataset, we opted to created word vectors from a pre-trained word2vec model i.e. Glove. Note that given a reasonable dataset size, we can create a new word2vec model from scratch, which might lead to better results.

In [8]:
# Load Glove model, publicly available
file = open("glove.6B.300d.txt", encoding="utf8")

# Save in dictionary
word_vecs = {}
for line in file:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], "float32")
    word_vecs[word] = vector

In [27]:
# Assign every feature in the dataeet to set of vectors
# Might be a bit complicated here, essentially just incorporates words not exisiting in the pre-trained model as well
feature_vecs = []
for feature in df.cleaned_feat:
    lst = feature.split()
    num_words = len(lst)
    feature_vec = np.zeros((1, 300))
    for word_n in range(num_words):
        word = lst[word_n]
        if word in word_vecs:
            x = word_vecs[word]
            x = np.expand_dims(np.array(x), axis=-1)
            feature_vec = np.concatenate((feature_vec, x.T))
        else:
            word_splitted = [char for char in word]
            for word2 in word_splitted:
                x = word_vecs[word2]
                x = np.expand_dims(np.array(x), axis=-1)
                feature_vec = np.concatenate((feature_vec, x.T))
    feature_vec = feature_vec[1:, :]
    feature_vecs.append(feature_vec)
feature_vecs = np.array(feature_vecs)

In [28]:
# Compress all the features to consistent vectors for training
num_features = df.shape[0]
num_categories = 3
words_dim = 300
feature_vecs_train = np.zeros((num_features, num_categories, words_dim))

# Use singular value decomposition to get the three strongest features (orthogonal basis vectors) from all feature vectors
for vec_n in range(len(feature_vecs)):
    vec = feature_vecs[vec_n]
    u, s, vh = np.linalg.svd(vec, full_matrices=True)
    vec_reduced = vh[0:3, :]
    feature_vecs_train[vec_n, :, :] = vec_reduced
feature_vecs_train = np.reshape(feature_vecs_train, newshape=(df.shape[0], -1))

In [29]:
# Function to assign all values to in training vectors between 0 and 1
def scale_array(dat, out_range=(0, 1)):
    domain = [np.min(dat, axis=0), np.max(dat, axis=0)]

    def interp(x):
        return out_range[0] * (1.0 - x) + out_range[1] * x

    def uninterp(x):
        b = 0
        if (domain[1] - domain[0]) != 0:
            b = domain[1] - domain[0]
        else:
            b =  1.0 / domain[1]
        return (x - domain[0]) / b
        
    return interp(uninterp(dat))

In [30]:
for i in range(feature_vecs_train.shape[0]):
    feature_vecs_train[i] = scale_array(feature_vecs_train[i])

# Simple 3-layer Neural Network

Simple NN chosen. Mostly to investigate underlying relationships in the network and checking if it overfits

In [31]:
# Get labels for all features first 
train_labels = np.zeros((num_features, num_categories))
for i in range(num_features):
    label = [df.tech_debt[i], df.security[i], df.arch[i]]
    train_labels[i] = label

In [32]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential()
model.add(Dense(200, input_dim=900, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(100, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(50, kernel_initializer='normal', activation='relu'))
# model.add(Dropout(0.3))
model.add(Dense(num_categories, kernel_initializer='normal', activation='sigmoid'))
model.compile(loss='categorical_crossentropy', 
              optimizer=tensorflow.keras.optimizers.Adam(), metrics=['accuracy'])
# Implement dropout for proper training set

In [46]:
model.fit(feature_vecs_train, train_labels, epochs=20, batch_size=2)

Train on 49 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x251255e5cc0>

Loss converging, seems to be overfitting. Consistent with assumptions

Prediction example

In [55]:
a = feature_vecs_train[11]
print('Feature', df.features[11])
a = np.expand_dims(a, axis=-1)
print('Scores for every category:', model.predict(a.T))
print('Truth label:', train_labels[11])

Feature Storage: 5 GB
Scores for every category: [[2.2446131e-02 2.2295958e-06 9.9788707e-01]]
Truth label: [0. 0. 1.]


### True class compartively gets much higher score (scores are close to 0 due to overfitting)

In [52]:
save_model = input('Enter file name for model to save:')
model.save(save_model+'.h5')

Enter file name for model to save:  my1


### 'test1.csv' gives an idea of what our input data would look like, and 'out1.csv' would be the corresponding output