In [17]:
import pandas as pd
import numpy as np

from gensim.test.utils import datapath, get_tmpfile
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

# A demo of the feature extractor, which extracts features from design documents so they can be fed into our ML algorithm

### Load GloVe word vectors as a model and as a dictionary

In [18]:
# Parse the raw GloVe text file into a {word: vector} dictionary.
# Each line is split on whitespace: the first field is the word, the rest are
# the vector components, stored as a float32 numpy array.
word_vecs = {}

# The context manager closes the file handle as soon as parsing finishes
# (previously the handle stayed open for the lifetime of the kernel).
with open("glove.6B.300d.txt", encoding="utf8") as file:
    for line in file:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        vector = np.asarray(values[1:], "float32")
        word_vecs[word] = vector

In [13]:
# Convert the GloVe text file to word2vec text format so gensim can load it.
# The input is referenced relative to the working directory -- the same place
# the dictionary-loading cell opens "glove.6B.300d.txt" from -- rather than
# through a user-specific absolute path, so the notebook is not tied to one machine.
glove_file = "glove.6B.300d.txt"
# The converted copy is written to gensim's temporary-file location.
word2vec_glove_file = get_tmpfile("glove.6B.300d.word2vec.txt")
glove2word2vec(glove_file, word2vec_glove_file)

(400001, 300)

In [14]:
# Build the gensim KeyedVectors model from the converted word2vec-format text file
model = KeyedVectors.load_word2vec_format(fname=word2vec_glove_file, binary=False)

#### Let's say we are looking for the 'specifications' within a document. The model can tell us which words are most similar to this term.

In [68]:
# Ten nearest neighbours of 'specifications' in the embedding space
model.most_similar(positive=['specifications'], topn=10)

[('specification', 0.7413736581802368),
 ('requirements', 0.587746262550354),
 ('specified', 0.5779179930686951),
 ('specifies', 0.5399518609046936),
 ('standards', 0.5142701864242554),
 ('parameters', 0.511261522769928),
 ('design', 0.48775291442871094),
 ('configurations', 0.4876812994480133),
 ('criteria', 0.4871051013469696),
 ('prototypes', 0.4829327464103699)]

#### We read the first page of the pdf document, which contains the 'Table of Contents'.

In [4]:
import PyPDF2

# Open the design document and collect the whitespace-separated tokens of its
# first page. The file object is left open on purpose: pdfReader is used again
# further down to fetch another page.
pdfFileObj = open('design1.pdf', 'rb')
pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
pageObj = pdfReader.getPage(0)  # index 0 is the first page
p1 = pageObj.extractText().split()

In [50]:
# Display the whitespace-separated tokens extracted from the Table of Contents page
p1

['Table',
 'of',
 'Contents',
 '1)',
 'X',
 '2',
 '2)',
 'Y',
 '7',
 '3)',
 'Configuration',
 '9']

#### In this case, the 'specifications' section is titled 'Configuration'. Loop through every word in the Table of Contents, compute its similarity to 'specification', and print the word if the similarity is above a certain threshold.


In [69]:
# Scan the Table of Contents tokens for words whose GloVe vector has a large
# dot product with the vector for 'specification', and report the token that
# follows each match as its page number.
SIMILARITY_THRESHOLD = 20  # minimum raw dot-product score for a token to count as a match

# The target vector is the same for every token, so look it up once.
target_vec = word_vecs['specification']

for word_n, token in enumerate(p1):
    word = token.lower()  # the lookup below uses the lower-cased token
    if word not in word_vecs:
        continue
    # np.dot of two 1-D vectors is already a scalar; no further reduction is needed.
    sim = np.dot(word_vecs[word], target_vec)
    if sim > SIMILARITY_THRESHOLD:
        print(word)
        # Guard against a match on the very last token, where nothing follows it;
        # indexing p1[word_n + 1] unconditionally would raise IndexError there.
        if word_n + 1 < len(p1):
            page_num = p1[word_n + 1]
            print('Page number:', page_num)

configuration
Page number: 9


#### The page number should follow immediately after the target term. Get the page number from the Table of Contents, and read that specific page instead of going through the whole document.

In [61]:
# Read the page that the Table of Contents lists for the matched section
# (page 9, the configuration page, for this document).
# getPage() takes a zero-based index (index 0 is the first page), hence the -1.
# The index is kept in its own variable instead of overwriting page_num, so
# re-running this cell does not subtract 1 a second time and read the wrong page.
page_index = int(page_num) - 1
pageObj = pdfReader.getPage(page_index)
p9 = pageObj.extractText()

In [None]:
# function to replace newline characters (\n) with spaces
import re
def remove_rn(a):
    """Return `a` with every newline character replaced by a single space."""
    return a.replace("\n", " ")

## Show the features extracted from the configuration page specifically

In [67]:
# Display the configuration page text with newlines replaced by spaces
remove_rn(p9)

'Configuration:     Network Protocol: HTTPS   Requires a 64 - bit processor and operating system   OS: *Windows 7* / 8 / 8.1 / 10 (64 - bit OS required)   Processor: Intel Core i5 - 7500   Memory: 8 GB RAM   Graphics: NVIDIA GeForce GTX 1060 (3GB)   DirectX: Version 11   Network: Broadband Internet connection   Storage:  50 GB Storage   Sound Card: DirectX 11 sound card       '