# Notebook to test out different NLP methods for NER and POS

### Imports

In [1]:
import spacy
import nltk
from spacy import displacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

## NER using SpaCy

In [3]:
NER = spacy.load("en_core_web_sm")

In [35]:
raw_text = '''Cinematography (also called \"Direction of Photography\") is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''
result = NER(raw_text.title())
for word in result.ents:
    print(word.text, word.label_)

Cinematography ORG
The Science Or Art Of Motion-Picture Photography By Recording Light Or Other Electromagnetic Radiation WORK_OF_ART
The Light-Sensitive Surface Inside A Camera During A Questioned Exposure WORK_OF_ART


In [7]:
displacy.render(result,style="ent",jupyter=True)

# POS Tagging using NLTK

In [15]:
stop_words = set(stopwords.words('english'))

In [45]:
def tag_sentence(sentence):
    wordsList = word_tokenize(sentence)
    # wordsList = [w for w in wordsList if not w in stop_words]
    tagged = nltk.pos_tag(wordsList)
    return tagged

In [46]:
while (True):
    raw_text = input("User: ")
    tokenized = sent_tokenize(raw_text)
    
    for sentence in tokenized:
        tagged = tag_sentence(sentence)
        
        print("Model: ", tagged)

User: Cinematography is the science or art of motion-picture photography by recording light.
Model:  [('Cinematography', 'NN'), ('is', 'VBZ'), ('the', 'DT'), ('science', 'NN'), ('or', 'CC'), ('art', 'NN'), ('of', 'IN'), ('motion-picture', 'NN'), ('photography', 'NN'), ('by', 'IN'), ('recording', 'VBG'), ('light', 'NN'), ('.', '.')]


KeyboardInterrupt: Interrupted by user

## Building a combined Approach

In [2]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

entities = []

ner_result = NER(raw_text.title())

sentences = sent_tokenize(raw_text)
for sentence in sentences:
    pos_tagged = tag_sentence(sentence)
    for tags in pos_tagged:
        if tags[1] in ["NNP", "NN"]:
            entities.append(tags[0].lower())
        # End if
    # End for
# End for
print (entities)

NameError: name 'NER' is not defined

## Using an existing codebase

In [21]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

### My Custom Solution

In [26]:
from subjectVerbObject import findSubjectVerbObjects, nlp
tokens = nlp(raw_text)
svos = findSubjectVerbObjects(tokens)
for svo in svos:
    print (svo)

[]


### Solution with more SVOs identified

In [3]:
from subject_verb_object_extract import findSVOs, nlp
tokens = nlp(raw_text)
svos = findSVOs(tokens)
for svo in svos:
    print (svo)

('light', 'record', 'the science')
('other electromagnetic radiation', 'record', 'the science')
('the light', 'focus', 'a lens')
('objects into on the sensitive surface inside a camera', 'reflect', 'the light')
('a exposure', 'reflect', 'the light')
('a exposure', 'questioned')
('an electrical charge at each pixel ,', 'produce', '\n')
('a video file for subsequent display', 'process', 'an electrical charge at')
('a video file for', 'store', 'an electrical charge at')
('a video file for', 'store', 'an electrical charge at')


### Solution with "IS" SVOs identified

In [20]:
from subject_object_extraction import findSVAOs, findSVOs
import en_core_web_sm
parser = en_core_web_sm.load(disable=['ner','textcat'])
parse = parser(raw_text)
svaos = findSVAOs(parse)
for svao in svaos:
    print (svao)

('cinematography', 'is', 'science')
('science', 'recording', 'light')
('lens', 'focus', 'repeatedly focus creating')
('lens', 'focus', 'light')
('\n', 'produces', 'electrical charge')


### Combining the above two in a cleaner version

In [2]:
from subject_verb_object_extract import findSVOs, nlp
from subject_object_extraction import findSVAOs

In [3]:
def retrieve_SVAOs(text):
    # Tokenizing
    tokens = nlp(text)

    # Finding SVOs and SVAOs
    svos = findSVOs(tokens)
    svaos = findSVAOs(tokens)

    # Removing the tuples with less then 2 items
    # and cleaning for \n objects/subjects
    svos = [svo for svo in svos if len(svo) >= 3 and svo[0] != "\n" and svo[2] != "\n"]
    svaos = [svao for svao in svaos if len(svaos) >= 3 and svao[0] != "\n" and svao[2] != "\n"]

    # Merging the two
    svao_total = svos + svaos
    
    return svao_total

In [4]:
def get_similar_SVAOs(svaos_context, svaos_source):
    similar_svaos = []
    for svao_context in svaos_context:
        for svao_source in svaos_source:
            toMatch = svao_context[0].lower()
            subject_candidate = svao_source[0].lower()
            object_candidate = svao_source[2].lower()
            if toMatch == subject_candidate or toMatch == object_candidate:
                similar_svaos.append(svao_source)
        # End for
    # End for
    return similar_svaos

In [15]:
raw_text = '''Cinematography is the science or art of motion-picture photography by recording light or other electromagnetic radiation, either electronically by means of an image sensor, or chemically by means of a light-sensitive material such as film stock.
Typically, a lens is used to repeatedly focus the light reflected from objects into real images on the light-sensitive surface inside a camera during a questioned exposure, creating multiple images.
With an electronic image sensor, this produces an electrical charge at each pixel, which is electronically processed and stored in a video file for subsequent display or processing.'''

user_text_1_1 = '''Hi buddy, What do you think about cinematography'''


raw_text_2 = '''Blue is one of the three primary colours of pigments in painting and traditional colour theory, as well as in the RGB colour model.
It lies between violet and green on the spectrum of visible light.
The eye perceives blue when observing light with a dominant wavelength between approximately 450 and 495 nanometres.
Most blues contain a slight mixture of other colors; azure contains some green, while ultramarine contains some violet.
The clear daytime sky and the deep sea appear blue because of an optical effect known as Rayleigh scattering.
An optical effect called Tyndall scattering explains blue eyes.
Distant objects appear more blue because of another optical effect called atmospheric perspective.'''
user_text_2_1 = '''Blue is my favorite primary color.'''

raw_text_3 = '''The Royal Blue was the Baltimore and Ohio Railroad (B&O)'s flagship passenger train between New York City and Washington, D.C., in the United States, beginning in 1890.'''
user_text_3_1 = '''Blue is always nice. I like royal blue.'''


In [16]:
svaos_source = retrieve_SVAOs(raw_text)
# Printing
for svao in svaos_source:
    print (svao)
# End for
print ("-------------------------------------------------")
svaos_context = retrieve_SVAOs(user_text_1_1)
# Printing
for svao in svaos_context:
    print (svao)
# End for

('light', 'record', 'the science')
('other electromagnetic radiation', 'record', 'the science')
('the light', 'focus', 'a lens')
('objects into on the sensitive surface inside a camera', 'reflect', 'the light')
('a exposure', 'reflect', 'the light')
('a video file for subsequent display', 'process', 'an electrical charge at')
('a video file for', 'store', 'an electrical charge at')
('a video file for', 'store', 'an electrical charge at')
('cinematography', 'is', 'science')
('science', 'recording', 'light')
('lens', 'focus', 'repeatedly focus creating')
('lens', 'focus', 'light')
-------------------------------------------------


In [17]:
similar_svaos = get_similar_SVAOs(svaos_context, svaos_source)
for svaos in similar_svaos:
    print (svaos)