# Esercitazione su Spacy e Annotazione dei testi basata su NER

#### Author
Federico Ranadi, May 2023.  
federico.ranaldi99@gmail.com

# Required Libraries

In [1]:
import pandas as pd

from IPython.display import display, HTML

In [2]:
# option to print all the value of cells in DataFrames
pd.set_option("max_colwidth", None)

### Install spacy and download the english pipeline

In [3]:
# install the spacy module
!pip install spacy

# download the english pipeline here
# 'it_core_news_sm' for italian texts
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
2023-05-22 16:16:53.501569: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-22 16:16:55.679165: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2023-05-22 16:16:55.679636: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:996] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at htt

In [4]:
import spacy
from spacy import displacy

# Annotation example

In [5]:
input_string = "In 1982, Mark drove his car from Los Angeles to Las Vegas until 5 of july"
nlp = spacy.load('en_core_web_sm')

In [6]:
print(nlp("ciao bello"))
print(type(nlp("ciao bello")))

ciao bello
<class 'spacy.tokens.doc.Doc'>


In [7]:
def print_annotation(input_string):
    doc = nlp(input_string)
    
    df = pd.DataFrame({
        "id": [],
        "word": [],
        "lemma": [],
        "tag": [],
        "entity": [],
        "dependency": [],
        "head_id": []
    })

    for sent in doc.sents:
        for i, word in enumerate(sent):
            if word.head is word:
                head_idx = 0
            else:
                head_idx = doc[i].head.i+1
            if head_idx == i + 1:
                head_idx = 0

            entity_tag = word.ent_type_
            if len(entity_tag) == 0:
                entity_tag = "O"
            
            word_obj = {"id": str(i+1), "word": str(word), "lemma": word.lemma_, "tag": word.tag_, "entity": entity_tag,
                                    "dependency": word.dep_, "head_id": str(head_idx)}
            df = df.append(word_obj, ignore_index=True)
    display(df)

In [None]:
print_annotation(input_string)

  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)
  df = df.append(word_obj, ignore_index=True)


Unnamed: 0,id,word,lemma,tag,entity,dependency,head_id
0,1,In,in,IN,O,prep,5
1,2,1982,1982,CD,DATE,pobj,1
2,3,",",",",",",O,punct,5
3,4,Mark,Mark,NNP,PERSON,nsubj,5
4,5,drove,drive,VBD,O,ROOT,0
5,6,his,his,PRP$,O,poss,7
6,7,car,car,NN,O,dobj,5
7,8,from,from,IN,O,prep,5
8,9,Los,Los,NNP,GPE,compound,10
9,10,Angeles,Angeles,NNP,GPE,pobj,8


In [None]:
def visualize_annotation(input_string, style="dep"):
    doc = nlp(input_string)
    # style can be either "dep" or "ent"
    displacy.render(doc, style=style, jupyter=True, options={"distance": 100}) #distance default 140

In [None]:
visualize_annotation(input_string, style="dep")

In [None]:
visualize_annotation(input_string, style="ent")


# Information extraction

Get information about a particular word in a given string.

In [8]:
#This function of code computes the annotation for each word: a set of usefull informations extracted by the dependency parsing of the input string...
#...Between these informations we have the id,head_id,tag,entity(obtained with NER),...
#It returns the annotation of the entire sentence

def get_sentence_annotation(input_string):
    doc = nlp(input_string)

    words = []
    for sent in doc.sents:  
        for i, word in enumerate(sent): 
            if word.head is word: 
                head_idx = 0
            else:
                head_idx = doc[i].head.i+1  
            if head_idx == i + 1:
                head_idx = 0

            entity_tag = word.ent_type_
            if len(entity_tag) == 0:
                entity_tag = "O"
            
            word_obj = {"id": i+1, "word": str(word), "lemma": word.lemma_, "tag": word.tag_, "entity": entity_tag,
                                    "dependency": word.dep_, "head_id": head_idx}
            words.append(word_obj)
    
    return words

In [None]:
get_sentence_annotation(input_string)

[{'id': 1,
  'word': 'In',
  'lemma': 'in',
  'tag': 'IN',
  'entity': 'O',
  'dependency': 'prep',
  'head_id': 5},
 {'id': 2,
  'word': '1982',
  'lemma': '1982',
  'tag': 'CD',
  'entity': 'DATE',
  'dependency': 'pobj',
  'head_id': 1},
 {'id': 3,
  'word': ',',
  'lemma': ',',
  'tag': ',',
  'entity': 'O',
  'dependency': 'punct',
  'head_id': 5},
 {'id': 4,
  'word': 'Mark',
  'lemma': 'Mark',
  'tag': 'NNP',
  'entity': 'PERSON',
  'dependency': 'nsubj',
  'head_id': 5},
 {'id': 5,
  'word': 'drove',
  'lemma': 'drive',
  'tag': 'VBD',
  'entity': 'O',
  'dependency': 'ROOT',
  'head_id': 0},
 {'id': 6,
  'word': 'his',
  'lemma': 'his',
  'tag': 'PRP$',
  'entity': 'O',
  'dependency': 'poss',
  'head_id': 7},
 {'id': 7,
  'word': 'car',
  'lemma': 'car',
  'tag': 'NN',
  'entity': 'O',
  'dependency': 'dobj',
  'head_id': 5},
 {'id': 8,
  'word': 'from',
  'lemma': 'from',
  'tag': 'IN',
  'entity': 'O',
  'dependency': 'prep',
  'head_id': 5},
 {'id': 9,
  'word': 'Los',
  '

In [9]:
#This function takes in input a sentence and a specyfic word
#It returns the annotation of that word

def get_word_annotation(input_string, word_string):
    
    words=get_sentence_annotation(input_string)
    
    for word in words:
        if word["word"] == word_string:
            return word
  
    return None

In [None]:
print(get_word_annotation(input_string, "Mark"))

{'id': 4, 'word': 'Mark', 'lemma': 'Mark', 'tag': 'NNP', 'entity': 'PERSON', 'dependency': 'nsubj', 'head_id': 5}


In [10]:
#This function takes in input a string and a dependency_type(such as "nsubj","prep"...) 
#It returns the head_id and the tail_id of all the couples of words involved in that dependency-type

def search_relation(input_string, relation_string):
  
    words = get_sentence_annotation(input_string)
    
    #each word annotation is a dictionary
    #words is the list of the word-annotations

    relations_head_tail = [] ##these are respectively the ids of the heads and the tails of each the relations 
    
    for word in words:
        if(word["dependency"]==relation_string):
          relations_head_tail.append([word["head_id"],word["id"]])

    return relations_head_tail

In [8]:
search_relation(input_string, "prep")

[[5, 1], [5, 8], [5, 11], [5, 14], [15, 16]]

In [11]:
#This function takes in input a string and a dependency_type(such as "nsubj","prep"...) 
#It returns a triple containing the lemmas of the head_word and tail_word in the dependency respectively and the dependency-type

def get_triple_relation(input_string, relation_string):
  
  triples_relation=[]

  relations_head_tail = search_relation(input_string, relation_string)
  
  words = get_sentence_annotation(input_string)

  for ids in relations_head_tail:
    for word in words:
      if (word["id"] == ids[0]): 
        lemma_head = word["lemma"]
      if (word["id"] == ids[1]):
        lemma_tail = word["lemma"]
    triples_relation.append([lemma_head,relation_string,lemma_tail])

  return triples_relation

In [10]:
print(get_triple_relation(input_string, "prep"))

[['drive', 'prep', 'in'], ['drive', 'prep', 'from'], ['drive', 'prep', 'to'], ['drive', 'prep', 'until'], ['5', 'prep', 'of']]


### Exercise 2: Search for entities

Define a method that takes in input a sentence (`input_string`) and the name of an entity type (`entity_type_string`), parses with spacy the input and returns the words (the `objects`! not the strings) described by that entity. If the entity type is not present, return an empty array.

```
def search_entity(input_string, entity_type_string):
    return word_obj_list
```

In [12]:
#This function takes in input a sentence and a entity-type
#I returns the id of each word associated to that entity-type

def search_entity(input_string, entity_type_string):
    ids = []
    
    words = get_sentence_annotation(input_string)
    
    for word in words:
      if(word["entity"] == entity_type_string):
        ids.append(word["id"])
    return ids

In [12]:
search_entity(input_string,"DATE")

[2, 15, 16, 17]

In [13]:
#This function takes as input a sentence and a entity-type
#It returns couples(lemma,entity_type) of all words associated to that entity-type

def get_couple_entity(input_string, entity_type_string):
  
  couples= []  

  ids=search_entity(input_string,entity_type_string)

  words = get_sentence_annotation(input_string)

  for id in ids:
    for word in words:
      if(word["id"] == id):
        lemma = word["lemma"]
    couples.append([lemma,entity_type_string])
  
  return couples

In [None]:
get_couple_entity(input_string,"DATE")

[['1982', 'DATE'], ['5', 'DATE'], ['of', 'DATE'], ['july', 'DATE']]

### Exercise 3: Enriching the sentences

For every sentence in the QuestionClassification dataset, extract the `subject-verb` relation and the `verb-object` relation. Add these couples to the original input, divided by the `#`:

- Sentence: '*What is the full form of .com?*' 
- `subject-verb`: *What is*  
- `verb-object`: *is the full form* 
- Enriched sentence: '*What is the full form of .com? # What is # is the full form*'  

Store the enriched sentences in a new dataframe and train a classifier (SVM, NB, Rocchio..) and evaluate it.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import SVC

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from pprint import pprint

In [15]:
# option to print all the value of cells in DataFrames
pd.set_option("max_colwidth", None)

In [16]:
training_data = pd.read_csv("./train.csv")
testing_data = pd.read_csv("./test.csv")

classes = list(np.unique(testing_data['classes']))
print(classes)

['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']


In [17]:
train_questions=list(training_data["questions"])
test_questions=list(testing_data["questions"])

In [18]:
#list of spacy dependencies

for label in nlp.get_pipe("parser").labels:
    print(label, " -- ", spacy.explain(label))

ROOT  --  root
acl  --  clausal modifier of noun (adjectival clause)
acomp  --  adjectival complement
advcl  --  adverbial clause modifier
advmod  --  adverbial modifier
agent  --  agent
amod  --  adjectival modifier
appos  --  appositional modifier
attr  --  attribute
aux  --  auxiliary
auxpass  --  auxiliary (passive)
case  --  case marking
cc  --  coordinating conjunction
ccomp  --  clausal complement
compound  --  compound
conj  --  conjunct
csubj  --  clausal subject
csubjpass  --  clausal subject (passive)
dative  --  dative
dep  --  unclassified dependent
det  --  determiner
dobj  --  direct object
expl  --  expletive
intj  --  interjection
mark  --  marker
meta  --  meta modifier
neg  --  negation modifier
nmod  --  modifier of nominal
npadvmod  --  noun phrase as adverbial modifier
nsubj  --  nominal subject
nsubjpass  --  nominal subject (passive)
nummod  --  numeric modifier
oprd  --  object predicate
parataxis  --  parataxis
pcomp  --  complement of preposition
pobj  --  ob



 We are now considering only those dependecies whose role is of connecting subject-verb and verb-object.

 These dependecies are : nsubj, dobj, csubj, csubjpass, pobj.

 We will consider only nsubj and dobj (for simplicity and for avoiding dependencies overlapping).

 Givene a text, we are going to annotate these kinds of dependencies in this way:

 Example
 
 INPUT: "How did serfdom develop in and then leave Russia ?"

Text Annotation :

    ID   WORD      DEPENDENCY    HEAD_ID

    1    How       advmod         4
    2    did       aux            4
    3    serfdom   nsubj          4
    4    develop   ROOT           0
    5    in        prep           4
    6    and       cc             4
    7    then      advmod         8
    8    leave     conj           4
    9    Russia    dobj           8
    10   ?         punct          4

  OUTPUT: "How did # serfdom develop # in and then # leave Russia # ?"

In [19]:
def get_words_list(annotated_sentence): #return the list of words annotated (it receives as input the sentence annotation)
  l=[]
  for word in annotated_sentence :
    l.append(word['word'])
  return l

In [20]:
def get_enriched_sentence(sentence):
    deps = ["nsubj","dobj"]
    annotated_sentence = get_sentence_annotation(sentence)

    #find the ids of the words involved in the dependency
    for dep in deps:
        dep_head_tail = []
        for word in annotated_sentence:
            if(word["dependency"]==dep):
              dep_head_tail.append(word["head_id"])
              dep_head_tail.append(word["id"])
        dep_head_tail.sort()
        
          #modify words with "#"...
        if(len(dep_head_tail) > 1):
          for word in annotated_sentence:
            if(word['id'] == dep_head_tail[0]):
              word['word'] = "#" + word['word']
            if(word['id'] == dep_head_tail[1]):
              word['word'] = word['word'] + "#"
    
    l = get_words_list(annotated_sentence)

    #now get the enriched text (NOTE: we want to tokenize togethere head and tail of the relation)!
    enriched_sentence=''

    enriched_sentence= " ".join(l)

    return enriched_sentence

In [21]:
#Now let's enrich all the dataset

train_enriched_questions=[]
test_enriched_questions=[]

for question in train_questions:
  train_enriched_questions.append(get_enriched_sentence(question))

for question in test_questions:
  test_enriched_questions.append(get_enriched_sentence(question))


In [22]:
import nltk

In [23]:
#tokenize the dataset and store it inside the dataframe
#NOTE: do note remove stopwords like "?" ,"!",":" 
#In particulare words surrounded by "#" witness a dependency relation inside the sentence which can be usefull for our task
nltk.download('punkt')

training_data['enriched_questions'] = list(map(lambda sent: nltk.word_tokenize(sent), train_enriched_questions))
testing_data['enriched_questions'] = list(map(lambda sent: nltk.word_tokenize(sent), test_enriched_questions))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [24]:
#transforming list of token into a string...
#" ".join(str(x) for x in xs)
training_data["enriched_questions"] = training_data["enriched_questions"].apply(lambda l: ' '.join(str(t) for t in l))
testing_data["enriched_questions"] = testing_data["enriched_questions"].apply(lambda l: ' '.join(str(t) for t in l))

In [25]:
training_data.head()

Unnamed: 0,questions,classes,enriched_questions
0,How did serfdom develop in and then leave Russia ?,DESC,How did # serfdom develop # in and then # leave Russia # ?
1,What films featured the character Popeye Doyle ?,ENTY,What # films # featured # the character # Popeye Doyle ?
2,How can I find a list of celebrities ' real names ?,DESC,How can # I # find # a list # of celebrities ' real names ?
3,What fowl grabs the spotlight after the Chinese Year of the Monkey ?,ENTY,What # fowl # grabs # the spotlight # after the Chinese Year of the Monkey ?
4,What is the full form of .com ?,ABBR,What # is the full form # of .com ?


In [26]:
# NOTE: using BOW-vectorizer we are losing informations about the position of a word inside the sentence
# hence we are partially losing the informations we have encoded before (like the head and tail of a dependecy relation)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(training_data['enriched_questions'].tolist())
X_test = vectorizer.transform(testing_data['enriched_questions'].tolist())
y_train = training_data['classes'].tolist()
y_test = testing_data['classes'].tolist()

In [27]:

vectorizer.vocabulary_

{'how': 3786,
 'did': 2306,
 'serfdom': 6748,
 'develop': 2273,
 'in': 3888,
 'and': 509,
 'then': 7587,
 'leave': 4390,
 'russia': 6553,
 'what': 8182,
 'films': 3009,
 'featured': 2953,
 'the': 7576,
 'character': 1523,
 'popeye': 5873,
 'doyle': 2461,
 'can': 1350,
 'find': 3015,
 'list': 4496,
 'of': 5370,
 'celebrities': 1473,
 'real': 6229,
 'names': 5144,
 'fowl': 3163,
 'grabs': 3424,
 'spotlight': 7133,
 'after': 383,
 'chinese': 1578,
 'year': 8351,
 'monkey': 5013,
 'is': 4032,
 'full': 3225,
 'form': 3134,
 'com': 1748,
 'contemptible': 1885,
 'scoundrel': 6668,
 'stole': 7246,
 'cork': 1933,
 'from': 3213,
 'my': 5122,
 'lunch': 4590,
 'team': 7494,
 'baseball': 832,
 'st': 7156,
 'louis': 4563,
 'browns': 1208,
 'become': 887,
 'oldest': 5392,
 'profession': 6015,
 'are': 622,
 'liver': 4511,
 'enzymes': 2727,
 'name': 5142,
 'scar': 6637,
 'faced': 2874,
 'bounty': 1124,
 'hunter': 3812,
 'old': 5390,
 'west': 8174,
 'when': 8188,
 'was': 8113,
 'ozzy': 5515,
 'osbourne'

Let's train and test an svm with sigmoid as kernel function and the following parametres...

In [28]:
from sklearn.svm import SVC

c = 1.5
decision_function='ovr'
max_iter=-1
kernel='sigmoid'
degree=2
gamma =1.0

svm_sig = SVC(C=c, max_iter=max_iter, degree=degree, kernel=kernel, gamma=gamma, decision_function_shape=decision_function)

svm_sig.fit(X_train, y_train)

Test the svm model on the test set and evaluate it...

In [29]:
y_pred = svm_sig.predict(X_test)

print(classification_report(y_test, y_pred, target_names=classes))

              precision    recall  f1-score   support

        ABBR       1.00      0.78      0.88         9
        DESC       0.84      0.99      0.91       138
        ENTY       0.83      0.71      0.77        94
         HUM       0.88      0.91      0.89        65
         LOC       0.87      0.85      0.86        81
         NUM       0.99      0.90      0.94       113

    accuracy                           0.88       500
   macro avg       0.90      0.86      0.88       500
weighted avg       0.89      0.88      0.88       500



Let's train and test a Multinomial Naive Bayes with alpha's parameter equal to 0.1...

In [None]:
from sklearn.naive_bayes import MultinomialNB

MultinomialNB_model = MultinomialNB(alpha=0.1)
MultinomialNB_model.fit(X_train, y_train)

Test the MultinomialNB model on the test set and evaluate it...

In [None]:
y_pred = MultinomialNB_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=classes))

              precision    recall  f1-score   support

        ABBR       1.00      0.67      0.80         9
        DESC       0.81      0.83      0.82       138
        ENTY       0.73      0.59      0.65        94
         HUM       0.65      0.89      0.75        65
         LOC       0.66      0.77      0.71        81
         NUM       0.89      0.74      0.81       113

    accuracy                           0.76       500
   macro avg       0.79      0.75      0.76       500
weighted avg       0.77      0.76      0.76       500



NOTE: both SVM's and MultinomialNB's parameters are choosen by looking the previous exercises since they were found to be the best on Question Classification Task.

Looking at the results it seems that SVM performs better than MultinomialNB

### Exercise 4: Replace texts with entities

For every sentence in the QuestionClassification dataset, extract the entities annotated by the spacy module only for the proper nouns (`PROPN`) and replace the spans in the text with the entity name:

- Sentence: '*In 1982, Mark drove his car from Los Angeles to Las Vegas until 5 of july*'  
- Modified Sentence: '*In 1982, PERSON drove his car from GPE to GPE until 5 of DATE*'

Store the enriched sentences in a new dataframe and train a classifier (SVM, NB, Rocchio..) and evaluate it.

**WARNING**: be careful with `compounds`, they should be replaced by a SINGLE entity name: *Las Vegas* => `GPE`

In [30]:
annotated_sentence = get_sentence_annotation("In 1982, Mark drove his car from Los Angeles to Las Vegas until 5 of july")
annotated_sentence

[{'id': 1,
  'word': 'In',
  'lemma': 'in',
  'tag': 'IN',
  'entity': 'O',
  'dependency': 'prep',
  'head_id': 5},
 {'id': 2,
  'word': '1982',
  'lemma': '1982',
  'tag': 'CD',
  'entity': 'DATE',
  'dependency': 'pobj',
  'head_id': 1},
 {'id': 3,
  'word': ',',
  'lemma': ',',
  'tag': ',',
  'entity': 'O',
  'dependency': 'punct',
  'head_id': 5},
 {'id': 4,
  'word': 'Mark',
  'lemma': 'Mark',
  'tag': 'NNP',
  'entity': 'PERSON',
  'dependency': 'nsubj',
  'head_id': 5},
 {'id': 5,
  'word': 'drove',
  'lemma': 'drive',
  'tag': 'VBD',
  'entity': 'O',
  'dependency': 'ROOT',
  'head_id': 0},
 {'id': 6,
  'word': 'his',
  'lemma': 'his',
  'tag': 'PRP$',
  'entity': 'O',
  'dependency': 'poss',
  'head_id': 7},
 {'id': 7,
  'word': 'car',
  'lemma': 'car',
  'tag': 'NN',
  'entity': 'O',
  'dependency': 'dobj',
  'head_id': 5},
 {'id': 8,
  'word': 'from',
  'lemma': 'from',
  'tag': 'IN',
  'entity': 'O',
  'dependency': 'prep',
  'head_id': 5},
 {'id': 9,
  'word': 'Los',
  '

In [18]:
search_relation(input_string,"compound")

[[10, 9], [13, 12]]

In [134]:
from spacy.util import get_words_and_spaces
#this function takes in input a sentence 
#after annotating it with spacy it replace proper nouns with the associated entity

def get_enriched_sentence_with_entity(sentence): 
    annotated_sentence = get_sentence_annotation(sentence)
    enriched_sentence_list = []
    
    i=0
    l=len(annotated_sentence)
    while i < len(annotated_sentence):
      if(annotated_sentence[i]["tag"]=="NNP"):
        enriched_sentence_list.append(annotated_sentence[i]["entity"])
        if(annotated_sentence[i]["dependency"]=="compound"):
          i+=2
        else:
          i+=1
      else:
        enriched_sentence_list.append(annotated_sentence[i]["word"])
        i+=1
      
    #now get the enriched text (NOTE: we want to tokenize togethere head and tail of the relation)!
    enriched_sentence=''

    enriched_sentence= " ".join(enriched_sentence_list)

    return enriched_sentence

In [136]:
get_enriched_sentence_with_entity("Marc is going to Las Palmas with the Harley Davidson")

'PERSON is going to GPE with the ORG'

In [38]:
get_enriched_sentence_with_entity("In 1982, Mark drove his car from Los Angeles to Las Vegas until 5 of july")

ao
ao


'In 1982 , drove his car from GPE to GPE until 5 of'

In [137]:
#Now let's enrich all the dataset

train_enriched_questions=[]
test_enriched_questions=[]

for question in train_questions:
  train_enriched_questions.append(get_enriched_sentence_with_entity(question))

for question in test_questions:
  test_enriched_questions.append(get_enriched_sentence_with_entity(question))


In [153]:
#tokenize the dataset and store it inside the dataframe
#NOTE: do note remove stopwords like "?" ,"!",":" 
#In particulare words surrounded by "#" witness a dependency relation inside the sentence which can be usefull for our task
nltk.download('punkt')

training_data['enriched_questions_entity'] = list(map(lambda sent: nltk.word_tokenize(sent), train_enriched_questions))
testing_data['enriched_questions_entity'] = list(map(lambda sent: nltk.word_tokenize(sent), test_enriched_questions))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [156]:
#transforming list of token into a string...
#" ".join(str(x) for x in xs)
training_data["enriched_questions_entity"] = training_data["enriched_questions_entity"].apply(lambda l: ' '.join(str(t) for t in l))
testing_data["enriched_questions_entity"] = testing_data["enriched_questions_entity"].apply(lambda l: ' '.join(str(t) for t in l))

In [157]:
# NOTE: using BOW-vectorizer we are losing informations about the position of a word inside the sentence
# hence we are partially losing the informations we have encoded before (like the head and tail of a dependecy relation)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(training_data['enriched_questions_entity'].tolist())
X_test = vectorizer.transform(testing_data['enriched_questions_entity'].tolist())
y_train = training_data['classes'].tolist()
y_test = testing_data['classes'].tolist()

In [158]:
#Now inside the vocabulary we have also named_entity

vectorizer.vocabulary_

{'how': 2723,
 'did': 1691,
 'serfdom': 4671,
 'develop': 1666,
 'in': 2789,
 'and': 440,
 'then': 5319,
 'leave': 3070,
 'gpe': 2490,
 'what': 5757,
 'films': 2215,
 'featured': 2176,
 'the': 5310,
 'character': 1103,
 'person': 3882,
 'can': 984,
 'find': 2220,
 'list': 3141,
 'of': 3658,
 'celebrities': 1064,
 'real': 4302,
 'names': 3519,
 'fowl': 2337,
 'grabs': 2492,
 'spotlight': 4971,
 'after': 354,
 'event': 2049,
 'product': 4129,
 'is': 2904,
 'full': 2371,
 'form': 2315,
 'com': 1255,
 'contemptible': 1375,
 'scoundrel': 4608,
 'stole': 5054,
 'cork': 1415,
 'from': 2362,
 'my': 3507,
 'lunch': 3199,
 'team': 5252,
 'baseball': 644,
 'fac': 2117,
 'browns': 888,
 'become': 679,
 'oldest': 3676,
 'profession': 4132,
 'are': 511,
 'liver': 3154,
 'enzymes': 2007,
 'name': 3517,
 'scar': 4587,
 'faced': 2119,
 'bounty': 833,
 'hunter': 2736,
 'org': 3713,
 'when': 5761,
 'was': 5704,
 'born': 820,
 'why': 5776,
 'do': 1766,
 'heavier': 2619,
 'objects': 3639,
 'travel': 5464,


Let's train and test an SVM with the same parameters as before...

In [159]:
from sklearn.svm import SVC

c = 1.5
decision_function='ovr'
max_iter=-1
kernel='sigmoid'
degree=2
gamma =1.0

svm_sig = SVC(C=c, max_iter=max_iter, degree=degree, kernel=kernel, gamma=gamma, decision_function_shape=decision_function)

svm_sig.fit(X_train, y_train)

Provide the Classification Report...

In [160]:
y_pred = svm_sig.predict(X_test)

print(classification_report(y_test, y_pred, target_names=classes))

              precision    recall  f1-score   support

        ABBR       0.88      0.78      0.82         9
        DESC       0.82      0.96      0.89       138
        ENTY       0.82      0.74      0.78        94
         HUM       0.93      0.88      0.90        65
         LOC       0.85      0.88      0.86        81
         NUM       0.97      0.86      0.91       113

    accuracy                           0.87       500
   macro avg       0.88      0.85      0.86       500
weighted avg       0.87      0.87      0.87       500



Let's train and test a Multinomial Naive Bayes with the same parametres as before...

In [161]:
from sklearn.naive_bayes import MultinomialNB

MultinomialNB_model = MultinomialNB(alpha=0.1)
MultinomialNB_model.fit(X_train, y_train)

Provide the Classification Report...

In [162]:
y_pred = MultinomialNB_model.predict(X_test)
print(classification_report(y_test, y_pred, target_names=classes))

              precision    recall  f1-score   support

        ABBR       1.00      0.78      0.88         9
        DESC       0.85      0.83      0.84       138
        ENTY       0.78      0.61      0.68        94
         HUM       0.74      0.94      0.83        65
         LOC       0.77      0.90      0.83        81
         NUM       0.86      0.81      0.84       113

    accuracy                           0.81       500
   macro avg       0.83      0.81      0.82       500
weighted avg       0.81      0.81      0.81       500



Observations:

The first idea of annotation was to annotate the occurrence of some specified couple of words inside a question by looking at the dependency relation(by surronding them with "#"). The problem was that during the BOW-Vectorization we loose important informations such as the relative position of words inside a sentence. This lead us to consider "# where is #" and "# is where#" that is an example of recurent pattern in LOC questions, the same.

The second idea was to substitute proper nouns with named_entities paying attention to compound words that refer to a single entity.

Results:
We trained both the representations on two specyfic models whose parametres were chosen by looking at the previous work of optimization (look at the previous notebooks) and we observed that:

- In the SVMs results there are no significant changes from one representation to the other one in therms of f1-score

- In the MultinomialNBs results we can see that the second representation leads us to slightly better performances.