In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and switch it to mode 2, so that edits to
# imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import scipy
import numpy as np
import pandas as pd
import os,sys,inspect
# Put the parent of the working directory on sys.path so the project packages
# imported right after this (`Utils`, `skseq`) can be found.
# The working directory is used instead of
# inspect.getfile(inspect.currentframe()): a notebook cell has no real source
# file, and recent ipykernel versions report a temporary path for it
# (e.g. /tmp/ipykernel_<pid>/...), which would put the wrong directory on
# sys.path. The data and model paths used later in this notebook are relative
# to the working directory as well, so it is already assumed to be the
# notebook's own folder.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
import Utils
import skseq



## Preprocessing Data

In [3]:
from Utils.assignment_2_functions import *

# Load the NER train/test splits; both CSV files are decoded as latin1.
csv_options = {"encoding": "latin1"}
df_train = pd.read_csv("./data/train_data_ner.csv", **csv_options)
df_test = pd.read_csv("./data/test_data_ner.csv", **csv_options)

# Report (rows, columns) of each split, then preview the training frame.
train_shape, test_shape = df_train.shape, df_test.shape
print(train_shape, test_shape)
df_train.head()

(839149, 3) (837339, 3)


Unnamed: 0,sentence_id,words,tags
0,0,Thousands,O
1,0,of,O
2,0,demonstrators,O
3,0,have,O
4,0,marched,O


In [4]:
# Number of distinct sentence ids in each split.
n_train_sentences = len(set(df_train.sentence_id))
print('In train set there are {} sentences'.format(n_train_sentences))
n_test_sentences = len(set(df_test.sentence_id))
print('In test set there are {} sentences'.format(n_test_sentences))

# Number of distinct tag values seen in the training split.
n_train_tags = len(set(df_train.tags))
print('There are {} diferent tags'.format(n_train_tags))


In train set there are 38366 sentences
In test set there are 38367 sentences
There are 17 diferent tags


The tags:

```
geo = Geographical Entity
org = Organization
per = Person
gpe = Geopolitical Entity
tim = Time indicator
art = Artifact
eve = Event
nat = Natural Phenomenon
```

### Building the word-to-position map and its inverse

In [5]:
# Build the corpus together with the word -> position map and its inverse,
# using both the train and the test frames.
corpus, word_to_pos, pos_to_word = get_corpus_and_word_dict(df_train, df_test)
vocabulary_size = len(word_to_pos)
print('There are {} words in the corpus'.format(vocabulary_size))


There are 55145 words in the corpus


### Building the tag-to-position map and its inverse

In [6]:
# Build the tag -> position map and its inverse from the training frame only.
tag_to_pos, pos_to_tag = get_tag_dict(df_train)
number_of_tags = len(tag_to_pos)
print('There are {} tags'.format(number_of_tags))


There are 17 tags


### Getting each sentence as a list of tokens, and its tags as a parallel list of labels

In [10]:
# Split the training frame into X_tr (one list of tokens per sentence) and
# Y_tr (the matching list of tags per sentence).
X_tr,Y_tr = get_X_Y(df_train)

In [11]:
# Inspect one example: the tokens of the sentence at index 2 and their tags.
X_tr[2], Y_tr[2]

(['They',
  'left',
  'after',
  'a',
  'tense',
  'hour-long',
  'standoff',
  'with',
  'riot',
  'police',
  '.'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'])

## Perceptron

Initializing the dictionary

In [12]:
from skseq.sequences.label_dictionary import LabelDictionary
# Wrap the tag and word vocabularies (the keys of the position maps built
# earlier) in skseq LabelDictionary objects.
# NOTE(review): presumably LabelDictionary assigns its own ids to the keys it
# receives - confirm whether they are expected to match tag_to_pos / word_to_pos.
tag_pos_dict = LabelDictionary(tag_to_pos.keys())
word_pos_dict = LabelDictionary(word_to_pos.keys())

Generating the SequenceList => a list of Sequence objects that pair word positions with tag positions (word_pos/tag_pos)

In [13]:
from skseq.sequences.sequence_list import SequenceList
# Collect every training sentence into a single SequenceList.
train_seq = SequenceList(word_pos_dict, tag_pos_dict)
# X_tr and Y_tr are parallel lists: x is one sentence's tokens, y its tags.
for x,y in zip(X_tr,Y_tr):
    train_seq.add_sequence(x, y, word_pos_dict, tag_pos_dict)

In [14]:
# Show the sequence at index 1 twice: as stored (word id / tag id pairs) and
# decoded back into word/tag text.
train_seq[1], train_seq[1].to_words(train_seq)

(15045/16 43294/16 28745/7 48506/16 46162/16 43633/16 44134/16 53065/16 24527/2 53627/16 49800/16 21/16 54800/16 45749/16 31501/5 46165/16 36595/16 37209/16 53335/16 43478/16 42381/16 53335/16 36916/16 36261/16 40947/16 46176/16 47225/16 44134/16 46751/16 30358/2 34003/10 24/16 ,
 'Helicopter/O gunships/O Saturday/B-tim pounded/O militant/O hideouts/O in/O the/O Orakzai/B-geo tribal/O region/O ,/O where/O many/O Taliban/B-org militants/O are/O believed/O to/O have/O fled/O to/O avoid/O an/O earlier/O military/O offensive/O in/O nearby/O South/B-geo Waziristan/I-geo ./O ')

Build IDFeatures

In [15]:
# Load the submodule explicitly: a bare `import skseq` does not guarantee that
# `skseq.sequences.id_feature` is reachable as an attribute - that only holds
# when some other import happens to have loaded it already.
import skseq.sequences.id_feature

# Baseline feature mapper built over the training sequences.
feature_mapper = skseq.sequences.id_feature.IDFeatures(train_seq)
feature_mapper.build_features()

Each sentence's features are split into four parts: initial, transition, final and emission features.


In [16]:
# Index of the sequence whose features are displayed.
id_seq = 1

# The per-sequence feature entry is indexed by part, in this order:
# 0 = initial, 1 = transition, 2 = final, 3 = emission.
part_labels = (
    "Initial features:",
    "Transition features:",
    "Final features:",
    "Emission features:",
)
for part_idx, part_label in enumerate(part_labels):
    print(part_label, feature_mapper.feature_list[id_seq][part_idx])

Initial features: [[0]]
Transition features: [[3], [32], [34], [3], [3], [3], [3], [9], [11], [3], [3], [3], [3], [44], [46], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [3], [9], [58], [59]]
Final features: [[28]]
Emission features: [[29], [30], [31], [33], [35], [36], [15], [13], [37], [38], [39], [40], [41], [42], [43], [45], [47], [48], [10], [5], [49], [10], [50], [51], [52], [53], [54], [15], [55], [56], [57], [27]]


In [17]:
# Size of the feature dictionary built by the baseline feature mapper.
n_base_features = len(feature_mapper.feature_dict)
print('In the base model there are {} features'.format(n_base_features))

In the base model there are 39802 features


Adding extra features

In [18]:
# Second feature mapper: the extended feature set, built over the same
# training sequences as the baseline one.
from skseq.sequences import extended_feature

feature_mapper2 = extended_feature.ExtendedFeatures(train_seq)
feature_mapper2.build_features()

In [19]:
# Size of the feature dictionary built by the extended feature mapper.
n_extended_features = len(feature_mapper2.feature_dict)
print('In the feat model there are {} features'.format(n_extended_features))

In the feat model there are 158055 features


Train the models

In [20]:
import skseq.sequences.structured_perceptron as spc

# Two perceptrons over the same word/tag dictionaries: `sp` uses the baseline
# feature mapper, `sp2` the extended one (sp2 is trained in the next cell).
sp = spc.StructuredPerceptron(word_pos_dict, tag_pos_dict, feature_mapper)
sp2 = spc.StructuredPerceptron(word_pos_dict, tag_pos_dict, feature_mapper2)

# Train the baseline model; the epoch count is stored on the model object and
# also passed explicitly to fit().
sp.num_epochs = 1
sp.fit(feature_mapper.dataset, sp.num_epochs)

In [21]:
# Train the extended-feature model. fit() is given sp2's own epoch count (not
# sp's), so changing one model's setting cannot silently change the other run.
# NOTE(review): the stored output of this cell lists 15 epochs, which does not
# match num_epochs = 1 - re-run the cell so output and code agree.
sp2.num_epochs = 1
sp2.fit(feature_mapper2.dataset, sp2.num_epochs)

Epoch: 0 Accuracy: 0.945339
Epoch: 1 Accuracy: 0.953404
Epoch: 2 Accuracy: 0.956288
Epoch: 3 Accuracy: 0.957609
Epoch: 4 Accuracy: 0.959090
Epoch: 5 Accuracy: 0.960163
Epoch: 6 Accuracy: 0.960788
Epoch: 7 Accuracy: 0.961633
Epoch: 8 Accuracy: 0.961799
Epoch: 9 Accuracy: 0.962740
Epoch: 10 Accuracy: 0.963002
Epoch: 11 Accuracy: 0.963532
Epoch: 12 Accuracy: 0.963861
Epoch: 13 Accuracy: 0.964186
Epoch: 14 Accuracy: 0.964215


Saving the models' parameters

In [31]:
# Make sure the output folder exists before saving, so that a finished
# training run is not lost to a missing-directory error.
os.makedirs("./fitted_models", exist_ok=True)

# NOTE(review): the "_1_epoch" suffix should match the number of epochs the
# models were actually trained for - confirm before reusing these files.
sp.save_model("./fitted_models/perceptron_base_1_epoch")
sp2.save_model("./fitted_models/perceptron_extra_1_epoch")
