In [1]:
from nltk.corpus import treebank
import numpy as np
from collections import Counter
import operator
import string

In [38]:
# Load the tagged Treebank sample and drop every token tagged '-NONE-'
# so that each sentence keeps only its remaining (word, tag) pairs.
sentences = treebank.tagged_sents()
new_sentences = [
    [tagged for tagged in sent if tagged[1] != '-NONE-']
    for sent in sentences
]
# Fixed sizes used by the feature builders further down.
# NOTE(review): presumably the longest sentence length and the tag-set size -- confirm against the corpus.
max_length = 271
max_tag = 46

In [40]:
from sklearn.model_selection import train_test_split
# Fix the seed so the split -- and everything derived from train_data
# (pairs, word_index, pos_index, pairs_index, rare_words) -- is reproducible on re-run.
train_data, test_data = train_test_split(new_sentences, test_size=0.2, random_state=42)

In [41]:
# Flatten the training sentences into one list of (word, tag) pairs.
pairs = [tagged for sent in train_data for tagged in sent]

In [4]:
def make_index(s):
    """Map each distinct item of `s` to a consecutive integer starting at 0.

    Numbering follows the iteration order of `s` (first occurrence wins for
    duplicates), so for a set the assignment is arbitrary.
    """
    return {item: number for number, item in enumerate(dict.fromkeys(s))}

In [5]:
def data_inf(data):
    """Build the three lookup tables from an iterable of (word, tag) pairs.

    Returns
    -------
    tuple of dict
        (word index keyed by lower-cased word, tag index keyed by tag,
        pair index keyed by the original, case-preserving pair).
    """
    observed = list(data)
    lowered_words = {entry[0].lower() for entry in observed}
    tags = {entry[1] for entry in observed}
    distinct_pairs = set(observed)
    return make_index(lowered_words), make_index(tags), make_index(distinct_pairs)

In [43]:
word_index,pos_index,pairs_index = data_inf(pairs)

In [46]:
def presentation_word(word, word_index):
    """Encode `word` as a vector of length len(word_index) + 1.

    A known word gets a 1 at its index and a 1 in the extra last slot;
    a word missing from `word_index` is encoded as all zeros.
    """
    encoding = np.zeros(len(word_index) + 1)
    if word not in word_index:
        # Unknown word: every slot, including the trailing flag, stays 0.
        return encoding
    encoding[word_index[word]] = 1
    encoding[-1] = 1
    return encoding

In [47]:
def presentation_pos(pos, pos_index):
    """One-hot encode tag `pos` over len(pos_index) slots.

    Raises KeyError when `pos` is not a key of `pos_index`.
    """
    size = len(pos_index)
    slot = pos_index[pos]
    one_hot = np.zeros(size)
    one_hot[slot] = 1
    return one_hot

In [48]:
def presentation_pair(pair, pairs_index):
    """One-hot encode a (word, tag) pair over len(pairs_index) slots.

    `pairs_index` is built from training data only, so a pair that was never
    observed (a test-set token, or a candidate tag being scored) is encoded
    as an all-zero vector instead of raising KeyError.
    """
    pair_feat = np.zeros(len(pairs_index))
    index = pairs_index.get(pair)
    if index is not None:
        pair_feat[index] = 1
    return pair_feat

In [49]:
def position_feature(position):
    """One-hot encode a token position over `max_length` slots (module-level constant)."""
    one_hot = np.zeros(max_length)
    one_hot[position] = 1
    return one_hot

In [50]:
def neighbors_word(position,sentence,word_index):
    """Encode the two words before and the two words after `position`.

    `sentence` is a sequence of words. A neighbour that falls outside the
    sentence is an all-zero vector of length len(word_index) + 1; the others
    are encoded with presentation_word.

    Returns (previous, two-back, next, two-ahead).
    """
    width = len(word_index) + 1
    remaining = len(sentence) - position

    def encode(offset):
        # Encoding of the word `offset` places away from the current token.
        return presentation_word(word=sentence[position + offset], word_index=word_index)

    last_word_feat = np.zeros(width) if position == 0 else encode(-1)
    last_two_word_feat = np.zeros(width) if position in (0, 1) else encode(-2)
    next_word_feat = np.zeros(width) if remaining == 1 else encode(1)
    next_two_word_feat = np.zeros(width) if remaining in (1, 2) else encode(2)
    return last_word_feat, last_two_word_feat, next_word_feat, next_two_word_feat

In [51]:
def neighbors_pos(position, sentence_pos, n_tags=None):
    """One-hot encode the tags of the two following and two preceding tokens.

    Parameters
    ----------
    position : int
        Index of the current token.
    sentence_pos : sequence of int
        Tag indices of the whole sentence.
    n_tags : int, optional
        Number of real tag slots; defaults to the module-level `max_tag`.

    Returns
    -------
    tuple of numpy.ndarray
        (next, two-ahead, previous, two-back), each of length n_tags + 1.
        The extra last slot is set when the neighbour lies outside the
        sentence, so a tag index equal to n_tags would collide with it.

    The previous version compared the global `max_length` instead of the
    sentence length in the position == 1 branch, so a three-token sentence
    indexed past its end; every neighbour is now bounds-checked the same way.
    """
    if n_tags is None:
        n_tags = max_tag
    width = n_tags + 1
    sentence_length = len(sentence_pos)

    def encode(offset):
        # One-hot of the neighbour's tag, or the padding slot when out of range.
        feat = np.zeros(width)
        neighbor = position + offset
        if 0 <= neighbor < sentence_length:
            feat[sentence_pos[neighbor]] = 1
        else:
            feat[-1] = 1
        return feat

    next_pos_feat = encode(1)
    next_two_pos_feat = encode(2)
    last_pos_feat = encode(-1)
    last_two_pos_feat = encode(-2)
    return next_pos_feat,next_two_pos_feat,last_pos_feat,last_two_pos_feat
            
    

In [56]:
# Split the first sentence into parallel lists for the smoke tests below:
# words, tag names, tag indices and the original (word, tag) pairs.
w = [tagged[0] for tagged in new_sentences[0]]
t = [tagged[1] for tagged in new_sentences[0]]
t_index = [pos_index[tag_name] for tag_name in t]
pair = list(new_sentences[0])

In [57]:
# Smoke test: encode the first word of the first sentence.
word_feat = presentation_word(word=w[0], word_index=word_index)

In [58]:
# Smoke test: encode the first tag of the first sentence.
pos_feat = presentation_pos(pos=t[0], pos_index=pos_index)

In [59]:
# Smoke test: encode the first (word, tag) pair of the first sentence.
pair_feat = presentation_pair(pair=pair[0], pairs_index=pairs_index)

In [63]:
# Smoke test: neighbouring-tag features for the first token of the first sentence.
neighbors_pos_feat = neighbors_pos(position=0, sentence_pos=t_index)

In [66]:
# Frequency of every training token (case preserved).
words = [tagged[0] for tagged in pairs]
dict_words = Counter(words)

In [67]:
# Order tokens by ascending frequency, then separate those seen exactly once
# (rare) from those seen at least twice.
s = sorted(dict_words.items(), key=operator.itemgetter(1))
rare_words = [token for token, count in s if count < 2]
not_rare_words = [token for token, count in s if not count < 2]

In [68]:
# Character -> slot table for the prefix/suffix features:
# 'a'..'z' -> 0..25, the catch-all key 'symbol' -> 26, '0'..'9' -> 27..36.
dict_alphabet = {letter: slot for slot, letter in enumerate(string.ascii_lowercase)}
dict_alphabet['symbol'] = 26
dict_alphabet.update((str(digit), 27 + digit) for digit in range(10))


In [69]:
def prefix_suffix_one(word):
    """One-hot encode the first and the last character of `word`.

    Each vector has 38 slots; characters are placed through `dict_alphabet`
    (lowercase letters and digits get their own slot, anything else shares
    the 'symbol' slot). An empty word yields two all-zero vectors.

    The digit test used to be applied to the whole word (`word not in
    string.digits`), which is a substring check, so digits inside longer
    tokens were filed under 'symbol'; it is now applied to the character.
    """
    def slot(ch):
        # Slot of a single character in dict_alphabet.
        if ch in string.ascii_lowercase or ch in string.digits:
            return dict_alphabet[ch]
        return dict_alphabet['symbol']

    l = 26+10+1+1
    prefix_one_feat = np.zeros(l)
    suffix_one_feat = np.zeros(l)
    if word:
        prefix_one_feat[slot(word[0])] = 1
        suffix_one_feat[slot(word[-1])] = 1
    return prefix_one_feat,suffix_one_feat

In [70]:
def hasNumbers(inputString):
    """Return True if at least one character of `inputString` is a digit."""
    for char in inputString:
        if char.isdigit():
            return True
    return False

In [71]:
def prefix_suffix_two(word):
    """One-hot encode the first two and the last two characters of `word`.

    Each vector has 2 * 37 slots, one block of 37 per character. In the
    prefix vector block 0 is the first character and block 1 the second;
    in the suffix vector block 0 is the last character and block 1 the one
    before it. Words shorter than two characters leave the missing blocks
    at zero.

    The digit test used to be applied to the whole word (`word not in
    string.digits`), which is a substring check, so digits inside longer
    tokens were filed under 'symbol'; it is now applied to the character.
    """
    def slot(ch):
        # Slot of a single character in dict_alphabet.
        if ch in string.ascii_lowercase or ch in string.digits:
            return dict_alphabet[ch]
        return dict_alphabet['symbol']

    l = 37 * 2
    prefix_two_feat = np.zeros(l)
    suffix_two_feat = np.zeros(l)
    for block, ch in enumerate(word[:2]):
        prefix_two_feat[slot(ch) + 37 * block] = 1
    # Walk the word backwards so block 0 holds the final character.
    for block, ch in enumerate(word[::-1][:2]):
        suffix_two_feat[slot(ch) + 37 * block] = 1
    return prefix_two_feat,suffix_two_feat
        
            
            



In [72]:
def prefix_suffix_three(word):
    """One-hot encode the first three and the last three characters of `word`.

    Each vector has 3 * 37 slots, one block of 37 per character. In the
    prefix vector block k is character k of the word; in the suffix vector
    block 0 is the last character, block 1 the one before it, and so on.
    Words shorter than three characters leave the missing blocks at zero.

    The digit test used to be applied to the whole word (`word not in
    string.digits`), which is a substring check, so digits inside longer
    tokens were filed under 'symbol'; it is now applied to the character.
    """
    def slot(ch):
        # Slot of a single character in dict_alphabet.
        if ch in string.ascii_lowercase or ch in string.digits:
            return dict_alphabet[ch]
        return dict_alphabet['symbol']

    l = 37 * 3
    prefix_three_feat = np.zeros(l)
    suffix_three_feat = np.zeros(l)
    for block, ch in enumerate(word[:3]):
        prefix_three_feat[slot(ch) + 37 * block] = 1
    # Walk the word backwards so block 0 holds the final character.
    for block, ch in enumerate(word[::-1][:3]):
        suffix_three_feat[slot(ch) + 37 * block] = 1
    return prefix_three_feat,suffix_three_feat

In [73]:
def contain_number(word):
    """Return a length-1 vector: 1 if any character of `word` is an ASCII digit, else 0."""
    has_digit = any([ch in string.digits for ch in word])
    flag = np.zeros(1)
    if has_digit:
        flag[0] = 1
    return flag
def contain_uppercase(word):
    """Return a length-1 vector: 1 if `word` has an ASCII uppercase letter, else 0.

    Uses string.ascii_uppercase: string.uppercase is locale-dependent in
    Python 2 and no longer exists in Python 3.
    """
    contain_uppercase_feat = np.zeros(1)
    if any(ch in string.ascii_uppercase for ch in word):
        contain_uppercase_feat[0] = 1
    return contain_uppercase_feat

def contain_hyphen(word):
    """Return a length-1 vector: 1 if `word` contains a hyphen ('-'), else 0.

    The earlier check fired on any character that was not a lowercase letter
    or a digit, so capitalised words and all punctuation were reported as
    containing a hyphen.
    """
    contain_hyphen_feat = np.zeros(1)
    if '-' in word:
        contain_hyphen_feat[0] = 1
    return contain_hyphen_feat

def rare_word(word,rare_words,not_rare_words):
    """One-hot over three classes: rare (slot 0), not rare (slot 1), in neither collection (slot 2)."""
    if word in rare_words:
        slot = 0
    elif word in not_rare_words:
        slot = 1
    else:
        slot = 2
    rarity = np.zeros(3)
    rarity[slot] = 1
    return rarity

In [74]:
# Lower-cased first training token, used to try out the feature functions below.
test_word = words[0].lower()

In [75]:
# Display the 3-character prefix/suffix encodings of the sample word.
prefix_suffix_three(word=test_word)

(array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.]),
 array([ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
      

In [76]:
# Smoke test: encode the lower-cased sample word against the training vocabulary.
word_feat = presentation_word(word=test_word,word_index=word_index)

In [77]:
def combination(position,sentence):
    """Build the full feature vector for the token at `position` of a tagged sentence.

    Parameters
    ----------
    position : int
        Index of the token inside `sentence`.
    sentence : sequence of (word, tag) pairs
        The tagged sentence; the current and previous tags are read from it.

    Returns
    -------
    numpy.ndarray
        1-D concatenation, in order: word, position, rarity, contains-number,
        contains-hyphen, contains-uppercase, 3-char prefix, 3-char suffix,
        (word, tag) pair, tag, previous tag, tag two back, previous word,
        word two back, next word, word two ahead.

    Reads the module-level word_index, pos_index, pairs_index, rare_words
    and not_rare_words.
    """
    sentence_word = [i[0] for i in sentence]
    sentence_tag = [i[1] for i in sentence]
    # word_index is keyed by lower-cased words, so neighbours must be
    # lower-cased before lookup just like the current word is.
    lowered_words = [token.lower() for token in sentence_word]

    word = sentence[position][0]
    tag = sentence[position][1]
    pair = sentence[position]

    word_feat = presentation_word(word=word.lower(),word_index=word_index)
    position_feat = position_feature(position)
    rare_word_feat = rare_word(word=word, rare_words=rare_words,not_rare_words=not_rare_words)
    contain_number_feat = contain_number(word=word)
    contain_uppercase_feat = contain_uppercase(word=word)
    contain_hyphen_feat = contain_hyphen(word=word)
    prefix_feat,suffix_feat = prefix_suffix_three(word=word.lower())
    pair_feat = presentation_pair(pair=pair,pairs_index=pairs_index)
    tag_feat = presentation_pos(pos=tag,pos_index=pos_index)

    # Blank tag vectors must have the same width as presentation_pos output
    # (len(pos_index)); otherwise the total length depends on the position.
    n_tags = len(pos_index)
    if position == 0:
        last_tag = np.zeros(n_tags)
    else:
        last_tag = presentation_pos(pos=sentence_tag[position-1],pos_index=pos_index)
    if position in (0, 1):
        last_two_tag = np.zeros(n_tags)
    else:
        last_two_tag = presentation_pos(pos=sentence_tag[position-2],pos_index=pos_index)

    last_word_feat,last_two_word_feat,next_word_feat,next_two_word_feat = neighbors_word(
        position=position,sentence=lowered_words,word_index=word_index)
    feature_vector=np.concatenate((word_feat,position_feat,rare_word_feat,contain_number_feat,contain_hyphen_feat,contain_uppercase_feat,
                                  prefix_feat,suffix_feat,pair_feat,tag_feat,last_tag,last_two_tag,last_word_feat,last_two_word_feat,next_word_feat,next_two_word_feat),axis=0)
    return feature_vector

In [80]:
def feature_vector(position,sentence,tag,tag_1,tag_2):
    """Build the feature vector for one token under a hypothesised tag history.

    Parameters
    ----------
    position : int
        Index of the token inside `sentence`.
    sentence : sequence of str
        The words of the sentence (untagged).
    tag : str
        Tag proposed for the current token.
    tag_1, tag_2 : str or None
        Tags of the previous token and of the one before it. Pass None when
        no such token exists; it is encoded as an all-zero vector.

    Returns
    -------
    numpy.ndarray
        1-D concatenation, in order: word, contains-number,
        contains-uppercase, contains-hyphen, 3-char prefix, 3-char suffix,
        (word, tag) pair, tag_2, tag_1, tag, position, previous word,
        word two back, next word, word two ahead.

    Reads the module-level word_index, pos_index and pairs_index.
    """
    def encode_tag(name):
        # A missing history tag becomes a blank vector of the same width.
        if name is None:
            return np.zeros(len(pos_index))
        return presentation_pos(pos=name, pos_index=pos_index)

    #word-level features
    word = sentence[position]
    # word_index is keyed by lower-cased words, so look words up lower-cased.
    lowered_sentence = [token.lower() for token in sentence]
    word_feat = presentation_word(word=word.lower(), word_index=word_index)
    contain_number_feat = contain_number(word=word)
    contain_uppercase_feat = contain_uppercase(word=word)
    contain_hyphen_feat = contain_hyphen(word=word)
    prefix_feat,suffix_feat = prefix_suffix_three(word=word.lower())
    pair_feat = presentation_pair(pair=(word,tag),pairs_index=pairs_index)

    #tag-level features
    tag_feat = presentation_pos(pos=tag, pos_index=pos_index)
    tag_1_feat = encode_tag(tag_1)
    tag_2_feat = encode_tag(tag_2)
    #position feature
    position_feat = position_feature(position=position)
    last_word_feat,last_two_word_feat,next_word_feat,next_two_word_feat = neighbors_word(
        position=position,sentence=lowered_sentence,word_index=word_index)
    # The original assembled this array but never returned it.
    return np.concatenate((word_feat,contain_number_feat,contain_uppercase_feat,contain_hyphen_feat,
        prefix_feat,suffix_feat,pair_feat,
        tag_2_feat,tag_1_feat,tag_feat,
        position_feat,last_word_feat,last_two_word_feat,next_word_feat,next_two_word_feat
        ),axis = 0)

In [79]:
# NOTE(review): feature_vector is defined above with five required parameters
# (position, sentence, tag, tag_1, tag_2) but this call passes only `position`.
# The array shown below presumably came from an earlier definition -- confirm and update the call.
feature_vector(0)

array([ 0.,  0.,  0., ...,  0.,  0.,  0.])

In [85]:
# Words and tags of the first sentence as two parallel lists.
sentence = [tagged[0] for tagged in new_sentences[0]]
tag = [tagged[1] for tagged in new_sentences[0]]

In [86]:
# Display the words and tags of the first sentence side by side.
sentence,tag

([u'Pierre',
  u'Vinken',
  u',',
  u'61',
  u'years',
  u'old',
  u',',
  u'will',
  u'join',
  u'the',
  u'board',
  u'as',
  u'a',
  u'nonexecutive',
  u'director',
  u'Nov.',
  u'29',
  u'.'],
 [u'NNP',
  u'NNP',
  u',',
  u'CD',
  u'NNS',
  u'JJ',
  u',',
  u'MD',
  u'VB',
  u'DT',
  u'NN',
  u'IN',
  u'DT',
  u'JJ',
  u'NN',
  u'NNP',
  u'CD',
  u'.'])

In [None]:
# Features for the first comma of the sentence (index 2). Both preceding
# tokens, 'Pierre' and 'Vinken', are tagged NNP. The original line ended in a
# bare positional `t` after keyword arguments, which is a syntax error.
feature_vector(position=2, sentence=sentence, tag=',', tag_1='NNP', tag_2='NNP')