In [45]:
from sklearn.feature_extraction.text import *
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
import sklearn.naive_bayes as naive_bayes
import json
import numpy as np
import nltk.corpus
import nltk
import string

In [46]:
# Tokens to discard: NLTK's English stop-word list plus each single character
# of string.punctuation.
stop_words = nltk.corpus.stopwords.words('english') + list(string.punctuation)


def clean_article(s):
    """Tokenize ``s`` and return the tokens that are not in ``stop_words``.

    Membership is an exact, case-sensitive string comparison against the
    module-level ``stop_words`` list.

    Args:
        s: Raw article text.

    Returns:
        list: The surviving tokens, in their original order.
    """
    kept = []
    for token in nltk.word_tokenize(s):
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def load_data():
    """Load ``stories.json`` and tokenize the ``Content`` of every article.

    The file is read from the current working directory. For each of the
    ``train``, ``test`` and ``dev`` splits, every article's ``Content`` string
    is replaced by the token list returned by ``clean_article``; all other
    fields are left untouched.

    Returns:
        dict: The parsed JSON object, with each ``Content`` replaced by a
        list of tokens.

    Raises:
        KeyError: If one of the three splits, or an article's ``Content``
            field, is missing.
    """
    # Context manager so the file handle is closed even if parsing fails.
    with open('stories.json') as stories_file:
        data = json.load(stories_file)
    for dataset in ['train', 'test', 'dev']:
        for article in data[dataset]:
            # Each article is a dict, so assigning through the loop variable
            # updates the entry stored in data[dataset] in place.
            article['Content'] = clean_article(article['Content'])
    return data



In [47]:
# Load every split, then rebuild each training article as a single string of
# its cleaned tokens joined by spaces.
data = load_data()
corpus = [
    ' '.join(tokens)
    for tokens in (article['Content'] for article in data['train'])
]
#words = nltk.word_tokenize(corpus)

In [128]:
# create word dictionary
# Map every distinct token of the training corpus to a dense integer index
# (0 .. len(word_dict) - 1). Tokens are numbered in order of first appearance,
# so the token -> column mapping is identical on every run; numbering while
# iterating over a set of strings is not reproducible because string hashing
# is randomized per interpreter process.
word_dict = {}
idx = 0  # next free index; equals len(word_dict) at all times
for cor in corpus:
    for w in nltk.word_tokenize(cor):
        if w not in word_dict:
            word_dict[w] = idx
            idx += 1
            
# create POS dictionary
# Each part-of-speech tag is mapped to its position in the list below, giving
# the dense indices 0..44 used for the one-hot POS slots.
pos_dict = {
    tag: position
    for position, tag in enumerate([
        'CC', 'CD', 'DT', 'EX', 'FW', 'IN', 'JJ', 'JJR', 'JJS', 'LS',
        'MD', 'NN', 'NNP', 'NNPS', 'NNS', 'PDT', 'POS', 'PRP', 'PRP$', 'RB',
        'RBR', 'RBS', 'RP', 'SYM', 'TO', 'UH', 'VB', 'VBD', 'VBG', 'VBN',
        'VBP', 'VBZ', 'WDT', 'WP', 'WP$', 'WRB', '``', '$', '\'\'', '(',
        ')', ',', '--', '.', ':',
    ])
}

In [129]:
# create feature vector (based on a token)
# Every token that has window_sz neighbours on both sides gets one vector made
# of (2*window_sz + 1) one-hot word slots followed by (2*window_sz + 1) one-hot
# POS slots. Tokens closer than window_sz to either end of an article are
# skipped.
window_sz = 2

# Context positions in the order their slots are laid out: the token itself,
# then the preceding tokens (nearest first), then the following tokens
# (nearest first). For window_sz = 2 this is [0, -1, -2, +1, +2].
offsets = ([0]
           + [-k for k in range(1, window_sz + 1)]
           + [k for k in range(1, window_sz + 1)])

# Sizes are loop-invariant, so compute them once instead of once per token.
n_words = len(word_dict)
n_pos = len(pos_dict)
pos_base = len(offsets) * n_words  # first column of the POS slots
length = pos_base + len(offsets) * n_pos

x = []
for cor in corpus:
    token_lst = nltk.word_tokenize(cor)
    tags = nltk.pos_tag(token_lst)  # list of (token, POS tag) pairs
    for i in range(window_sz, len(tags) - window_sz):
        feature = np.zeros(length)
        for slot, offset in enumerate(offsets):
            word, pos = tags[i + offset]
            feature[word_dict[word] + slot * n_words] = 1
            feature[pos_dict[pos] + pos_base + slot * n_pos] = 1
        x.append(feature)
    

In [121]:
# create set for all possible person words
# Join each training article's 'Persons' entries into one string, tokenize it,
# and keep every token that, tagged on its own, is NNP, NN, NNS or JJ.
persons = [' '.join(article['Persons']) for article in data['train']]
person = set()
for joined_names in persons:
    for candidate in nltk.word_tokenize(joined_names):
        # Skip tokens that were already accepted; this avoids tagging them again.
        if candidate in person:
            continue
        tag = nltk.pos_tag([candidate])[0][1]
        if tag in {'NNP', 'NN', 'NNS', 'JJ'}:
            person.add(candidate)

In [150]:
# create labels for each word
# One label per token that has a full context window on both sides: 1 if the
# token is in `person`, otherwise -1. The margin comes from window_sz instead
# of a literal 2, so the labels stay aligned one-to-one with the feature
# vectors in x if the window size is ever changed.
y_per = []
for cor in corpus:
    token_lst = nltk.word_tokenize(cor)
    for i in range(window_sz, len(token_lst) - window_sz):
        y_per.append(1 if token_lst[i] in person else -1)

In [152]:
# Number of per-token feature vectors collected in x.
len(x)

146116

In [138]:
# Demo: label each token of one sentence with 1 if it is in `person`, else -1.
idx = 0
token_lst = nltk.word_tokenize("The office of David Davis, Secretary of State")
y = [1 if w in person else -1 for w in token_lst]

In [146]:
# Show the demo labels, then the same labels with the first two and last two
# entries sliced off.
print(y)
print(y[2:-2])

[-1, -1, -1, 1, 1, -1, 1, -1, 1]
[-1, 1, 1, -1, 1]


In [140]:
# Worked example: windowed one-hot features for a single sentence.
# NOTE(review): this cell rebinds `word_dict` to a vocabulary containing only
# this sentence's tokens, replacing any previously built dictionary; re-run
# the dictionary cell before rebuilding `x`.
window_sz=2
token_lst = nltk.word_tokenize("The office of David Davis, Secretary of State")
tags = nltk.pos_tag(token_lst)  # list of (token, POS tag) pairs

# Number the sentence's distinct tokens in order of first appearance, so the
# column assignment is the same on every run (set iteration order is not).
word_dict = {}
idx = 0
for w in token_lst:
    if w not in word_dict:
        word_dict[w] = idx
        idx += 1

# Slot order: the token itself, preceding tokens (nearest first), following
# tokens (nearest first) -> [0, -1, -2, +1, +2] for window_sz = 2.
offsets = ([0]
           + [-k for k in range(1, window_sz + 1)]
           + [k for k in range(1, window_sz + 1)])
n_words = len(word_dict)
n_pos = len(pos_dict)
pos_base = len(offsets) * n_words  # first column of the POS slots
length = pos_base + len(offsets) * n_pos

x_t = []
# Only tokens with window_sz neighbours on both sides get a feature vector.
for i in range(window_sz, len(tags) - window_sz):
    feature = np.zeros(length)
    for slot, offset in enumerate(offsets):
        word, pos = tags[i + offset]
        feature[word_dict[word] + slot * n_words] = 1
        feature[pos_dict[pos] + pos_base + slot * n_pos] = 1
    x_t.append(feature)

In [153]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [155]:
# Toy TF-IDF example on four short sentences.
# NOTE(review): this rebinds the name `corpus`, which earlier cells use for
# the training articles; re-run the loading cell before rebuilding features.
corpus = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?',]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(X)
# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when it exists (converted to a plain list so the printed form is unchanged)
# and fall back to the old method on older releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)

  (0, 8)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
  (1, 8)	0.281088674033753
  (1, 3)	0.281088674033753
  (1, 6)	0.281088674033753
  (1, 1)	0.6876235979836938
  (1, 5)	0.5386476208856763
  (2, 8)	0.267103787642168
  (2, 3)	0.267103787642168
  (2, 6)	0.267103787642168
  (2, 0)	0.511848512707169
  (2, 7)	0.511848512707169
  (2, 4)	0.511848512707169
  (3, 8)	0.38408524091481483
  (3, 3)	0.38408524091481483
  (3, 6)	0.38408524091481483
  (3, 2)	0.5802858236844359
  (3, 1)	0.46979138557992045
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
