In [1]:
import json
import evaluation_shared_task as evalfun
import numpy as np

In [2]:
LANG = "en"

In [3]:
WINDOW_SIZE_LIST = [5,10,20]

In [4]:
train = json.load(open("data/Train_"+LANG+"_new.json","rb"))

In [5]:
dev = json.load(open("data/Dev_"+LANG+"_new.json","rb"))

In [6]:
train["text_splited"] = train["text"].split(" ")
dev["text_splited"] = dev["text"].split(" ")

In [7]:
train["sentences_list"] = {}
dev["sentences_list"] = {}
for WINDOW_SIZE in WINDOW_SIZE_LIST:
    PAD = "<pad>"
    train["sentences_list"][WINDOW_SIZE] = [PAD for i in range(WINDOW_SIZE)]
    train["sentences_list"][WINDOW_SIZE].extend([word.lower().replace("\n","") for word in train["text_splited"]])
    train["sentences_list"][WINDOW_SIZE].extend([PAD for i in range(WINDOW_SIZE)])
    dev["sentences_list"][WINDOW_SIZE] = [PAD for i in range(WINDOW_SIZE)]
    dev["sentences_list"][WINDOW_SIZE].extend([word.lower().replace("\n","") for word in dev["text_splited"]])
    dev["sentences_list"][WINDOW_SIZE].extend([PAD for i in range(WINDOW_SIZE)])

In [8]:
from gensim.models import word2vec
import glob

In [9]:
model_name_list = glob.glob("model-"+LANG+"-*")

In [10]:
import nltk
pos_tag_dict = {}
for i, (k, v) in enumerate(nltk.data.load('help/tagsets/upenn_tagset.pickle').items()):
    pos_tag_dict[k] = i
pos_tag_dict["other"] = i + 1

In [11]:
def w2v(word,_w2v_model,_w2v_dim):
    try:
        word_vector = _w2v_model.wv[word.lower().replace("\n","")]
    except:
        word_vector = np.array([0.0 for i in range(_w2v_dim)])
    try:
        tag = pos_tag_dict[nltk.pos_tag([word])[0][1]]
    except:
        tag = pos_tag_dict["other"]
    pos_vec = np.array([1.0 if i == tag  else 0.0 for i in range(len(pos_tag_dict))])
    ##数字記号:0, 全部小文字: 1, 全部大文字: 2, 大文字1+小文字: 3, other:4
    class_ul = 0
    if word == PAD:
        class_ul = 4
    elif not word.isalpha():
        class_ul = 0
    elif word.lower() == word:
        class_ul = 1
    elif word.upper() == word:
        class_ul = 2
    elif word[0].upper() == word[0] and word[1:].lower() == word[1:]:
        class_ul = 3
    else:
        class_ul = 4
    capital_vec = np.array([1.0 if i == class_ul else 0.0 for i in range(5)])
    ###"\n"が含まれているか
    if word[-1:] == "\n":
        n_vec = np.array([1.0])
    else:
        n_vec = np.array([0.0])
    return np.concatenate([word_vector, pos_vec, capital_vec,n_vec])

In [12]:
def mkInput(words_list,_w2v_model,_w2v_dim):
    return np.concatenate([w2v(word,_w2v_model,_w2v_dim) for word in words_list])

In [13]:
def mkAllInput(_data,_window_size,_w2v_model,_w2v_dim):
    #labels = mkLabel()
    inputs = mkInput(_data["sentences_list"][_window_size],_w2v_model,_w2v_dim)
    data_len = int(len(inputs) / len(_data["sentences_list"][_window_size]))
    return [inputs[i*data_len:(i+_window_size*2+1)*data_len] for i in range(len(_data["text_splited"]))]

In [14]:
def mkLabel(_data):
    labels = np.array([0 for i in range(len(_data["text_splited"]))])
    for i in _data["begin_sentence"]:
        labels[i] = 1
    for i in _data["end_sentence"]:
        labels[i] = 2
    return labels

In [15]:
import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision.transforms as transforms
import numpy as np
from matplotlib import pyplot as plt

In [16]:
num_classes = 3

class MLPNet (nn.Module):
    def __init__(self, _input_len,_hidden_dim):
        super(MLPNet, self).__init__()
        self.fc1 = nn.Linear(_input_len, _hidden_dim)
        self.fc2 = nn.Linear(_hidden_dim, _hidden_dim)
        self.fc3 = nn.Linear(_hidden_dim, num_classes)
        self.dropout1 = nn.Dropout2d(0.2)
        self.dropout2 = nn.Dropout2d(0.2)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.dropout1(x)
        x = F.relu(self.fc2(x))
        x = self.dropout2(x)
        return F.relu(self.fc3(x))

In [17]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [18]:
model_list = {}
for model_name in model_name_list:
    [_, _lang, _window_size, _w2v_dim, _hidden_dim] = model_name.split("-")
    _window_size, _w2v_dim, _hidden_dim = int(_window_size), int(_w2v_dim), int(_hidden_dim)
    model_list[model_name] = {"lang":_lang, "window_size":_window_size, "w2v_dim":_w2v_dim, "hidden_dim":_hidden_dim}
    model_list[model_name]["w2v_model"] = word2vec.Word2Vec.load(model_name + "/word2vec.gensim.model")
    _input_len = (_window_size * 2 + 1) * (_w2v_dim + len(pos_tag_dict) + 6)
    model_list[model_name]["input_len"] = _input_len
    model_list[model_name]["net"] = MLPNet(_input_len,_hidden_dim).to(device)
    checkpoint = torch.load(sorted(glob.glob(model_name+"/v*"))[-1], map_location='cpu')#'cuda:0')
    model_list[model_name]["net"].load_state_dict(checkpoint["model"])
    model_list[model_name]["net"].eval()

In [19]:
from sklearn.metrics import classification_report
def label_seq(data):
    """
    Input: 
        data: dictionary of text, begin_sentences, end_sentences
    Output:
        a sequence of labels where each token from the text is assiciated with a label:
            regular token --> O
            begin sentences token --> BS
            end sentences token --> ES
    """
    text = data["text"].split(" ")
    
    True_Begin_sentences = data["begin_sentence"]
    True_End_sentences = data["end_sentence"]
    
    labels_train = ["O"] * len(text)    
    for index in True_Begin_sentences:
        labels_train[index] ="BS"
    for index in True_End_sentences:
        labels_train[index] ="ES"
    return labels_train


def evaluate_result(data_true,data_pred):
    """
    Report the score of Begin sentence and end sentences and regular labels
    NB : Only F1 score of BS and ES will be taken under account in the evaluation (first two lines of the report)
    data_true: a json file of the ground truth (rain_fr.json for instance)
    data_pred is a json file in the same format as the json data files
    """
    labels_true = label_seq(data_true)       
    labels_pred = label_seq(data_pred)

    
    target_names= ["O","BS","ES"]
    tag2idx = {t: i for i, t in enumerate(target_names)}
    
    y_true = [tag2idx[i] for i in labels_true]
    y_pred = [tag2idx[i] for i in labels_pred]
    return classification_report(y_true, y_pred, target_names=target_names)

In [20]:
for model_name in model_name_list:
    _lang = model_list[model_name]["lang"]
    _window_size = model_list[model_name]["window_size"]
    _w2v_dim = model_list[model_name]["w2v_dim"]
    _hidden_dim = model_list[model_name]["hidden_dim"]
    _w2v_model = model_list[model_name]["w2v_model"]
    _net = model_list[model_name]["net"]
    inputs_dev = mkAllInput(dev,_window_size,_w2v_model,_w2v_dim)
    inputs_dev = torch.tensor(inputs_dev).float().to(device)
    results_dev = _net(inputs_dev).argmax(1)
    dev2 = {}
    dev2["text"] = dev["text"]
    dev2["begin_sentence"] = []
    dev2["end_sentence"] = []
    for i, _label in enumerate(results_dev):
        if _label == 1:
            dev2["begin_sentence"].append(i)
        elif _label == 2:
            dev2["end_sentence"].append(i)
    model_list[model_name]["macro_f1"] = evaluate_result(dev,dev2).split("\n")[7].split(" ")[23]

In [21]:
model_list

{'model-en-10-50-1200': {'lang': 'en',
  'window_size': 10,
  'w2v_dim': 50,
  'hidden_dim': 1200,
  'w2v_model': <gensim.models.word2vec.Word2Vec at 0x7fbdc5fb3cf8>,
  'input_len': 2142,
  'net': MLPNet(
    (fc1): Linear(in_features=2142, out_features=1200, bias=True)
    (fc2): Linear(in_features=1200, out_features=1200, bias=True)
    (fc3): Linear(in_features=1200, out_features=3, bias=True)
    (dropout1): Dropout2d(p=0.2)
    (dropout2): Dropout2d(p=0.2)
  ),
  'macro_f1': '0.90'},
 'model-en-20-50-1200': {'lang': 'en',
  'window_size': 20,
  'w2v_dim': 50,
  'hidden_dim': 1200,
  'w2v_model': <gensim.models.word2vec.Word2Vec at 0x7fbdbf920588>,
  'input_len': 4182,
  'net': MLPNet(
    (fc1): Linear(in_features=4182, out_features=1200, bias=True)
    (fc2): Linear(in_features=1200, out_features=1200, bias=True)
    (fc3): Linear(in_features=1200, out_features=3, bias=True)
    (dropout1): Dropout2d(p=0.2)
    (dropout2): Dropout2d(p=0.2)
  ),
  'macro_f1': '0.90'},
 'model-en-2

In [22]:
test = dev

In [23]:
for model_name in model_name_list:
    _lang = model_list[model_name]["lang"]
    _window_size = model_list[model_name]["window_size"]
    _w2v_dim = model_list[model_name]["w2v_dim"]
    _hidden_dim = model_list[model_name]["hidden_dim"]
    _w2v_model = model_list[model_name]["w2v_model"]
    _net = model_list[model_name]["net"]
    inputs_dev = mkAllInput(test,_window_size,_w2v_model,_w2v_dim)
    inputs_dev = torch.tensor(inputs_dev).float().to(device)
    results_dev = _net(inputs_dev).argmax(1)
    test2 = {}
    test2["text"] = test["text"]
    test2["begin_sentence"] = []
    test2["end_sentence"] = []
    for i, _label in enumerate(results_dev):
        if _label == 1:
            test2["begin_sentence"].append(i)
        elif _label == 2:
            test2["end_sentence"].append(i)
    model_list[model_name]["test_begin_sentence"] = test2["begin_sentence"]
    model_list[model_name]["test_end_sentence"] = test2["end_sentence"]

In [24]:
final = {}
final["text"] = test["text"]

In [25]:
vote_agree = {}
vote_agree["begin_sentence"] = {}
vote_agree["end_sentence"] = {}
for model_name in model_name_list:
    for i in model_list[model_name]["test_begin_sentence"]:
        if i not in vote_agree["begin_sentence"]:
            vote_agree["begin_sentence"][i] = 1
        else:
            vote_agree["begin_sentence"][i] += 1
    for i in model_list[model_name]["test_end_sentence"]:
        if i not in vote_agree["end_sentence"]:
            vote_agree["end_sentence"][i] = 1
        else:
            vote_agree["end_sentence"][i] += 1

In [26]:
model_num = len(model_list)

In [27]:
thr = model_num * 0.66
thr

17.82

In [28]:
final["begin_sentence"] = []
final["end_sentence"] = []
for i,votes in vote_agree["begin_sentence"].items():
    if votes >=thr:
        final["begin_sentence"].append(i)
for i,votes in vote_agree["end_sentence"].items():
    if votes >=thr:
        final["end_sentence"].append(i)

In [29]:
evalfun.evaluate_result(test,final)

              precision    recall  f1-score   support

           O       1.00      0.99      0.99     47091
          BS       0.81      0.87      0.84      1384
          ES       0.87      0.98      0.92      1384

   micro avg       0.99      0.99      0.99     49859
   macro avg       0.89      0.95      0.92     49859
weighted avg       0.99      0.99      0.99     49859

