In [2]:
from src.Dataset import Dataset
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.linear_model import Perceptron

import numpy as np
import pandas as pd

In [3]:
train_fr=Dataset(filename="fr_gsd-ud-train.conllu")
train_en=Dataset(filename="en_ewt-ud-train.conllu")

In [40]:

def eval_perceptron(train,features,limit_n=False,do_pca=False,pca_ncomponents=1000):

    if not limit_n:
        limit_n=train.nb_tokens

    X = [{"word": train.types[train.X[i]],
        "left_neighbor":train.types[train.X[i-1]],
        "right_neighbor":train.types[train.X[(i+1) % train.nb_tokens]],
        "pos_in_sentence":0,
        "end_word3":train.types[train.X[i]][-3:],
        "end_word2":train.types[train.X[i]][-2:],
        "word_length":len(train.types[train.X[i]]),
        "word_length_left":len(train.types[train.X[i-1]]),
        "word_length_right":len(train.types[train.X[(i+1) % train.nb_tokens]]),
        "left_pos":"",
        "after_comma":0,
        "before_comma":0
        } for i in range(train.nb_tokens)]

    for i in range(train.nb_sentences):
        X[train.start_sentences[i]]["left_neighbor"]=""
        X[train.start_sentences[i]+train.len_sentences[i]-1]["right_neighbor"]=""
        X[train.start_sentences[i]]["word_length_left"]=0
        X[train.start_sentences[i]+train.len_sentences[i]-1]["word_length_right"]=0
        for j in range(train.len_sentences[i]):
            s=train.start_sentences[i]
            if train.len_sentences[i]>1:
                X[s+j]["pos_in_sentence"]=(j/(train.len_sentences[i]-1))
            X[s+j]["left_end_word3"]=X[s+j]["left_neighbor"][-3:]
            X[s+j]["left_end_word2"]=X[s+j]["left_neighbor"][-2:]
            X[s+j]["right_end_word3"]=X[s+j]["right_neighbor"][-3:]
            X[s+j]["right_end_word2"]=X[s+j]["right_neighbor"][-2:]
            if X[s+j]["left_neighbor"]!="":
                X[s+j]["left_pos"]=train.pos[train.y[s+j-1]]
            X[s+j]["after_comma"]=(X[s+j]["left_neighbor"]==",")
            X[s+j]["before_comma"]=(X[s+j]["right_neighbor"]==",")

    X_filtered=[{key:X[i][key] for key in features} for i in range(train.nb_tokens)]


    v = DictVectorizer(sparse=(not do_pca))
    X_matrix = v.fit_transform(X_filtered[:limit_n])
    print(X_matrix.shape)


    if do_pca:
        pca = PCA(n_components=pca_ncomponents)
        X_matrix=pca.fit_transform(X_matrix)

    Y=np.array([train.pos[train.y[i]] for i in range(train.nb_tokens)])

    X_train, X_test, y_train, y_test = train_test_split(X_matrix, Y[:limit_n], test_size=0.2, random_state=42)
    print(X_train.shape)


    clf = Perceptron(random_state=0)
    clf.fit(X_train, y_train)

    return clf.score(X_test, y_test)

In [8]:
eval_perceptron(train_fr,["word"])

(354558, 42332)


0.8648888763537906

In [12]:
eval_perceptron(train_fr,["word","left_neighbor","right_neighbor"])

(354558, 125575)


0.929814417870036

In [14]:
eval_perceptron(train_en,["word","left_neighbor","right_neighbor","pos_in_sentence"])

(204576, 57210)


0.9214488219767328

In [17]:
eval_perceptron(train_fr,["word","left_neighbor","right_neighbor","pos_in_sentence","before_comma","after_comma"])

(354558, 125578)


0.9296169900722022

In [24]:
eval_perceptron(train_fr,["word","left_neighbor","right_neighbor","pos_in_sentence","end_word3","end_word2","word_length"])

(354558, 133664)


0.9573273916967509

In [26]:
eval_perceptron(train_en,["word","left_neighbor","right_neighbor","pos_in_sentence","end_word3","end_word2","word_length","left_end_word3","left_end_word2","right_end_word3","right_end_word2"])

(204576, 72919)


0.9365284974093264

In [29]:
eval_perceptron(train_fr,["word","left_neighbor","right_neighbor","pos_in_sentence","end_word3","end_word2","word_length","left_end_word3","left_end_word2","right_end_word3","right_end_word2","word_length_left","word_length_right"])

(354558, 149660)


0.9458624774368231

In [43]:
eval_perceptron(train_en,["word"],do_pca=True,limit_n=15000)

(15000, 3656)
(12000, 1000)


0.7793333333333333

In [45]:
eval_perceptron(train_fr,["word"],do_pca=True,limit_n=15000)

(15000, 4790)
(12000, 1000)


0.7846666666666666

In [41]:
eval_perceptron(train_en,["word","left_neighbor","right_neighbor","pos_in_sentence","end_word3","end_word2","word_length"],do_pca=True,limit_n=15000)

(15000, 12400)
(12000, 1000)


0.8543333333333333

In [42]:

eval_perceptron(train_fr,["word","left_neighbor","right_neighbor","pos_in_sentence","end_word3","end_word2","word_length"],do_pca=True,limit_n=15000)

(15000, 16189)
(12000, 1000)


0.8453333333333334