In [1]:
from bs4 import BeautifulSoup
from string import punctuation
punctuation_marks = punctuation + '»«–…'
import os
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import warnings
warnings.filterwarnings("ignore")

In [2]:
!python --version

Python 3.6.8


http://wiki.python.su/Документации/BeautifulSoup

In [3]:
def get_sent_info(clusters):
    d_sents = {}
    for cl in clusters:
        units = cl.findAll("alignunit")
        for u in units:
            sents = u.findAll("sent")
            for s in sents:
                _id_sent = s['sent_id']
                tokens = s.findAll('tok')
                sent = ''
                l_tokens = []
                for t in tokens:
                    _id_tok = t['id']
                    lemma = t['mate_lemma']
                    pos = t['mate_pos']
                    token = t.string
                    l_tokens.append([_id_tok, lemma, token, pos])
                    if token in punctuation_marks:
                        sent += token
                    else:
                        if sent.endswith("'"):
                            sent += token
                        else:
                            sent += ' ' + token
                d_sents[_id_sent] = [sent.strip(), l_tokens]
    return d_sents

In [4]:
def get_shell_nouns(y):
    shell_nouns = y.europarl_chunk.shellnouns.findAll("shellnoun")
    d_shell_nouns = {}
    for sn in shell_nouns:
        span = sn['span']
        value = sn['value']
        d_shell_nouns[span] = [value]
    return d_shell_nouns

In [5]:
def get_info(file):
    with open(file, 'r') as file:
        f = file.read()
    y=BeautifulSoup(f)
    clusters = y.europarl_chunk.findAll("turn", lang="en")
    d_sents = get_sent_info(clusters)
    d_shell_nouns = get_shell_nouns(y)
    return d_sents, d_shell_nouns

In [6]:
main_dir = 'annotator1/'
xmls = []
for root, dirs, files in os.walk(main_dir):
    for name in files:
        xmls.append(os.path.join(root, name))

In [7]:
all_info = []

In [8]:
for i, xml in tqdm(enumerate(xmls)):
    d_sents, d_shell_nouns = get_info(xml)
    for se in d_sents:
        whole_sent = d_sents[se][0] # предложение целиком
        tokens = d_sents[se][1]
        for t in tokens:
            if t[0] in d_shell_nouns.keys():
                d_shell_nouns[t[0]].append(t[2]) # присоединяем токен
                d_shell_nouns[t[0]].append(t[1]) # присоединяем лемму
                d_shell_nouns[t[0]].append(se) # присоединяем id предложения
                d_shell_nouns[t[0]].append(d_sents[se][0]) # присоединяем предложение
                all_info.append([xml, t[0]] + d_shell_nouns[t[0]])

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [9]:
all_info[0]

['annotator1/ep-01-02-28.xml',
 't_455',
 'true',
 'view',
 'view',
 's_14',
 'Would he share with me the view that, if the Council took more interest in his reforms, it would help their speedy and thorough implementation to achieve the end that we all want, which is the most professional and effective European civil service.']

In [10]:
len(all_info)

1018

In [11]:
from parsing import ParserUDpipe

In [12]:
res = pd.DataFrame(data={'Id': [], 'Form': [], 'Lemma': [],
                         'UPosTag': [], 'XPosTag': [], 'Feats': [],
                         'Head': [], 'DepRel': [], 'Deps': [],
                         'Misc': [], 'Tag': [], 'Sent': []})
i = 0
for inf in all_info:
    try:
        token = inf[3]
        sent = inf[6]
        value = inf[2]
        parser = ParserUDpipe(sent)
        df = parser.conllu2df()
        tags = ['context'] * len(df)
        tags[list(df['Form']).index(token)] = value
        df['Tag'] = tags
        df['Sent'] = [i] * len(df)
        res = pd.concat([res, df])
    except:
        print(i)
    i += 1

910


In [13]:
res = res.drop(['Id', 'XPosTag', 'Feats', 'Head', 'Deps', 'Misc'], axis=1)

In [14]:
def get_right_context(lemmas, window):
    return list(lemmas)[window:] + ['END'] * window
def get_left_context(lemmas, window):
    return ['START'] * window + list(lemmas)[:-window]

In [15]:
next_lemma = []
next_next_lemma = []
next_next_next_lemma = []
next_next_next_next_lemma = []
for s in set(res['Sent']):
    lemmas = res[res['Sent'] == s]['Lemma']
    next_lemma += get_right_context(lemmas, 1)
    next_next_lemma += get_right_context(lemmas, 2)
    next_next_next_lemma += get_right_context(lemmas, 3)
    next_next_next_next_lemma += get_right_context(lemmas, 4)
res['next_lemma'] = next_lemma
res['next_next_lemma'] = next_next_lemma
res['next_next_next_lemma'] = next_next_next_lemma
res['next_next_next_next_lemma'] = next_next_next_next_lemma

next_pos = []
next_next_pos = []
next_next_next_pos = []
next_next_next_next_pos = []
for s in set(res['Sent']):
    poss = res[res['Sent'] == s]['UPosTag']
    next_pos += get_right_context(poss, 1)
    next_next_pos += get_right_context(poss, 2)
    next_next_next_pos += get_right_context(poss, 3)
    next_next_next_next_pos += get_right_context(poss, 4)
res['next_pos'] = next_pos
res['next_next_pos'] = next_next_pos
res['next_next_next_pos'] = next_next_next_pos
res['next_next_next_next_pos'] = next_next_next_next_pos

next_rel = []
next_next_rel = []
next_next_next_rel = []
next_next_next_next_rel = []
for s in set(res['Sent']):
    rels = res[res['Sent'] == s]['DepRel']
    next_rel += get_right_context(rels, 1)
    next_next_rel += get_right_context(rels, 2)
    next_next_next_rel += get_right_context(rels, 3)
    next_next_next_next_rel += get_right_context(rels, 4)
res['next_rel'] = next_rel
res['next_next_rel'] = next_next_rel
res['next_next_next_rel'] = next_next_next_rel
res['next_next_next_next_rel'] = next_next_next_next_rel

In [16]:
pre_lemma = []
pre_pre_lemma = []
pre_pre_pre_lemma = []
pre_pre_pre_pre_lemma = []
for s in set(res['Sent']):
    lemmas = res[res['Sent'] == s]['Lemma']
    pre_lemma += get_left_context(lemmas, 1)
    pre_pre_lemma += get_left_context(lemmas, 2)
    pre_pre_pre_lemma += get_left_context(lemmas, 3)
    pre_pre_pre_pre_lemma += get_left_context(lemmas, 4)
res['pre_lemma'] = pre_lemma
res['pre_pre_lemma'] = pre_pre_lemma
res['pre_pre_pre_lemma'] = pre_pre_pre_lemma
res['pre_pre_pre_pre_lemma'] = pre_pre_pre_pre_lemma

pre_pos = []
pre_pre_pos = []
pre_pre_pre_pos = []
pre_pre_pre_pre_pos = []
for s in set(res['Sent']):
    poss = res[res['Sent'] == s]['UPosTag']
    pre_pos += get_left_context(poss, 1)
    pre_pre_pos += get_left_context(poss, 2)
    pre_pre_pre_pos += get_left_context(poss, 3)
    pre_pre_pre_pre_pos += get_left_context(poss, 4)
res['pre_pos'] = pre_pos
res['pre_pre_pos'] = pre_pre_pos
res['pre_pre_pre_pos'] = pre_pre_pre_pos
res['pre_pre_pre_pre_pos'] = pre_pre_pre_pre_pos

pre_rel = []
pre_pre_rel = []
pre_pre_pre_rel = []
pre_pre_pre_pre_rel = []
for s in set(res['Sent']):
    rels = res[res['Sent'] == s]['DepRel']
    pre_rel += get_left_context(rels, 1)
    pre_pre_rel += get_left_context(rels, 2)
    pre_pre_pre_rel += get_left_context(rels, 3)
    pre_pre_pre_pre_rel += get_left_context(rels, 4)
res['pre_rel'] = pre_rel
res['pre_pre_rel'] = pre_pre_rel
res['pre_pre_pre_rel'] = pre_pre_pre_rel
res['pre_pre_pre_pre_rel'] = pre_pre_pre_pre_rel

In [17]:
that = []
for s in set(res['Sent']):
    lemmas = list(res[res['Sent'] == s]['Lemma'])
    if 'that' in lemmas:
        that += [1] * len(lemmas)
    else:
        that += [0] * len(lemmas)
res['that'] = that

# Binary classification

In [18]:
df = res[~(res['Tag'] == 'context')]
df = df[~(df['Tag'] == 'undefined')]
df = df[~(df['Tag'] == 'unclear')]
df['Tag'] = df['Tag'].map({'true': 1, 'false': 0})

In [19]:
df.shape

(993, 31)

In [20]:
df.head()

Unnamed: 0,Form,Lemma,UPosTag,DepRel,Tag,Sent,next_lemma,next_next_lemma,next_next_next_lemma,next_next_next_next_lemma,...,pre_pre_pre_pre_lemma,pre_pos,pre_pre_pos,pre_pre_pre_pos,pre_pre_pre_pre_pos,pre_rel,pre_pre_rel,pre_pre_pre_rel,pre_pre_pre_pre_rel,that
6,view,view,NOUN,obj,1,0.0,that,",",if,the,...,share,DET,PRON,ADP,VERB,det,obl,case,root,1
30,end,end,NOUN,obj,1,1.0,that,we,all,want,...,implementation,DET,VERB,PART,NOUN,det,advcl,mark,conj,1
14,proposals,proposal,NOUN,obl,0,2.0,you,have,make,",",...,you,ADJ,DET,ADP,PRON,amod,det,case,obj,0
5,question,question,NOUN,obl,0,3.0,",",lately,many,commission,...,on,ADJ,DET,ADP,ADP,amod,det,case,case,0
17,decision,decision,NOUN,obl,0,4.0,make,because,of,what,...,to,ADP,ADV,VERB,PART,case,advmod,xcomp,mark,0


In [21]:
df.Tag.value_counts(normalize=True )

0    0.564955
1    0.435045
Name: Tag, dtype: float64

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
SEED = 55
import pandas as pd
import numpy as np
from scipy import sparse as sp
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn import model_selection
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn import metrics
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [23]:
cols = [col for col in df.columns if col not in ['Tag', 'Sent']]
X_train = df[cols]
y_train = df['Tag']

In [24]:
def build_dataset(df_train, df_test, cat_cols, num_cols=None, word_cols=None, embeddings=None):
    encoder = OneHotEncoder()
    encoder.fit(pd.concat([df_train[cat_cols], df_test[cat_cols]]))
    
    X_train = encoder.transform(df_train[cat_cols])
    X_test = encoder.transform(df_test[cat_cols])
        
    if num_cols is not None:
        X_train = sp.hstack(
            [
                X_train,
                np.hstack([df_train[col].values.reshape(-1, 1) for col in num_cols])
            ]
        )
        
        X_test = sp.hstack(
            [
                X_test,
                np.hstack([df_test[col].values.reshape(-1, 1) for col in num_cols])
            ]
        )
        
    if embeddings is not None and word_cols is not None:
        X_train = sp.hstack(
            [
                X_train,
                np.hstack([embeddings.transform(df_train[col].values) for col in word_cols])
            ]
        )
        
        X_test = sp.hstack(
            [
                X_test,
                np.hstack([embeddings.transform(df_test[col].values) for col in word_cols])
            ]
        )
        
    return X_train, X_test

# Test on REALEC data

In [25]:
test = pd.read_excel('test.xlsx', names=['sent', 'Tag'])

In [26]:
test.head()

Unnamed: 0,sent,Tag
0,"I believe, that when young people listen their...",notDON
1,"In conclusion, we can see that despite minor c...",notDON
2,It would be unfair not to mention that nowaday...,notDON
3,"To sum up, I'd like to say that it is a good i...",DON
4,"Finaly, in can be noticed that despite the fac...",DON


In [27]:
from parsing import ParserUDpipe

In [28]:
res_test = pd.DataFrame(data={'Id': [], 'Form': [], 'Lemma': [],
                         'UPosTag': [], 'XPosTag': [], 'Feats': [],
                         'Head': [], 'DepRel': [], 'Deps': [],
                         'Misc': [], 'Sent': []})
i = 0
for sent in list(test['sent']):
    parser = ParserUDpipe(sent)
    df = parser.conllu2df()
    df['Sent'] = [i] * len(df)
    res_test = pd.concat([res_test, df])
    i += 1

In [29]:
res_test = res_test.drop(['Id', 'XPosTag', 'Feats', 'Head', 'Deps', 'Misc'], axis=1)

In [30]:
res_test.head()

Unnamed: 0,Form,Lemma,UPosTag,DepRel,Sent
0,I,I,PRON,nsubj,0.0
1,believe,believe,VERB,root,0.0
2,",",",",PUNCT,punct,0.0
3,that,that,SCONJ,mark,0.0
4,when,when,SCONJ,mark,0.0


In [31]:
d = {
    "dons": [
        "thing", "fact", "point", "argument", "result", "dispute",
        "problem", "factor", "approach", "view", "feeling", "process",
        "theme", "attempt", "controversy", "statement", "task", "issue",
        "dream", "matter", "situation", "need", "reason", "solution",
        "possibility", "change", "debate", "sense", "method", "theory",
        "finding", "question", "idea", "concept", "opinion", "ideas", "things"
    ]
}

In [32]:
candidates = []
for s in set(res_test['Sent']):
    ind = False
    lemmas = list(res_test[res_test['Sent'] == s]['Lemma'])
    for l in lemmas:
        for don in d['dons']:
            try:
                ind = lemmas.index(don)
            except:
                continue
    cands = [0] * len(lemmas)
    if ind:
        cands[ind] = list(test.Tag)[int(s)]
    else:
        print(s)
    candidates += cands

35.0
71.0


In [33]:
res_test['Tag'] = candidates

In [34]:
res_test.head()

Unnamed: 0,Form,Lemma,UPosTag,DepRel,Sent,Tag
0,I,I,PRON,nsubj,0.0,0
1,believe,believe,VERB,root,0.0,0
2,",",",",PUNCT,punct,0.0,0
3,that,that,SCONJ,mark,0.0,0
4,when,when,SCONJ,mark,0.0,0


In [35]:
next_lemma = []
next_next_lemma = []
next_next_next_lemma = []
next_next_next_next_lemma = []
for s in set(res_test['Sent']):
    lemmas = res_test[res_test['Sent'] == s]['Lemma']
    next_lemma += get_right_context(lemmas, 1)
    next_next_lemma += get_right_context(lemmas, 2)
    next_next_next_lemma += get_right_context(lemmas, 3)
    next_next_next_next_lemma += get_right_context(lemmas, 4)
res_test['next_lemma'] = next_lemma
res_test['next_next_lemma'] = next_next_lemma
res_test['next_next_next_lemma'] = next_next_next_lemma
res_test['next_next_next_next_lemma'] = next_next_next_next_lemma

next_pos = []
next_next_pos = []
next_next_next_pos = []
next_next_next_next_pos = []
for s in set(res_test['Sent']):
    poss = res_test[res_test['Sent'] == s]['UPosTag']
    next_pos += get_right_context(poss, 1)
    next_next_pos += get_right_context(poss, 2)
    next_next_next_pos += get_right_context(poss, 3)
    next_next_next_next_pos += get_right_context(poss, 4)
res_test['next_pos'] = next_pos
res_test['next_next_pos'] = next_next_pos
res_test['next_next_next_pos'] = next_next_next_pos
res_test['next_next_next_next_pos'] = next_next_next_next_pos

next_rel = []
next_next_rel = []
next_next_next_rel = []
next_next_next_next_rel = []
for s in set(res_test['Sent']):
    rels = res_test[res_test['Sent'] == s]['DepRel']
    next_rel += get_right_context(rels, 1)
    next_next_rel += get_right_context(rels, 2)
    next_next_next_rel += get_right_context(rels, 3)
    next_next_next_next_rel += get_right_context(rels, 4)
res_test['next_rel'] = next_rel
res_test['next_next_rel'] = next_next_rel
res_test['next_next_next_rel'] = next_next_next_rel
res_test['next_next_next_next_rel'] = next_next_next_next_rel

In [36]:
pre_lemma = []
pre_pre_lemma = []
pre_pre_pre_lemma = []
pre_pre_pre_pre_lemma = []
for s in set(res_test['Sent']):
    lemmas = res_test[res_test['Sent'] == s]['Lemma']
    pre_lemma += get_left_context(lemmas, 1)
    pre_pre_lemma += get_left_context(lemmas, 2)
    pre_pre_pre_lemma += get_left_context(lemmas, 3)
    pre_pre_pre_pre_lemma += get_left_context(lemmas, 4)
res_test['pre_lemma'] = pre_lemma
res_test['pre_pre_lemma'] = pre_pre_lemma
res_test['pre_pre_pre_lemma'] = pre_pre_pre_lemma
res_test['pre_pre_pre_pre_lemma'] = pre_pre_pre_pre_lemma

pre_pos = []
pre_pre_pos = []
pre_pre_pre_pos = []
pre_pre_pre_pre_pos = []
for s in set(res_test['Sent']):
    poss = res_test[res_test['Sent'] == s]['UPosTag']
    pre_pos += get_left_context(poss, 1)
    pre_pre_pos += get_left_context(poss, 2)
    pre_pre_pre_pos += get_left_context(poss, 3)
    pre_pre_pre_pre_pos += get_left_context(poss, 4)
res_test['pre_pos'] = pre_pos
res_test['pre_pre_pos'] = pre_pre_pos
res_test['pre_pre_pre_pos'] = pre_pre_pre_pos
res_test['pre_pre_pre_pre_pos'] = pre_pre_pre_pre_pos

pre_rel = []
pre_pre_rel = []
pre_pre_pre_rel = []
pre_pre_pre_pre_rel = []
for s in set(res_test['Sent']):
    rels = res_test[res_test['Sent'] == s]['DepRel']
    pre_rel += get_left_context(rels, 1)
    pre_pre_rel += get_left_context(rels, 2)
    pre_pre_pre_rel += get_left_context(rels, 3)
    pre_pre_pre_pre_rel += get_left_context(rels, 4)
res_test['pre_rel'] = pre_rel
res_test['pre_pre_rel'] = pre_pre_rel
res_test['pre_pre_pre_rel'] = pre_pre_pre_rel
res_test['pre_pre_pre_pre_rel'] = pre_pre_pre_pre_rel

In [37]:
that = []
for s in set(res_test['Sent']):
    lemmas = list(res_test[res_test['Sent'] == s]['Lemma'])
    if 'that' in lemmas:
        that += [1] * len(lemmas)
    else:
        that += [0] * len(lemmas)
res_test['that'] = that

In [38]:
df_test = res_test[~(res_test['Tag'] == 0)]
df_test['Tag'] = df_test['Tag'].map({'DON': 1, 'notDON': 0})

In [39]:
df_test.head()

Unnamed: 0,Form,Lemma,UPosTag,DepRel,Sent,Tag,next_lemma,next_next_lemma,next_next_next_lemma,next_next_next_next_lemma,...,pre_pre_pre_pre_lemma,pre_pos,pre_pre_pos,pre_pre_pre_pos,pre_pre_pre_pre_pos,pre_rel,pre_pre_rel,pre_pre_pre_rel,pre_pre_pre_pre_rel,that
27,things,thing,NOUN,obl,0.0,0,.,END,END,END,...,world,ADJ,ADP,ADV,NOUN,amod,case,advmod,obj,1
9,changes,change,NOUN,obj,1.0,0,in,different,region,",",...,see,ADJ,VERB,SCONJ,VERB,amod,ccomp,mark,root,1
10,process,process,NOUN,obj,2.0,0,of,nurturing,child,and,...,mention,DET,VERB,PRON,NOUN,det,acl:relcl,nsubj,obl,1
14,idea,idea,NOUN,ccomp,3.0,1,to,start,learn,foreign,...,it,ADJ,DET,AUX,PRON,amod,det,cop,nsubj,1
24,situations,situation,NOUN,obj,4.0,1,in,future,.,END,...,will,ADJ,DET,VERB,AUX,amod,det,advcl,aux,1


In [64]:
df_test.shape

(238, 31)

In [40]:
cols = [col for col in df_test.columns if col not in ['Tag', 'Sent']]
X_test = df_test[cols]
y_test = df_test['Tag']

# Fit model and check accuracy

In [41]:
X_train, X_test = build_dataset(
    X_train,
    X_test,
    cat_cols=list(X_train.columns)[:-1],
    num_cols = ['that']
)

In [43]:
%%time

scores = cross_val_score(LogisticRegression(random_state=SEED), X_train, y_train, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

[0.8019802  0.69       0.78787879 0.68686869 0.70707071 0.83838384
 0.64646465 0.70707071 0.75757576 0.81818182]
0.7441475147514751
CPU times: user 132 ms, sys: 4.91 ms, total: 137 ms
Wall time: 144 ms


# Grid Search

In [44]:
logistic = LogisticRegression()
penalty = ['l1', 'l2']
C = np.logspace(0, 4, 10)
hyperparameters = dict(C=C, penalty=penalty)
clf = GridSearchCV(logistic, hyperparameters, cv=5, verbose=0)
clf.fit(X_train,y_train)
print("Best parameters:", clf.best_params_)

Best parameters: {'C': 1.0, 'penalty': 'l2'}


In [45]:
scores = cross_val_score(LogisticRegression(random_state=SEED, C=1.0, penalty='l2'), X_train, y_train, cv=10, scoring='accuracy')
print(scores)
print(scores.mean())

[0.8019802  0.69       0.78787879 0.68686869 0.70707071 0.83838384
 0.64646465 0.70707071 0.75757576 0.81818182]
0.7441475147514751


In [46]:
clf = LogisticRegression(random_state=SEED, C=1.0, penalty='l2').fit(X_train, y_train)
preds = clf.predict(X_test)

In [47]:
metrics.accuracy_score(y_test, preds)

0.726890756302521

# Naive_bayes

In [42]:
from sklearn.naive_bayes import GaussianNB
_X_train = X_train.toarray()
_X_test = X_test.toarray()
model = GaussianNB().fit(_X_train, y_train)
preds = model.predict(_X_test)
metrics.accuracy_score(y_test, preds)

0.5630252100840336

# RandomForestClassifier

In [53]:
model = RandomForestClassifier(n_estimators=100, max_depth=2, random_state=SEED).fit(_X_train, y_train)
preds = model.predict(_X_test)
metrics.accuracy_score(y_test, preds)

0.42436974789915966

# SVC

In [62]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC, LinearSVC

In [65]:
from sklearn.svm import SVC
model = SVC(kernel="linear", C=0.025, class_weight='balanced', decision_function_shape='ovo', random_state=SEED).fit(_X_train, y_train)
preds = model.predict(_X_test)
metrics.accuracy_score(y_test, preds)

0.7773109243697479

In [66]:
print(classification_report(y_test,preds))
print(confusion_matrix(y_test, preds))

              precision    recall  f1-score   support

           0       0.68      0.82      0.75        95
           1       0.86      0.75      0.80       143

   micro avg       0.78      0.78      0.78       238
   macro avg       0.77      0.78      0.77       238
weighted avg       0.79      0.78      0.78       238

[[ 78  17]
 [ 36 107]]


In [55]:
param_grid = {'C':[0.001, 0.01, 0.1, 1, 10, 100, 1000],'gamma':[1, 0.1, 0.001, 0.0001], 'kernel':['linear','rbf']}

In [56]:
grid = GridSearchCV(SVC(), param_grid,  cv=5, verbose=0)

In [57]:
grid.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000], 'gamma': [1, 0.1, 0.001, 0.0001], 'kernel': ['linear', 'rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [58]:
grid.best_params_

{'C': 0.1, 'gamma': 1, 'kernel': 'linear'}

In [53]:
model = SVC(C=10, gamma=0.1, kernel='rbf', random_state=SEED).fit(_X_train, y_train)
preds = model.predict(_X_test)

In [54]:
metrics.accuracy_score(y_test, preds)

0.6764705882352942

In [74]:
print(classification_report(y_test,predic))
print(confusion_matrix(y_test, predic))

              precision    recall  f1-score   support

           0       0.56      0.89      0.69        95
           1       0.88      0.53      0.66       143

   micro avg       0.68      0.68      0.68       238
   macro avg       0.72      0.71      0.68       238
weighted avg       0.75      0.68      0.67       238

[[85 10]
 [67 76]]


# DecisionTreeClassifier

In [55]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth=5, random_state=SEED).fit(_X_train, y_train)
preds = model.predict(_X_test)
metrics.accuracy_score(y_test, preds)

0.6176470588235294

# AdaBoostClassifier

In [57]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
model = AdaBoostClassifier().fit(_X_train, y_train)
preds = model.predict(_X_test)
metrics.accuracy_score(y_test, preds)

0.7016806722689075

# На выборке

In [68]:
with open('/Users/irene/Desktop/Диплом/code/result_criteria/result_criteria.csv', 'r') as file:
    f = file.read()
paths = ['/Users/irene/Desktop/Диплом/new_data/'+x+'.txt' for x in f.split('\n')]

In [69]:
import re
paths = [re.sub(';.+', '', x)+'.txt' for x in paths[1:]]

In [70]:
len(paths)

258

In [85]:
paths[0]

'/Users/irene/Desktop/Диплом/new_data/1.txt'

In [88]:
import nltk

In [136]:
def into_data_for_model(sent_text, X_train):
    try:
        res_test = pd.DataFrame(data={'Id': [], 'Form': [], 'Lemma': [],
                                      'UPosTag': [], 'XPosTag': [], 'Feats': [],
                                      'Head': [], 'DepRel': [], 'Deps': [],
                                      'Misc': [], 'Sent': []})
        i = 0
        for sent in list(sent_text):
            parser = ParserUDpipe(sent)
            df = parser.conllu2df()
            df['Sent'] = [i] * len(df)
            res_test = pd.concat([res_test, df])
            i += 1
        res_test = res_test.drop(['Id', 'XPosTag', 'Feats', 'Head', 'Deps', 'Misc'], axis=1)
        d = {
        "dons": [
            "thing", "fact", "point", "argument", "result", "dispute",
            "problem", "factor", "approach", "view", "feeling", "process",
            "theme", "attempt", "controversy", "statement", "task", "issue",
            "dream", "matter", "situation", "need", "reason", "solution",
            "possibility", "change", "debate", "sense", "method", "theory",
            "finding", "question", "idea", "concept", "opinion", "ideas", "things"
        ]
        }
        candidates = []
        for s in set(res_test['Sent']):
            ind = False
            lemmas = list(res_test[res_test['Sent'] == s]['Lemma'])
            for l in lemmas:
                for don in d['dons']:
                    try:
                        ind = lemmas.index(don)
                    except:
                        continue
            cands = [0] * len(lemmas)
            if ind:
                cands[ind] = list(test.Tag)[int(s)]
            else:
                pass
            candidates += cands
        res_test['Tag'] = candidates
        next_lemma = []
        next_next_lemma = []
        next_next_next_lemma = []
        next_next_next_next_lemma = []
        for s in set(res_test['Sent']):
            lemmas = res_test[res_test['Sent'] == s]['Lemma']
            next_lemma += get_right_context(lemmas, 1)
            next_next_lemma += get_right_context(lemmas, 2)
            next_next_next_lemma += get_right_context(lemmas, 3)
            next_next_next_next_lemma += get_right_context(lemmas, 4)
        res_test['next_lemma'] = next_lemma
        res_test['next_next_lemma'] = next_next_lemma
        res_test['next_next_next_lemma'] = next_next_next_lemma
        res_test['next_next_next_next_lemma'] = next_next_next_next_lemma

        next_pos = []
        next_next_pos = []
        next_next_next_pos = []
        next_next_next_next_pos = []
        for s in set(res_test['Sent']):
            poss = res_test[res_test['Sent'] == s]['UPosTag']
            next_pos += get_right_context(poss, 1)
            next_next_pos += get_right_context(poss, 2)
            next_next_next_pos += get_right_context(poss, 3)
            next_next_next_next_pos += get_right_context(poss, 4)
        res_test['next_pos'] = next_pos
        res_test['next_next_pos'] = next_next_pos
        res_test['next_next_next_pos'] = next_next_next_pos
        res_test['next_next_next_next_pos'] = next_next_next_next_pos

        next_rel = []
        next_next_rel = []
        next_next_next_rel = []
        next_next_next_next_rel = []
        for s in set(res_test['Sent']):
            rels = res_test[res_test['Sent'] == s]['DepRel']
            next_rel += get_right_context(rels, 1)
            next_next_rel += get_right_context(rels, 2)
            next_next_next_rel += get_right_context(rels, 3)
            next_next_next_next_rel += get_right_context(rels, 4)
        res_test['next_rel'] = next_rel
        res_test['next_next_rel'] = next_next_rel
        res_test['next_next_next_rel'] = next_next_next_rel
        res_test['next_next_next_next_rel'] = next_next_next_next_rel
        pre_lemma = []
        pre_pre_lemma = []
        pre_pre_pre_lemma = []
        pre_pre_pre_pre_lemma = []
        for s in set(res_test['Sent']):
            lemmas = res_test[res_test['Sent'] == s]['Lemma']
            pre_lemma += get_left_context(lemmas, 1)
            pre_pre_lemma += get_left_context(lemmas, 2)
            pre_pre_pre_lemma += get_left_context(lemmas, 3)
            pre_pre_pre_pre_lemma += get_left_context(lemmas, 4)
        res_test['pre_lemma'] = pre_lemma
        res_test['pre_pre_lemma'] = pre_pre_lemma
        res_test['pre_pre_pre_lemma'] = pre_pre_pre_lemma
        res_test['pre_pre_pre_pre_lemma'] = pre_pre_pre_pre_lemma

        pre_pos = []
        pre_pre_pos = []
        pre_pre_pre_pos = []
        pre_pre_pre_pre_pos = []
        for s in set(res_test['Sent']):
            poss = res_test[res_test['Sent'] == s]['UPosTag']
            pre_pos += get_left_context(poss, 1)
            pre_pre_pos += get_left_context(poss, 2)
            pre_pre_pre_pos += get_left_context(poss, 3)
            pre_pre_pre_pre_pos += get_left_context(poss, 4)
        res_test['pre_pos'] = pre_pos
        res_test['pre_pre_pos'] = pre_pre_pos
        res_test['pre_pre_pre_pos'] = pre_pre_pre_pos
        res_test['pre_pre_pre_pre_pos'] = pre_pre_pre_pre_pos

        pre_rel = []
        pre_pre_rel = []
        pre_pre_pre_rel = []
        pre_pre_pre_pre_rel = []
        for s in set(res_test['Sent']):
            rels = res_test[res_test['Sent'] == s]['DepRel']
            pre_rel += get_left_context(rels, 1)
            pre_pre_rel += get_left_context(rels, 2)
            pre_pre_pre_rel += get_left_context(rels, 3)
            pre_pre_pre_pre_rel += get_left_context(rels, 4)
        res_test['pre_rel'] = pre_rel
        res_test['pre_pre_rel'] = pre_pre_rel
        res_test['pre_pre_pre_rel'] = pre_pre_pre_rel
        res_test['pre_pre_pre_pre_rel'] = pre_pre_pre_pre_rel
        that = []
        for s in set(res_test['Sent']):
            lemmas = list(res_test[res_test['Sent'] == s]['Lemma'])
            if 'that' in lemmas:
                that += [1] * len(lemmas)
            else:
                that += [0] * len(lemmas)
        res_test['that'] = that
        df_test = res_test[~(res_test['Tag'] == 0)]
        df_test['Tag'] = df_test['Tag'].map({'DON': 1, 'notDON': 0})
        cols = [col for col in df_test.columns if col not in ['Tag', 'Sent']]
        X_test = df_test[cols]
        y_test = df_test['Tag']
        X_train, X_test = build_dataset(
            X_train,
            X_test,
            cat_cols=list(X_train.columns)[:-1],
            num_cols = ['that']
            )
        _X_train = X_train.toarray()
        _X_test = X_test.toarray()
        model = SVC(kernel="linear", C=0.025, random_state=SEED).fit(_X_train, y_train)
        preds = model.predict(_X_test)
        return list(preds).count(1), df_test
    except:
        return None, None

In [137]:
from tqdm import tqdm_notebook as tqdm

In [138]:
cc = []
for p in tqdm(paths):
    with open(p, 'r') as file:
        text = file.read()
        sent_text = nltk.sent_tokenize(text)
        c, df_test = into_data_for_model(sent_text, X_train)
        cc.append(c)

HBox(children=(IntProgress(value=0, max=258), HTML(value='')))

In [67]:
len(cc)

NameError: name 'cc' is not defined

In [141]:
for a in cc:
    print(a)

0
0
3
1
0
0
1
0
0
0
0
1
2
0
2
0
0
0
4
0
3
0
0
3
None
1
1
0
0
0
3
None
2
0
3
0
4
0
None
0
3
0
0
0
0
0
0
1
0
2
3
1
2
0
None
1
1
0
1
0
4
0
2
3
1
3
5
1
1
None
1
0
None
None
None
0
0
0
1
0
1
0
0
None
None
None
None
0
0
0
1
0
1
0
0
0
0
0
0
0
None
0
4
5
3
5
3
2
2
2
1
1
2
1
1
0
1
1
1
1
0
0
0
0
None
0
1
1
0
4
1
1
1
1
0
None
1
0
2
None
2
0
1
0
0
0
0
0
0
0
4
None
2
0
1
0
1
0
2
0
1
None
0
0
0
0
None
1
1
0
0
1
None
None
0
2
5
2
3
1
2
1
1
1
1
None
None
2
5
1
1
1
1
2
3
3
1
2
1
2
1
0
0
0
None
0
None
0
None
None
0
0
0
1
0
1
0
0
0
None
0
2
1
1
1
2
3
0
1
1
1
0
2
2
1
2
2
0
1
1
2
None
None
None
None
0
0
None
None
0
0
1
None
0
None
0
0
1


In [153]:
p = pd.read_csv('/Users/irene/Desktop/Диплом/code/result_criteria/DONs.csv', delimiter=';')

In [154]:
p.head()

Unnamed: 0,Essay,Class,Type,Number of DONs
0,1,aver,1,0
1,2,aver,1,0
2,3,aver,1,3
3,4,worst,2,1
4,5,worst,1,0


In [155]:
best = p[p['Class'] == 'best']
worst = p[p['Class'] == 'worst']

In [156]:
from scipy import stats

In [158]:
t, p = stats.ttest_ind(best['Number of DONs'],worst['Number of DONs'])
print('Критерий:', 'Number of DONs')
print("t = " + str(round(t, 5)))
print("p = " + str(round(2*p, 5)))
if 2*p < 0.05:
    print("Статистически значимая разница по критерию '%s'" % 'Number of DONs')

Критерий: Number of DONs
t = 2.34642
p = 0.04189
Статистически значимая разница по критерию 'Number of DONs'
