In [4]:
from gensim.models import KeyedVectors as KV
from gensim.models import Word2Vec as WV
from gensim.models import doc2vec
import warnings
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import sys
import os
import pymorphy2
from collections import namedtuple
import numpy as np
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from nltk import tokenize
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier 

  from numpy.core.umath_tests import inner1d


In [5]:
morph = pymorphy2.MorphAnalyzer()

warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

model = KV.load_word2vec_format("/Users/nadya73/Downloads/ruscorpora_upos_skipgram_600_10_2017.bin.gz", binary=True)

modelW = KV.load_word2vec_format("/Users/nadya73/Downloads/182/model.bin", binary=True)

In [6]:
model_CBOW = KV.load_word2vec_format("/Users/nadya73/Downloads/ruwikiruscorpora_upos_cbow_300_20_2017.bin.gz", binary=True)

In [7]:
model_SG = KV.load_word2vec_format("/Users/nadya73/Downloads/ruwikiruscorpora_upos_skipgram_300_2_2018.vec.gz")

In [8]:
import xml.etree.ElementTree as ET

tree_concepts = ET.parse('/Users/nadya73/Documents/university/diplom/diplom/ru_tez/concepts.xml')
root_concepts = tree_concepts.getroot()

concepts = {}

i = 0
for elem in root_concepts:
    name = ""
    for field in root_concepts.getchildren()[i].getchildren():
        if field.tag == "name":
            name = field.text
            break
    concepts[elem.attrib['id']] = name
    i += 1

tree_relations = ET.parse('/Users/nadya73/Documents/university/diplom/diplom/ru_tez/relations.xml')
root_relations = tree_relations.getroot()
rel_file = open('/Users/nadya73/Documents/university/diplom/diplom/ru_tez/relations.xml')
#for line in rel_file:
    #print(line)
relations_hyp_pos = set()
relations_hyp_neg = set()

i = 0
for elem in root_relations:
    if elem.attrib['name'] == 'ВЫШЕ':
        if len(concepts[elem.attrib['from']].split(" ")) is 1 and len(concepts[elem.attrib['to']].split(" ")) is 1:
            relations_hyp_pos.add((concepts[elem.attrib['from']].lower(), concepts[elem.attrib['to']].lower()))
    if elem.attrib['name'] == 'НИЖЕ':
        if len(concepts[elem.attrib['from']].split(" ")) is 1 and len(concepts[elem.attrib['to']].split(" ")) is 1:
            relations_hyp_pos.add((concepts[elem.attrib['to']].lower(), concepts[elem.attrib['from']].lower()))

    if elem.attrib['name'] == 'ЧАСТЬ' or elem.attrib['name'] == 'АСЦ1' or elem.attrib['name'] == 'АСЦ2':
        if len(concepts[elem.attrib['from']].split(" ")) is 1 and len(concepts[elem.attrib['to']].split(" ")) is 1:
            relations_hyp_neg.add((concepts[elem.attrib['from']].lower(), concepts[elem.attrib['to']].lower()))


In [9]:
def get_pairs(file):
    pairs = []
    for line in file:
        data = line.rstrip("\n").split('\t')
        pair = (morph.parse(data[0])[0].normal_form, morph.parse(data[1])[0].normal_form)
        if len(data[0].split(' ')) == 1 and len(data[1].split(' ')) == 1 and \
            morph.tag(pair[0])[0].POS == 'NOUN' and morph.tag(pair[1])[0].POS == 'NOUN':
            pairs.append((pair[0], pair[1]))
    return pairs

def get_hyperonims():
    ok_hyperonims = open("/Users/nadya73/Documents/university/diplom/ok_hyperonims_50", "r")
    return get_pairs(ok_hyperonims)


def get_not_hyperonims():
    not_hyperonims = open("/Users/nadya73/Documents/university/diplom/not_hyperonims_2", "r")
    return get_pairs(not_hyperonims)

hyperonims = get_hyperonims() #хорошие пары, выражающие выбранное отношение
not_hyperonims = get_not_hyperonims()  #плохие пары, не выражающие данное отношение

In [10]:
def get_features_concat(pairs, model):
    vectors = []
    for pair in pairs:
        try:
            w2v_1 = model.get_vector(pair[0] + '_NOUN')
            w2v_2 = model.get_vector(pair[1] + '_NOUN')
            res = np.concatenate((w2v_1, w2v_2))
            vectors.append(res)
        except KeyError:
            continue
    return vectors

In [11]:
def get_features_diff(pairs, model):
    vectors = []
    for pair in pairs:
        try:
            w2v_1 = model.get_vector(pair[0] + '_NOUN')
            w2v_2 = model.get_vector(pair[1] + '_NOUN')
            res = w2v_1 - w2v_2
            vectors.append(res)
        except KeyError:
            continue
    return vectors

In [12]:
def get_features_sum(pairs, model):
    vectors = []
    for pair in pairs:
        try:
            w2v_1 = model.get_vector(pair[0] + '_NOUN')
            w2v_2 = model.get_vector(pair[1] + '_NOUN')
            res = (w2v_1 + w2v_2)
            vectors.append(res)
        except KeyError:
            continue
    return vectors

In [36]:
def get_features_pw(pairs, model):
    vectors = []
    for pair in pairs:
        try:
            w2v_1 = model.get_vector(pair[0] + '_NOUN')
            w2v_2 = model.get_vector(pair[1] + '_NOUN')
            res = np.array(w2v_1) * np.array(w2v_2) 
            vectors.append(res)
        except KeyError:
            continue
    return vectors

In [13]:
len(relations_hyp_pos)

4217

In [14]:
len(relations_hyp_neg)

4041

In [15]:
len(hyperonims)

3499

In [16]:
len(not_hyperonims)

367064

In [17]:
def get_labels(pos_len, neg_len):
    ok_labels = np.empty(pos_len)
    not_labels = np.empty(neg_len)
    ok_labels.fill(1)
    not_labels.fill(0)
    return np.append(ok_labels, not_labels)

In [24]:
def learn_classifiers(model):
    features_pos = get_features_diff(relations_hyp_pos, model)
    features_pos = np.concatenate((features_pos, get_features_diff(hyperonims, model)))

    features_neg = get_features_diff(relations_hyp_neg, model)
    features_neg = np.concatenate((features_neg, get_features_diff(not_hyperonims[:len(hyperonims)], model)))
    
    print(len(features_pos))
    print(len(features_neg))
    
    COUNT_POS_TRAIN = int(len(features_pos) * 0.8)
    COUNT_POS_TEST = len(features_pos) - COUNT_POS_TRAIN

    COUNT_NEG_TRAIN = int(len(features_neg) * 0.8)
    COUNT_NEG_TEST = len(features_neg) - COUNT_NEG_TRAIN
    
    features_pos_train = features_pos[:COUNT_POS_TRAIN]
    features_pos_test = features_pos[COUNT_POS_TRAIN:]

    features_neg_train = features_neg[:COUNT_NEG_TRAIN]
    features_neg_test = features_neg[COUNT_NEG_TRAIN:]
    
    features_train = np.concatenate((features_pos_train, features_neg_train))
    labels_train = get_labels(COUNT_POS_TRAIN, COUNT_NEG_TRAIN)

    features_test = np.concatenate((features_pos_test, features_neg_test))
    labels_test = get_labels(COUNT_POS_TEST, COUNT_NEG_TEST)
    
    classifiers = {'NB': GaussianNB(),
          'KNN3': KNeighborsClassifier(n_neighbors=3),
          'KNN4': KNeighborsClassifier(n_neighbors=4),
          'KNN5': KNeighborsClassifier(n_neighbors=5),
          'KNN6': KNeighborsClassifier(n_neighbors=6),
          'KNN7': KNeighborsClassifier(n_neighbors=7),
          'DT': DecisionTreeClassifier(),
          'LR': LogisticRegression(),
          'RF': RandomForestClassifier(),
          'RF1000': RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0),
          'RF100': RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
          'RF2000': RandomForestClassifier(n_estimators=2000, max_depth=6, random_state=0),
          'RF500': RandomForestClassifier(n_estimators=500, max_depth=6, random_state=0),
          'SVM': SVC()}
    
    for classifier in classifiers.keys():
        print(classifier)
        classifiers[classifier].fit(features_train, labels_train)
        predicted = classifiers[classifier].predict(features_test)
        print(metrics.classification_report(labels_test, predicted))
        print(metrics.confusion_matrix(labels_test, predicted))
    
    print("----------------")

In [109]:
learn_classifiers(model)

6183
5774
NB
             precision    recall  f1-score   support

        0.0       0.72      0.66      0.69      1155
        1.0       0.70      0.76      0.73      1237

avg / total       0.71      0.71      0.71      2392

[[758 397]
 [300 937]]
KNN3
             precision    recall  f1-score   support

        0.0       0.75      0.71      0.73      1155
        1.0       0.74      0.78      0.76      1237

avg / total       0.75      0.75      0.75      2392

[[819 336]
 [266 971]]
KNN4
             precision    recall  f1-score   support

        0.0       0.72      0.79      0.75      1155
        1.0       0.78      0.71      0.74      1237

avg / total       0.75      0.75      0.75      2392

[[909 246]
 [359 878]]
KNN5
             precision    recall  f1-score   support

        0.0       0.78      0.54      0.64      1155
        1.0       0.67      0.86      0.75      1237

avg / total       0.72      0.71      0.70      2392

[[ 626  529]
 [ 175 1062]]
KNN6
           

In [110]:
learn_classifiers(modelW)

5671
5233
NB
             precision    recall  f1-score   support

        0.0       0.77      0.58      0.66      1047
        1.0       0.68      0.84      0.75      1135

avg / total       0.73      0.72      0.71      2182

[[608 439]
 [182 953]]
KNN3
             precision    recall  f1-score   support

        0.0       0.79      0.57      0.66      1047
        1.0       0.68      0.86      0.76      1135

avg / total       0.74      0.72      0.71      2182

[[598 449]
 [159 976]]
KNN4
             precision    recall  f1-score   support

        0.0       0.74      0.71      0.73      1047
        1.0       0.75      0.77      0.76      1135

avg / total       0.74      0.74      0.74      2182

[[748 299]
 [259 876]]
KNN5
             precision    recall  f1-score   support

        0.0       0.83      0.31      0.45      1047
        1.0       0.60      0.94      0.73      1135

avg / total       0.71      0.64      0.59      2182

[[ 320  727]
 [  64 1071]]
KNN6
           

In [111]:
learn_classifiers(model_CBOW)

6491
6022
NB
             precision    recall  f1-score   support

        0.0       0.67      0.52      0.59      1205
        1.0       0.63      0.76      0.69      1299

avg / total       0.65      0.65      0.64      2504

[[627 578]
 [308 991]]
KNN3
             precision    recall  f1-score   support

        0.0       0.75      0.65      0.70      1205
        1.0       0.71      0.80      0.75      1299

avg / total       0.73      0.73      0.73      2504

[[ 781  424]
 [ 256 1043]]
KNN4
             precision    recall  f1-score   support

        0.0       0.71      0.73      0.72      1205
        1.0       0.74      0.72      0.73      1299

avg / total       0.72      0.72      0.72      2504

[[883 322]
 [369 930]]
KNN5
             precision    recall  f1-score   support

        0.0       0.81      0.46      0.59      1205
        1.0       0.64      0.90      0.75      1299

avg / total       0.72      0.69      0.67      2504

[[ 555  650]
 [ 130 1169]]
KNN6
       

С разностью:

In [25]:
learn_classifiers(model_CBOW)

6491
6022
NB
             precision    recall  f1-score   support

        0.0       0.67      0.52      0.59      1205
        1.0       0.63      0.76      0.69      1299

avg / total       0.65      0.65      0.64      2504

[[627 578]
 [308 991]]
KNN3
             precision    recall  f1-score   support

        0.0       0.75      0.65      0.70      1205
        1.0       0.71      0.80      0.75      1299

avg / total       0.73      0.73      0.73      2504

[[ 781  424]
 [ 255 1044]]
KNN4
             precision    recall  f1-score   support

        0.0       0.71      0.73      0.72      1205
        1.0       0.74      0.72      0.73      1299

avg / total       0.72      0.72      0.72      2504

[[883 322]
 [369 930]]
KNN5
             precision    recall  f1-score   support

        0.0       0.81      0.46      0.59      1205
        1.0       0.64      0.90      0.75      1299

avg / total       0.72      0.69      0.67      2504

[[ 555  650]
 [ 130 1169]]
KNN6
       

In [27]:
learn_classifiers(model_SG)

6174
5934
NB
             precision    recall  f1-score   support

        0.0       0.72      0.65      0.68      1187
        1.0       0.69      0.75      0.72      1235

avg / total       0.71      0.70      0.70      2422

[[776 411]
 [305 930]]
KNN3
             precision    recall  f1-score   support

        0.0       0.80      0.69      0.74      1187
        1.0       0.73      0.83      0.78      1235

avg / total       0.76      0.76      0.76      2422

[[ 815  372]
 [ 210 1025]]
KNN4
             precision    recall  f1-score   support

        0.0       0.74      0.75      0.75      1187
        1.0       0.76      0.75      0.75      1235

avg / total       0.75      0.75      0.75      2422

[[892 295]
 [311 924]]
KNN5
             precision    recall  f1-score   support

        0.0       0.78      0.47      0.58      1187
        1.0       0.63      0.87      0.73      1235

avg / total       0.70      0.67      0.66      2422

[[ 553  634]
 [ 159 1076]]
KNN6
       

In [112]:
learn_classifiers(model_SG)

6174
5934
NB
             precision    recall  f1-score   support

        0.0       0.72      0.65      0.68      1187
        1.0       0.69      0.75      0.72      1235

avg / total       0.71      0.70      0.70      2422

[[776 411]
 [305 930]]
KNN3
             precision    recall  f1-score   support

        0.0       0.80      0.69      0.74      1187
        1.0       0.73      0.83      0.78      1235

avg / total       0.76      0.76      0.76      2422

[[ 815  372]
 [ 210 1025]]
KNN4
             precision    recall  f1-score   support

        0.0       0.74      0.75      0.75      1187
        1.0       0.76      0.75      0.75      1235

avg / total       0.75      0.75      0.75      2422

[[893 294]
 [311 924]]
KNN5
             precision    recall  f1-score   support

        0.0       0.78      0.47      0.59      1187
        1.0       0.63      0.87      0.73      1235

avg / total       0.70      0.67      0.66      2422

[[ 557  630]
 [ 159 1076]]
KNN6
       

In [13]:
a = [[1,2], [2,3]]

a = np.concatenate((a, [[1,2], [2,3]]))

In [14]:
a

array([[1, 2],
       [2, 3],
       [1, 2],
       [2, 3]])

In [114]:
def learn_classifiers_concat(model):
    features_pos = get_features_concat(relations_hyp_pos, model)
    features_pos = np.concatenate((features_pos, get_features_concat(hyperonims, model)))

    features_neg = get_features_concat(relations_hyp_neg, model)
    features_neg = np.concatenate((features_neg, get_features_concat(not_hyperonims[:len(hyperonims)], model)))
    
    print(len(features_pos))
    print(len(features_neg))
    
    COUNT_POS_TRAIN = int(len(features_pos) * 0.8)
    COUNT_POS_TEST = len(features_pos) - COUNT_POS_TRAIN

    COUNT_NEG_TRAIN = int(len(features_neg) * 0.8)
    COUNT_NEG_TEST = len(features_neg) - COUNT_NEG_TRAIN
    
    features_pos_train = features_pos[:COUNT_POS_TRAIN]
    features_pos_test = features_pos[COUNT_POS_TRAIN:]

    features_neg_train = features_neg[:COUNT_NEG_TRAIN]
    features_neg_test = features_neg[COUNT_NEG_TRAIN:]
    
    features_train = np.concatenate((features_pos_train, features_neg_train))
    labels_train = get_labels(COUNT_POS_TRAIN, COUNT_NEG_TRAIN)

    features_test = np.concatenate((features_pos_test, features_neg_test))
    labels_test = get_labels(COUNT_POS_TEST, COUNT_NEG_TEST)
    
    classifiers = {'NB': GaussianNB(),
          'KNN3': KNeighborsClassifier(n_neighbors=3),
          'KNN4': KNeighborsClassifier(n_neighbors=4),
          'KNN5': KNeighborsClassifier(n_neighbors=5),
          'KNN6': KNeighborsClassifier(n_neighbors=6),
          'KNN7': KNeighborsClassifier(n_neighbors=7),
          'DT': DecisionTreeClassifier(),
          'LR': LogisticRegression(),
          'RF': RandomForestClassifier()}
    
    for classifier in classifiers.keys():
        print(classifier)
        classifiers[classifier].fit(features_train, labels_train)
        predicted = classifiers[classifier].predict(features_test)
        print(metrics.classification_report(labels_test, predicted))
        print(metrics.confusion_matrix(labels_test, predicted))
    
    print("----------------")

In [115]:
learn_classifiers_concat(modelW)

NameError: name 'w2_1' is not defined

СУММА И КОНКАТЕНАЦИЯ

In [26]:
def learn_classifiers_sum(model):
    features_pos = get_features_sum(relations_hyp_pos, model)
    features_pos = np.concatenate((features_pos, get_features_sum(hyperonims, model)))

    features_neg = get_features_sum(relations_hyp_neg, model)
    features_neg = np.concatenate((features_neg, get_features_sum(not_hyperonims[:len(hyperonims)], model)))
    
    print(len(features_pos))
    print(len(features_neg))
    
    COUNT_POS_TRAIN = int(len(features_pos) * 0.8)
    COUNT_POS_TEST = len(features_pos) - COUNT_POS_TRAIN

    COUNT_NEG_TRAIN = int(len(features_neg) * 0.8)
    COUNT_NEG_TEST = len(features_neg) - COUNT_NEG_TRAIN
    
    features_pos_train = features_pos[:COUNT_POS_TRAIN]
    features_pos_test = features_pos[COUNT_POS_TRAIN:]

    features_neg_train = features_neg[:COUNT_NEG_TRAIN]
    features_neg_test = features_neg[COUNT_NEG_TRAIN:]
    
    features_train = np.concatenate((features_pos_train, features_neg_train))
    labels_train = get_labels(COUNT_POS_TRAIN, COUNT_NEG_TRAIN)

    features_test = np.concatenate((features_pos_test, features_neg_test))
    labels_test = get_labels(COUNT_POS_TEST, COUNT_NEG_TEST)
    
    classifiers = {'NB': GaussianNB(),
          'KNN3': KNeighborsClassifier(n_neighbors=3),
          'KNN4': KNeighborsClassifier(n_neighbors=4),
          'KNN5': KNeighborsClassifier(n_neighbors=5),
          'KNN6': KNeighborsClassifier(n_neighbors=6),
          'KNN7': KNeighborsClassifier(n_neighbors=7),
          'KNN15': KNeighborsClassifier(n_neighbors=15),
          'DT': DecisionTreeClassifier(),
          'LR': LogisticRegression(),
          'RF': RandomForestClassifier(),
          'RF1000': RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0),
          'RF100': RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
          'RF2000': RandomForestClassifier(n_estimators=2000, max_depth=6, random_state=0),
          'RF500': RandomForestClassifier(n_estimators=500, max_depth=6, random_state=0),
          'SVM': SVC()}
    
    for classifier in classifiers.keys():
        print(classifier)
        classifiers[classifier].fit(features_train, labels_train)
        predicted = classifiers[classifier].predict(features_test)
        print(metrics.classification_report(labels_test, predicted))
        print(metrics.confusion_matrix(labels_test, predicted))
    
    print("----------------")

In [21]:
def learn_classifiers_concat(model):
    features_pos = get_features_concat(relations_hyp_pos, model)
    features_pos = np.concatenate((features_pos, get_features_concat(hyperonims, model)))

    features_neg = get_features_concat(relations_hyp_neg, model)
    features_neg = np.concatenate((features_neg, get_features_concat(not_hyperonims[:len(hyperonims)], model)))
    
    print(len(features_pos))
    print(len(features_neg))
    
    COUNT_POS_TRAIN = int(len(features_pos) * 0.8)
    COUNT_POS_TEST = len(features_pos) - COUNT_POS_TRAIN

    COUNT_NEG_TRAIN = int(len(features_neg) * 0.8)
    COUNT_NEG_TEST = len(features_neg) - COUNT_NEG_TRAIN
    
    features_pos_train = features_pos[:COUNT_POS_TRAIN]
    features_pos_test = features_pos[COUNT_POS_TRAIN:]

    features_neg_train = features_neg[:COUNT_NEG_TRAIN]
    features_neg_test = features_neg[COUNT_NEG_TRAIN:]
    
    features_train = np.concatenate((features_pos_train, features_neg_train))
    labels_train = get_labels(COUNT_POS_TRAIN, COUNT_NEG_TRAIN)

    features_test = np.concatenate((features_pos_test, features_neg_test))
    labels_test = get_labels(COUNT_POS_TEST, COUNT_NEG_TEST)
    
    classifiers = {'NB': GaussianNB(),
          'KNN3': KNeighborsClassifier(n_neighbors=3),
          'KNN4': KNeighborsClassifier(n_neighbors=4),
          'KNN5': KNeighborsClassifier(n_neighbors=5),
          'KNN6': KNeighborsClassifier(n_neighbors=6),
          'KNN7': KNeighborsClassifier(n_neighbors=7),
          'KNN21': KNeighborsClassifier(n_neighbors=21),
          'KNN35': KNeighborsClassifier(n_neighbors=35),
          'KNN45': KNeighborsClassifier(n_neighbors=45),
          'DT': DecisionTreeClassifier(),
          'LR': LogisticRegression(),
          'RF': RandomForestClassifier(),
          'RF1000': RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0),
          'RF100': RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
          'RF2000': RandomForestClassifier(n_estimators=2000, max_depth=6, random_state=0),
          'RF500': RandomForestClassifier(n_estimators=500, max_depth=6, random_state=0),
          'SVM': SVC()}
    
    for classifier in classifiers.keys():
        print(classifier)
        classifiers[classifier].fit(features_train, labels_train)
        predicted = classifiers[classifier].predict(features_test)
        print(metrics.classification_report(labels_test, predicted))
        print(metrics.confusion_matrix(labels_test, predicted))
        
    print("----------------")

In [32]:
def learn_classifiers_PW(model):
    features_pos = get_features_pw(relations_hyp_pos, model)
    features_pos = np.concatenate((features_pos, get_features_pw(hyperonims, model)))

    features_neg = get_features_pw(relations_hyp_neg, model)
    features_neg = np.concatenate((features_neg, get_features_pw(not_hyperonims[:len(hyperonims)], model)))
    
    print(len(features_pos))
    print(len(features_neg))
    
    COUNT_POS_TRAIN = int(len(features_pos) * 0.8)
    COUNT_POS_TEST = len(features_pos) - COUNT_POS_TRAIN

    COUNT_NEG_TRAIN = int(len(features_neg) * 0.8)
    COUNT_NEG_TEST = len(features_neg) - COUNT_NEG_TRAIN
    
    features_pos_train = features_pos[:COUNT_POS_TRAIN]
    features_pos_test = features_pos[COUNT_POS_TRAIN:]

    features_neg_train = features_neg[:COUNT_NEG_TRAIN]
    features_neg_test = features_neg[COUNT_NEG_TRAIN:]
    
    features_train = np.concatenate((features_pos_train, features_neg_train))
    labels_train = get_labels(COUNT_POS_TRAIN, COUNT_NEG_TRAIN)

    features_test = np.concatenate((features_pos_test, features_neg_test))
    labels_test = get_labels(COUNT_POS_TEST, COUNT_NEG_TEST)
    
    classifiers = {'NB': GaussianNB(),
          'KNN3': KNeighborsClassifier(n_neighbors=3),
          'KNN4': KNeighborsClassifier(n_neighbors=4),
          'KNN5': KNeighborsClassifier(n_neighbors=5),
          'KNN6': KNeighborsClassifier(n_neighbors=6),
          'KNN7': KNeighborsClassifier(n_neighbors=7),
          'KNN21': KNeighborsClassifier(n_neighbors=21),
          'KNN35': KNeighborsClassifier(n_neighbors=35),
          'KNN45': KNeighborsClassifier(n_neighbors=45),
          'DT': DecisionTreeClassifier(),
          'LR': LogisticRegression(),
          'RF': RandomForestClassifier(),
          'RF1000': RandomForestClassifier(n_estimators=1000, max_depth=6, random_state=0),
          'RF100': RandomForestClassifier(n_estimators=100, max_depth=6, random_state=0),
          'RF2000': RandomForestClassifier(n_estimators=2000, max_depth=6, random_state=0),
          'RF500': RandomForestClassifier(n_estimators=500, max_depth=6, random_state=0),
          'SVM': SVC()}
    
    for classifier in classifiers.keys():
        print(classifier)
        classifiers[classifier].fit(features_train, labels_train)
        predicted = classifiers[classifier].predict(features_test)
        print(metrics.classification_report(labels_test, predicted))
        print(metrics.confusion_matrix(labels_test, predicted))
        
    print("----------------")

In [22]:
learn_classifiers_sum(model_CBOW)

6491
6022
NB
             precision    recall  f1-score   support

        0.0       0.61      0.61      0.61      1205
        1.0       0.64      0.64      0.64      1299

avg / total       0.62      0.62      0.62      2504

[[732 473]
 [474 825]]
KNN3
             precision    recall  f1-score   support

        0.0       0.82      0.46      0.59      1205
        1.0       0.64      0.91      0.75      1299

avg / total       0.73      0.69      0.67      2504

[[ 552  653]
 [ 120 1179]]
KNN4
             precision    recall  f1-score   support

        0.0       0.78      0.54      0.64      1205
        1.0       0.67      0.86      0.75      1299

avg / total       0.72      0.71      0.70      2504

[[ 656  549]
 [ 187 1112]]
KNN5
             precision    recall  f1-score   support

        0.0       0.82      0.42      0.55      1205
        1.0       0.63      0.91      0.74      1299

avg / total       0.72      0.67      0.65      2504

[[ 505  700]
 [ 114 1185]]
KNN6
   

In [29]:
learn_classifiers_sum(model_SG)

6174
5934
NB
             precision    recall  f1-score   support

        0.0       0.70      0.62      0.66      1187
        1.0       0.67      0.74      0.71      1235

avg / total       0.68      0.68      0.68      2422

[[733 454]
 [315 920]]
KNN3
             precision    recall  f1-score   support

        0.0       0.82      0.46      0.59      1187
        1.0       0.64      0.90      0.75      1235

avg / total       0.73      0.69      0.67      2422

[[ 548  639]
 [ 118 1117]]
KNN4
             precision    recall  f1-score   support

        0.0       0.77      0.57      0.66      1187
        1.0       0.67      0.84      0.74      1235

avg / total       0.72      0.71      0.70      2422

[[ 680  507]
 [ 203 1032]]
KNN5
             precision    recall  f1-score   support

        0.0       0.80      0.44      0.57      1187
        1.0       0.62      0.90      0.74      1235

avg / total       0.71      0.67      0.65      2422

[[ 517  670]
 [ 126 1109]]
KNN6
   

In [23]:
learn_classifiers_concat(model_CBOW)

6491
6022
NB
             precision    recall  f1-score   support

        0.0       0.67      0.54      0.60      1205
        1.0       0.64      0.76      0.69      1299

avg / total       0.66      0.65      0.65      2504

[[648 557]
 [314 985]]
KNN3
             precision    recall  f1-score   support

        0.0       0.82      0.44      0.57      1205
        1.0       0.64      0.91      0.75      1299

avg / total       0.73      0.68      0.66      2504

[[ 525  680]
 [ 113 1186]]
KNN4
             precision    recall  f1-score   support

        0.0       0.78      0.51      0.62      1205
        1.0       0.66      0.87      0.75      1299

avg / total       0.72      0.70      0.69      2504

[[ 620  585]
 [ 174 1125]]
KNN5
             precision    recall  f1-score   support

        0.0       0.81      0.40      0.53      1205
        1.0       0.62      0.91      0.74      1299

avg / total       0.71      0.67      0.64      2504

[[ 478  727]
 [ 111 1188]]
KNN6
   

In [28]:
learn_classifiers_concat(model_SG)

6174
5934
NB
             precision    recall  f1-score   support

        0.0       0.74      0.52      0.61      1187
        1.0       0.64      0.82      0.72      1235

avg / total       0.69      0.68      0.67      2422

[[ 619  568]
 [ 217 1018]]
KNN3
             precision    recall  f1-score   support

        0.0       0.83      0.47      0.60      1187
        1.0       0.64      0.91      0.75      1235

avg / total       0.73      0.69      0.68      2422

[[ 557  630]
 [ 112 1123]]
KNN4
             precision    recall  f1-score   support

        0.0       0.78      0.57      0.66      1187
        1.0       0.67      0.85      0.75      1235

avg / total       0.72      0.71      0.70      2422

[[ 671  516]
 [ 190 1045]]
KNN5
             precision    recall  f1-score   support

        0.0       0.80      0.41      0.54      1187
        1.0       0.61      0.90      0.73      1235

avg / total       0.71      0.66      0.64      2422

[[ 486  701]
 [ 118 1117]]
KNN6

In [37]:
learn_classifiers_PW(model_SG)

6174
5934
NB
             precision    recall  f1-score   support

        0.0       0.65      0.73      0.69      1187
        1.0       0.71      0.61      0.66      1235

avg / total       0.68      0.67      0.67      2422

[[872 315]
 [478 757]]
KNN3
             precision    recall  f1-score   support

        0.0       0.80      0.50      0.62      1187
        1.0       0.65      0.88      0.75      1235

avg / total       0.72      0.70      0.68      2422

[[ 597  590]
 [ 148 1087]]
KNN4
             precision    recall  f1-score   support

        0.0       0.78      0.61      0.68      1187
        1.0       0.69      0.83      0.75      1235

avg / total       0.73      0.72      0.72      2422

[[ 721  466]
 [ 206 1029]]
KNN5
             precision    recall  f1-score   support

        0.0       0.79      0.48      0.60      1187
        1.0       0.64      0.88      0.74      1235

avg / total       0.71      0.68      0.67      2422

[[ 567  620]
 [ 147 1088]]
KNN6
   

  'precision', 'predicted', average, warn_for)


In [38]:
learn_classifiers_PW(model_CBOW)

6491
6022
NB
             precision    recall  f1-score   support

        0.0       0.61      0.60      0.60      1205
        1.0       0.63      0.64      0.64      1299

avg / total       0.62      0.62      0.62      2504

[[717 488]
 [466 833]]
KNN3
             precision    recall  f1-score   support

        0.0       0.82      0.53      0.64      1205
        1.0       0.67      0.89      0.77      1299

avg / total       0.74      0.72      0.71      2504

[[ 635  570]
 [ 141 1158]]
KNN4
             precision    recall  f1-score   support

        0.0       0.77      0.62      0.69      1205
        1.0       0.70      0.83      0.76      1299

avg / total       0.74      0.73      0.73      2504

[[ 749  456]
 [ 223 1076]]
KNN5
             precision    recall  f1-score   support

        0.0       0.83      0.49      0.62      1205
        1.0       0.66      0.91      0.76      1299

avg / total       0.74      0.71      0.69      2504

[[ 591  614]
 [ 123 1176]]
KNN6
   

  'precision', 'predicted', average, warn_for)


### Тест: поизучать расстояние между векторами в парах

In [11]:
from scipy.spatial.distance import pdist

In [18]:
arr = [0] * 10
min = 1000
max = 0
for pair in hyperonims:
    try:
        w2v_1 = model.get_vector(pair[0] + '_NOUN')
        w2v_2 = model.get_vector(pair[1] + '_NOUN')
    except:
        continue
    rast = pdist([w2v_1, w2v_2], 'cosine')
    #rast = np.sqrt(np.sum((w2v_1 - w2v_2) ** 2))
    arr[int(rast * 10)] += 1
    if min >= rast and rast > 0:
        min = rast
    if max < rast:
        max = rast
print(min)
print(max)
print(arr)

[2.22044605e-16]
[0.95612142]
[1, 0, 5, 33, 309, 576, 831, 1061, 482, 33]


In [19]:
1.0 - 1.3

-0.30000000000000004

In [26]:
arr = [0] * 15
min = 1000
max = 0
for pair in not_hyperonims:
    try:
        w2v_1 = model.get_vector(pair[0] + '_NOUN')
        w2v_2 = model.get_vector(pair[1] + '_NOUN')
    except:
        continue
    rast = pdist([w2v_1, w2v_2], 'cosine')
    #rast = np.sqrt(np.sum((w2v_1 - w2v_2) ** 2))
    arr[int(rast * 10)] += 1
    if min >= rast and rast > 0:
        min = rast
    if max < rast:
        max = rast
print(min)
print(max)
print(arr)

[1.11022302e-16]
[1.03514759]
[453, 22, 129, 796, 5351, 16418, 41047, 97500, 138751, 23091, 75, 0, 0, 0, 0]


In [23]:
4-8
0.9 - 1.3

-0.4

In [24]:

arr = [0] * 20
min = 1000
max = 0
for pair in relations_hyp_pos:
    try:
        w2v_1 = model.get_vector(pair[0] + '_NOUN')
        w2v_2 = model.get_vector(pair[1] + '_NOUN')
    except:
        continue
    rast = pdist([w2v_1, w2v_2], 'cosine')
    #rast = np.sqrt(np.sum((w2v_1 - w2v_2) ** 2))
    arr[int(rast * 10)] += 1
    if min >= rast and rast > 0:
        min = rast
    if max < rast:
        max = rast
print(min)
print(max)
print(arr)


[0.16472771]
[0.92994878]
[0, 4, 25, 161, 663, 873, 678, 348, 97, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [25]:
arr = [0] * 20
min = 1000
max = 0
for pair in relations_hyp_neg:
    try:
        w2v_1 = model.get_vector(pair[0] + '_NOUN')
        w2v_2 = model.get_vector(pair[1] + '_NOUN')
    except:
        continue
    rast = pdist([w2v_1, w2v_2], 'cosine')
    #rast = np.sqrt(np.sum((w2v_1 - w2v_2) ** 2))
    arr[int(rast * 10) ] += 1
    if min >= rast and rast > 0:
        min = rast
    if max < rast:
        max = rast
print(min)
print(max)
print(arr)

[0.19989802]
[0.92795374]
[0, 2, 14, 130, 460, 934, 823, 292, 42, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
