In [37]:
import gensim
import pandas as pd
from lxml import html
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.wsd import lesk
from pymorphy2 import MorphAnalyzer
from string import punctuation
import json, os
from collections import Counter
import numpy as np
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
# Morphological analyzer used by normalize() to obtain normal forms.
# NOTE(review): pymorphy2 presumably loads its default (Russian) dictionaries here,
# while the stop words below are English — confirm this mix is intended.
morph = MorphAnalyzer()
# ASCII punctuation plus typographic quotes, dashes, ellipsis, '*' and '№';
# these characters are stripped from the edges of every token.
punct = punctuation+'«»—…“”*№–'
# English stop words from NLTK, stored as a set; tokenize() filters against it.
stops = set(stopwords.words('english'))

In [39]:
def normalize(text):
    """Return the normal forms of the words in ``text``.

    The text is lowercased and split on whitespace, each piece is stripped of
    leading/trailing ``punct`` characters, pieces that become empty are
    dropped, and the rest are replaced by the ``normal_form`` of the first
    parse returned by ``morph.parse``.

    Args:
        text: input string.

    Returns:
        list of str: normal forms, in the order the words appear in ``text``.
    """
    lemmas = []
    for raw_piece in text.lower().split():
        cleaned = raw_piece.strip(punct)
        if not cleaned:
            # Piece consisted only of punctuation.
            continue
        lemmas.append(morph.parse(cleaned)[0].normal_form)
    return lemmas

def tokenize(text):
    """Split ``text`` into lowercase tokens without punctuation or stop words.

    The text is lowercased and split on whitespace; every token is stripped of
    leading/trailing characters listed in ``punct``; tokens that end up empty
    or that appear in ``stops`` are discarded. No lemmatization is performed.

    Args:
        text: input string.

    Returns:
        list of str: the surviving tokens, in their original order.
    """
    # Strip punctuation BEFORE the stop-word lookup: a stop word glued to
    # punctuation (e.g. "the," or "(a") would otherwise not be recognised
    # and would leak into the result.
    stripped = (word.strip(punct) for word in text.lower().split())
    return [word for word in stripped if word and word not in stops]

In [40]:
# Fetch the WordNet corpus that wn.synsets() reads; NLTK only reports
# "already up-to-date" when the package is present locally.
# NOTE(review): stopwords.words('english') in the setup cell also needs the
# 'stopwords' corpus — confirm it is downloaded on a fresh environment.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/kata/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [120]:
def lesk(word, sentence):
    """Choose a WordNet sense of ``word`` by definition/context overlap.

    Each synset of ``word`` is scored by the number of distinct tokens
    (as produced by ``tokenize``) that its definition shares with
    ``sentence``. The first synset with the strictly highest non-zero score
    wins.

    NOTE: this definition shadows the ``lesk`` imported from ``nltk.wsd``
    at the top of the notebook.

    Args:
        word: the word to disambiguate.
        sentence: the context in which the word occurs.

    Returns:
        tuple: ``(index, definition)`` where ``index`` is the 1-based position
        of the chosen synset in ``wn.synsets(word)``; ``(0, <fallback message>)``
        when no definition shares a token with the sentence.
    """
    context = set(tokenize(sentence))
    winner = (0, 'cannot define your world, try another sentence')
    top_score = 0
    for position, candidate in enumerate(wn.synsets(word), start=1):
        gloss = candidate.definition()
        score = len(context.intersection(tokenize(gloss)))
        # Strict comparison keeps the earliest synset on ties.
        if score > top_score:
            top_score = score
            winner = (position, gloss)
    return winner

#### Теперь проверим, как работает наша функция с примерами для каждого из значений:

In [98]:
lesk('day', 'two days later they left')

(0, 'cannot define your world, try another sentence')

In [101]:
lesk('day', 'after that day she never trusted him again')

(3, 'a day assigned to a particular purpose or observance')

In [102]:
lesk('day', "What did you prepared for Mother's Day")

(3, 'a day assigned to a particular purpose or observance')

In [104]:
lesk('day', 'it is easier to make the repairs in the daytime')

(1, 'time for Earth to make a complete rotation on its axis')

In [107]:
lesk('day', 'she called it a day and went to bed')

(3, 'a day assigned to a particular purpose or observance')

In [110]:
lesk('day', 'in the days of sailing ships')

(0, 'cannot define your world, try another sentence')

In [112]:
lesk('day','how long is a day on Jupiter?')

(3, 'a day assigned to a particular purpose or observance')

In [117]:
lesk('day', 'he deserves his day in court')

(3, 'a day assigned to a particular purpose or observance')

Не очень хорошо. Похоже, единственное определение, в котором содержится слово day, — третье, поэтому практически все примеры и попали под это определение. А если бы мы не считали слово day в совпадениях, то их бы вообще не было и ничего нельзя было бы определить. Улучшим нашу функцию, добавив, помимо определения из словаря, слова из примеров:

In [121]:
def improved_lesk(word, sentence):
    """Choose a WordNet sense of ``word`` using definitions and usage examples.

    Each synset of ``word`` gets a signature made of the tokens (as produced
    by ``tokenize``) of its definition and of all its examples. The synset
    is scored by the number of distinct tokens the signature shares with
    ``sentence``; the first synset with the strictly highest non-zero score
    wins.

    Args:
        word: the word to disambiguate.
        sentence: the context in which the word occurs.

    Returns:
        tuple: ``(index, definition)`` where ``index`` is the 1-based position
        of the chosen synset in ``wn.synsets(word)``; ``(0, <fallback message>)``
        when no signature shares a token with the sentence.
    """
    context = set(tokenize(sentence))
    winner = (0, 'cannot define your world, try another sentence')
    top_score = 0
    for position, candidate in enumerate(wn.synsets(word), start=1):
        gloss = candidate.definition()
        # Signature = tokens of the definition plus tokens of every example.
        sources = [gloss, *candidate.examples()]
        signature = {token for source in sources for token in tokenize(source)}
        score = len(context & signature)
        # Strict comparison keeps the earliest synset on ties.
        if score > top_score:
            top_score = score
            winner = (position, gloss)
    return winner

#### Протестируем работу новой функции:

In [123]:
improved_lesk('day', 'There are 365 days in a year')

(1, 'time for Earth to make a complete rotation on its axis')

In [126]:
improved_lesk('day', 'One day I will be a star')

(8,
 'the time for one complete rotation of the earth relative to a particular star, about 4 minutes shorter than a mean solar day')

In [127]:
improved_lesk('day', "What have you prepared for Mother's day?")

(3, 'a day assigned to a particular purpose or observance')

In [132]:
improved_lesk('day', "I like days more then nights, i hate when it is dark outside")

(1, 'time for Earth to make a complete rotation on its axis')

In [133]:
improved_lesk('day', "I was busy and worked all day")

(5,
 'the recurring hours when you are not sleeping (especially those when you are working)')

In [138]:
improved_lesk('day', "Life was difficult in the days of Vikings")

(1, 'time for Earth to make a complete rotation on its axis')

In [139]:
improved_lesk('day', 'how long is a day on Mars?')

(7,
 'the period of time taken by a particular planet (e.g. Mars) to make a complete rotation on its axis')

In [141]:
improved_lesk('day', 'everyone deserves a day of shine and glory')

(9, 'a period of opportunity')

Как видно, результаты, хоть и не совершенны, но уже намного лучше!