In [1]:
from parsing import ParserUDpipe

In [2]:
#parser = ParserUDpipe("It is an example.")
#df = parser.conllu2df()

In [3]:
#df.head(10)

In [4]:
# проверить на punct

# Lexical Complexity

In [29]:
import collections
import json
import math

open_class = ["NOUN", "VERB", "ADV", "ADJ"]
with open('lists.json') as data_file:
    lists = json.load(data_file)
fivetfrequentCOCA = lists['5000frequentCOCA']
frequentverbsCOCAfromfivet = lists['frequentverbsCOCAfrom5000']
uwl = lists['UWL']

class LexicalComplexity:
    """Returns values of lexical criteria."""
    
    def __init__(self, text):
        self.text = text
    
    def get_verb_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'VERB']['Lemma']
    
    def get_noun_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'NOUN']['Lemma']

    def get_adj_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'ADJ']['Lemma']
    
    def get_adv_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'] == 'ADV']['Lemma']
    
    def get_lex_lemmas(self):
        df = parser.conllu2df()
        return df[df['UPosTag'].isin(open_class)]['Lemma']
    
    def get_lemmas(self):
        df = parser.conllu2df()
        return df['Lemma']
    
    def devision(self, list1, list2):
        try:
            return len(list1)/len(list2)
        except:
            return 0
        
    def corrected_devision(self, list1, list2):
        try:
            return len(list1)/math.sqrt(2*len(list2))
        except:
            return 0
        
    def root_devision(self, list1, list2):
        try:
            return len(list1)/math.sqrt(len(list2))
        except:
            return 0
        
    def squared_devision(self, list1, list2):
        try:
            return len(list1)**2/len(list2)
        except:
            return 0
        
    def log_devision(self, list1, list2):
        try:
            return math.log(len(list1))/math.log(len(list2))
        except:
            return 0
        
    def uber(self, list1, list2):
        try:
            return math.log(len(list1))**2/math.log(len(set(list2))/len(list1))
        except:
            return 0

    def density(self, punct=False):
        """
        number of lexical tokens/number of tokens
        """
        lex_lemmas = self.get_lex_lemmas()
        lemmas = self.get_lemmas()
        return self.devision(lex_lemmas, lemmas)
    
    def LS(self):
        """
        number of sophisticated lexical tokens/number of lexical tokens
        """
        lex_lemmas = self.get_lex_lemmas()
        soph_lex_lemmas = [i for i in lex_lemmas if i not in fivetfrequentCOCA]
        return self.devision(soph_lex_lemmas, lex_lemmas)
    
    def VS(self):
        """
        number of sophisticated verb lemmas/number of verb tokens
        """
        verb_lemmas = self.get_verb_lemmas()
        soph_verbs = set([i for i in verb_lemmas if i not in frequentverbsCOCAfromfivet])
        VSI = self.devision(soph_verbs, verb_lemmas)
        VSII = self.corrected_devision(soph_verbs, verb_lemmas)
        VSIII = self.squared_devision(soph_verbs, verb_lemmas)
        return VSI, VSII, VSIII

    def LFP(self):
        """
        Lexical Frequency Profile is the proportion of tokens:
        first - 1000 most frequent words
        second list - the second 1000
        third - University Word List (Xue & Nation 1989)
        none - list of those that are not in these lists
        """
        lemmas = self.get_lemmas()
        first = [i for i in lemmas if i in fivetfrequentCOCA[0:1000]]
        second = [i for i in lemmas if i in fivetfrequentCOCA[1000:2000]]
        third = [i for i in lemmas if i in uwl]
        first_procent = self.devision(first, lemmas)
        second_procent = self.devision(second, lemmas)
        third_procent = self.devision(third, lemmas)
        none = 1 - (first_procent + second_procent + third_procent)
        return first_procent, second_procent , third_procent, none
    
    def NDW(self):
        """
        number of lemmas
        """
        lemmas = self.get_lemmas()
        return len(set(lemmas))
    
    def TTR(self):
        """
        number of lemmas/number of tokens
        """
        lemmas = set(self.get_lemmas())
        tokens = self.get_lemmas()
        TTR = self.devision(lemmas, tokens)
        CTTR = self.corrected_devision(lemmas, tokens)
        RTTR = self.root_devision(lemmas, tokens)
        LogTTR = self.log_devision(lemmas, tokens)
        Uber = self.uber(lemmas, tokens)
        D = None
        return TTR, CTTR, RTTR, LogTTR, Uber, D

    def LV(self):
        """
        number of lexical lemmas/number of lexical tokens
        """
        lex_lemmas = set(self.get_lex_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return len(lex_lemmas)/len(lex_tokens)
    
    def VV(self):
        """
        VVI: number of verb lemmas/number of verb tokens
        VVII: number of verb lemmas/number of lexical tokens
        """
        verb_lemmas = set(self.get_verb_lemmas())
        verb_tokens = self.get_verb_lemmas()
        lex_tokens = self.get_lex_lemmas()
        VVI = self.devision(verb_lemmas, verb_tokens)
        SVVI = self.squared_devision(verb_lemmas, verb_tokens)
        CVVI = self.corrected_devision(verb_lemmas, verb_tokens)
        VVII = self.devision(verb_lemmas, lex_tokens)
        return VVI, SVVI, CVVI, VVII
        
    def NV(self):
        """
        number of noun lemmas/number of lexical tokens
        """
        noun_lemmas = set(self.get_noun_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.devision(noun_lemmas, lex_tokens)

    def AdjV(self):
        """
        number of adjective lemmas/number of lexical tokens
        """
        adj_lemmas = set(self.get_adj_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.devision(adj_lemmas, lex_tokens)
    
    def AdvV(self):
        """
        number of adverb lemmas/number of lexical tokens
        """
        adv_lemmas = set(self.get_adv_lemmas())
        lex_tokens = self.get_lex_lemmas()
        return self.devision(adv_lemmas, lex_tokens)
    
    def ModV(self):
        return self.AdjV() + self.AdvV()

In [30]:
example = "He hisses him."
parser = ParserUDpipe(example)
#print(parser.conllu2df())
LC = LexicalComplexity(example)

In [31]:
dict_lex_comp = {'density': LC.density(), 'LS': LC.LS(), 'VSI': LC.VS()[0],
                 'VSII': LC.VS()[1], 'VSIII': LC.VS()[2], 'LFP_first': LC.LFP()[0], 
                 'LFP_second': LC.LFP()[1], 'LFP_third': LC.LFP()[2], 'LFP_none': LC.LFP()[3], 
                 'NDW': LC.NDW(), 'TTR': LC.TTR()[0], 'CTTR': LC.TTR()[1], 'RTTR': LC.TTR()[2], 
                 'LogTTR': LC.TTR()[3], 'Uber': LC.TTR()[4], 'D': LC.TTR()[5], 
                 'LV': LC.LV(), 'VVI': LC.VV()[0], 'SVVI': LC.VV()[1], 'CVVI': LC.VV()[2],
                 'VVII': LC.VV()[3], 'NV': LC.NV(), 'AdjV': LC.AdjV(), 'AdvV': LC.AdvV(), 
                 'ModV': LC.ModV()}  

In [32]:
dict_lex_comp

{'AdjV': 0.0,
 'AdvV': 0.0,
 'CTTR': 1.0606601717798212,
 'CVVI': 0.7071067811865475,
 'D': None,
 'LFP_first': 0.5,
 'LFP_none': 0.5,
 'LFP_second': 0.0,
 'LFP_third': 0.0,
 'LS': 1.0,
 'LV': 1.0,
 'LogTTR': 0.7924812503605781,
 'ModV': 0.0,
 'NDW': 3,
 'NV': 0.0,
 'RTTR': 1.5,
 'SVVI': 1.0,
 'TTR': 0.75,
 'Uber': 0,
 'VSI': 1.0,
 'VSII': 0.7071067811865475,
 'VSIII': 1.0,
 'VVI': 1.0,
 'VVII': 1.0,
 'density': 0.25}

In [65]:
#with open('5000frequentCOCA.csv', 'r') as file:
#    f = file.read()
#f = f.replace('  ', '')
#with open('5000frequentCOCA.csv', 'w') as file:
#    file.write(f)

In [25]:
import pandas as pd

In [78]:
df1 = pd.read_csv('5000frequentCOCA.csv')
df2 = pd.read_csv('frequentverbsCOCAfrom5000.csv')
with open('UWL.txt', 'r') as file:
    f = file.read()
uwl = f.split()

In [79]:
import json

import io
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Write JSON file
data = {'5000frequentCOCA': list(df1['Word']), 
        'frequentverbsCOCAfrom5000': list(df1['Word']), 
        'UWL': uwl}
with io.open('lists.json', 'w', encoding='utf8') as outfile:
    str_ = json.dumps(data,
                      indent=4, sort_keys=True,
                      separators=(',', ': '), ensure_ascii=False)
    outfile.write(to_unicode(str_))

# Read JSON file
with open('lists.json') as data_file:
    data_loaded = json.load(data_file)

print(data == data_loaded)

True


In [None]:
# проверить ord/lemma/token/verb

In [66]:
df = pd.read_csv('5000frequentCOCA.csv')

In [67]:
df.head()

Unnamed: 0,Rank,Word,Part_of_speech,Frequency,Dispersion
0,1,the,a,22038615,0.98
1,2,be,v,12545825,0.97
2,3,and,c,10741073,0.99
3,4,of,i,10343885,0.97
4,5,a,a,10144200,0.98
