In [6]:
import django

In [7]:
django.setup()

In [8]:
from corpustool.models import *

In [11]:
from stattool.settings import LANG_MODEL_PATH, TEXT_FILE_PATH

In [9]:
import os

In [4]:
help(os.path.exists)

Help on function exists in module genericpath:

exists(path)
    Test whether a path exists.  Returns False for broken symbolic links



In [3]:
os.getcwd()

'D:\\Studies\\3rd-year-thesis\\Code\\stattool'

In [17]:
data_folder = TEXT_FILE_PATH

In [18]:
data_folder

'C:\\HP_PC\\hp_pc\\Studies\\3rd-year-thesis\\Data\\realec_dump_2019_2_10_16_7_20_6_41_0'

In [19]:
from corpustool.management.commands.udpiper import Udpiper

In [20]:
import re

In [21]:
def escapepath(fp):
    '''
    Flattens a file path into a single name without path separators.

    Arguments:
    fp - path string; every occurrence of the platform separator (os.sep)
         and of the forward slash is replaced with '%'
    '''
    flattened = fp
    for separator in (os.sep, '/'):
        flattened = flattened.replace(separator, '%')
    return flattened

In [22]:
def try_to_get(Model, *args, **kwargs):
    '''
    Fetches the Model instance matching the given properties; when no such
    instance exists, builds one from the same properties, saves it and
    returns it.

    Arguments:
    Model - a class inherited from django models.Model
    *args, **kwargs - unnamed and named properties of the needed object;
                      forwarded unchanged both to Model.objects.get() and,
                      on a miss, to the Model constructor
    '''
    try:
        return Model.objects.get(*args, **kwargs)
    except Model.DoesNotExist:
        # Nothing stored yet: create the row from the very same arguments.
        fresh = Model(*args, **kwargs)
        fresh.save()
        return fresh

In [23]:
def extension_split(x):
    '''
    Returns path x without its final extension.

    Uses os.path.splitext, so a path with no extension is returned unchanged
    (slicing up to rfind('.') cut off the last character in that case) and
    dots inside directory names are left alone.

    Arguments:
    x - path string, e.g. 'data/esl_00011.txt' -> 'data/esl_00011'
    '''
    return os.path.splitext(x)[0]

In [64]:
class TextBaseFiller(object):
    '''
    Fills the corpus database from a folder of '.txt' files, one Document
    per file.

    Every file is either parsed with UDPipe (inp='raw_text') or read as
    ready-made CoNLL-U text (inp='conllu'). Each token line becomes an
    Occurence that points to a Token and a Lemma, which are created on first
    sight. Files are consumed in the order of self.text_filenames;
    self.current_text_id is the index of the next file to process and is
    only advanced after a file has been stored completely.
    '''

    def __init__(self, inp='raw_text', model=None, folder='.', recursive=True, include_metadata=False):
        '''
        Arguments:
        inp - 'raw_text' (files are parsed with UDPipe) or 'conllu'
              (files are read as is)
        model - value passed to Udpiper; required when inp == 'raw_text'
        folder - folder that is searched for files ending with '.txt'
        recursive - when True subfolders are searched too (os.walk),
                    otherwise only the folder itself (os.listdir)
        include_metadata - when True, process_next() also calls
                           process_metadata() for every document

        Raises ValueError when inp is unknown or when the model is missing.
        '''
        if inp == 'raw_text':
            if not model:
                # ValueError is a subclass of Exception, so callers that
                # caught the former generic Exception keep working.
                raise ValueError("No model for UDPipe specified")
            self.piper = Udpiper(model)
        elif inp != 'conllu':
            raise ValueError("Only 'raw_text' or 'conllu' can be passed as 'inp' argument")
        self.inp = inp

        self.include_metadata = include_metadata
        self.folder = folder
        if recursive:
            found = [os.path.join(root, name)
                     for root, _dirs, names in os.walk(self.folder)
                     for name in names
                     if name.endswith('.txt')]
        else:
            found = [os.path.join(self.folder, name)
                     for name in os.listdir(self.folder)
                     if name.endswith('.txt')]
        # Sorted so that current_text_id refers to the same file on every
        # run, whatever order the operating system lists the folder in.
        self.text_filenames = sorted(found)
        self.current_text_id = 0


    def process_all(self, show_titles=False, show_time = False):
        '''
        Processes every file that has not been processed yet.

        Arguments:
        show_titles - print the path of each file before processing it
        show_time - print the seconds elapsed since the call started
                    before processing each file
        '''
        remaining = len(self.text_filenames) - self.current_text_id
        self.process_next_n(remaining, show_titles=show_titles, show_time=show_time)


    def process_next_n(self, n, show_titles=False, show_time = False):
        '''
        Processes at most n further files, stopping at the end of the list.

        Arguments:
        n - maximal number of files to process
        show_titles - print the path of each file before processing it
        show_time - print the seconds elapsed since the call started
                    before processing each file
        '''
        st_time = time.time()
        limit = min(self.current_text_id + n, len(self.text_filenames))
        while self.current_text_id < limit:
            if show_titles:
                print(self.text_filenames[self.current_text_id])
            if show_time:
                print(time.time() - st_time)
            self.process_next()


    def process_next(self):
        '''
        Stores the file at index current_text_id as a new Document and
        advances current_text_id.

        Prints a notice and returns when all files are already processed.
        '''
        if self.current_text_id >= len(self.text_filenames):
            print('Specified folder is fully processed')
            return

        # Local import: the notebook only does "import django" at the top.
        from django.db import transaction

        fn = self.text_filenames[self.current_text_id]

        # Parse before touching the database, so a parsing failure leaves
        # no Document behind.
        if self.inp == 'conllu':
            with open(fn, 'r', encoding='utf-8') as f:
                processed = f.read()
        else:
            processed = self.piper.process_file(fn)

        # Title is the part of the path after the folder prefix, flattened
        # by escapepath.
        title = escapepath(fn[len(self.folder):])

        # One transaction per document: an error or interruption rolls the
        # whole document back instead of leaving it half-written (and
        # duplicated on retry), and the rows are committed once instead of
        # once per save().
        with transaction.atomic():
            self.textobj = Document(title=title)
            if self.include_metadata:
                self.process_metadata()
            self.textobj.save()
            self.write_data(processed)
        self.current_text_id += 1


    def write_data(self, processed):
        '''
        Splits CoNLL-U text into sentences (blocks separated by an empty
        line) and stores each of them with write_sent().

        Arguments:
        processed - CoNLL-U text of one document
        '''
        for sent in processed.split('\n\n'):
            # Whitespace-only chunks (e.g. after the final blank line) hold
            # no sentence.
            if sent.strip():
                self.write_sent(sent)

    def write_sent(self, sent):
        '''
        Stores the tokens of one CoNLL-U sentence as Occurence rows of the
        current document (self.textobj).

        Comment lines (starting with '#') are recognised by content, not by
        position: the sentence number is read from the '# sent_id = N' line
        and all other comments are ignored. Every remaining non-empty line
        must have the ten tab-separated CoNLL-U columns and an integer ID;
        multiword-token ranges ('1-2') and empty nodes ('1.1') are not
        supported and make int() raise ValueError.

        NOTE: HEAD and DEPREL are read but not stored; DepRel rows are
        currently not created.

        Arguments:
        sent - text of one sentence block

        Raises ValueError when the block has no numeric '# sent_id' comment.
        '''
        sent_id = None
        token_lines = []
        for line in sent.splitlines():
            if line.startswith('#'):
                found = re.match('# sent_id = ([0-9]+)', line)
                if found and sent_id is None:
                    sent_id = int(found.group(1))
            elif line.strip():
                token_lines.append(line)
        if sent_id is None:
            raise ValueError("No numeric '# sent_id' comment in sentence: %r" % sent[:80])

        for line in token_lines:
            ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = line.split('\t')
            ID = int(ID)
            # MISC may carry several '|'-separated attributes, so look for
            # the flag among them instead of comparing the whole field.
            space_after = 'SpaceAfter=No' not in MISC.split('|')
            ## Check if lemma exists and if not add to db:
            lemma = try_to_get(Lemma, text=LEMMA, pos=UPOS)
            ## Check if token exists and if not add to db:
            token = try_to_get(Token, text=FORM, lemma=lemma, grammar=FEATS)
            ## Add new Occurence to db:
            occurence = Occurence(document=self.textobj, token=token, sentence=sent_id,
                                  index=ID, space_after=space_after)
            occurence.save()

    def process_metadata(self):
        '''
        Copies the 'sex', 'date' and 'mark' values onto self.textobj from
        the JSON file that has the same name as the current text file but
        the '.json' extension. Does nothing when that file does not exist;
        keys missing from the JSON are skipped.
        '''
        # Local import: json is not imported anywhere else in the notebook.
        import json

        text_path = self.text_filenames[self.current_text_id]
        fn = os.path.splitext(text_path)[0] + '.json'
        if not os.path.exists(fn):
            return
        with open(fn, 'r', encoding='utf-8') as inp:
            # json.load takes the file object itself, not its contents.
            meta = json.load(inp)
        for attr in ('sex', 'date', 'mark'):
            if attr in meta:
                setattr(self.textobj, attr, meta[attr])

In [65]:
model = LANG_MODEL_PATH

In [66]:
model

'C:\\HP_PC\\hp_pc\\Studies\\3rd-year-thesis\\Udpipe_Models\\english-ewt-ud-2.3-181115.udpipe'

In [67]:
base_filler = TextBaseFiller(model=model, folder=data_folder)

In [37]:
import time

In [68]:
len(base_filler.text_filenames)

14359

In [69]:
base_filler.process_next_n(100, show_titles=True, show_time=True)

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data\2012-2014\esl_00011.txt
0.0
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data\2012-2014\esl_00012.txt
92.94651365280151
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data\2012-2014\esl_00013.txt
156.52012991905212
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data\2012-2014\esl_00014.txt
241.44506359100342


KeyboardInterrupt: 

In [70]:
Lemma.objects.all()

<QuerySet [<Lemma: Lemma object (952)>, <Lemma: Lemma object (953)>, <Lemma: Lemma object (954)>, <Lemma: Lemma object (955)>, <Lemma: Lemma object (956)>, <Lemma: Lemma object (957)>, <Lemma: Lemma object (958)>, <Lemma: Lemma object (959)>, <Lemma: Lemma object (960)>, <Lemma: Lemma object (961)>, <Lemma: Lemma object (962)>, <Lemma: Lemma object (963)>, <Lemma: Lemma object (964)>, <Lemma: Lemma object (965)>, <Lemma: Lemma object (966)>, <Lemma: Lemma object (967)>, <Lemma: Lemma object (968)>, <Lemma: Lemma object (969)>, <Lemma: Lemma object (970)>, <Lemma: Lemma object (971)>, '...(remaining elements truncated)...']>

In [71]:
# Reset the corpus tables: delete every Document, then every Token, then every Lemma row.
# NOTE(review): Occurence rows are presumably removed by cascade from Document/Token -
# confirm against the model definitions.
Document.objects.all().delete()
Token.objects.all().delete()
Lemma.objects.all().delete()

(530, {'corpustool.Lemma': 530})

Восстанавливаем исходный текст из базы данных:

In [31]:
# Rebuild the text of the first stored document from its occurrences.
first_doc = Document.objects.all()[0]
# order_by: without an explicit ordering the database is free to return the
# rows in any order, which would scramble the text; tokens are ordered by
# sentence number and then by position inside the sentence.
# select_related: fetches each Token in the same query instead of issuing
# one extra query per occurrence when .token is accessed.
for occurence in (Occurence.objects
                  .filter(document=first_doc)
                  .select_related('token')
                  .order_by('sentence', 'index')):
    # A token is followed by a space unless space_after is False.
    end = ' ' if occurence.space_after else ''
    print(occurence.token.text, end=end)

This episode is about a very interesting case in which a complainant wants to have 1.2 million dollars as a Compensatory damage as she fell down because of defective eruv wire. Will Gardner, a founder of a law firm, informed his employee Alicia about This case and demanded to deal with it. He noticed that it is a very sensitive issue for the company as a defendant is a daughter of another founder. Alicia and her colleague Kalinda imagined that they would see a sophisticated and spoiled woman named Anna so they were very surprised to see a modest woman who lived in a rather usual house with her Jewish husband. she explained that she had a Shabbat the day when the incident had happened so she couldn't use any electrical devices in order to tell someone about a defective eruv wire. she also presented her husband to them and He once more accentuated that it wasn't their fault. Then there were an acquaintance with another lawyer Alprin, who investigated This case. He expected Alicia to go a

In [29]:
base_filler.text_filenames[0]

'D:\\Studies\\3rd-year-thesis\\Data\\realec_dump_2019_2_10_16_7_20_6_41_0\\data\\2012-2014\\esl_00011.txt'

In [22]:
Token.objects.all().delete()

(2546, {'corpustool.Token': 2546})

In [23]:
Lemma.objects.all().delete()

(2121, {'corpustool.Lemma': 2121})

In [13]:
Occurence.objects.all().delete()

(0, {})

In [62]:
DepRel.objects.all()

<QuerySet [<DepRel: DepRel object (1)>, <DepRel: DepRel object (2)>, <DepRel: DepRel object (3)>, <DepRel: DepRel object (4)>, <DepRel: DepRel object (5)>, <DepRel: DepRel object (6)>, <DepRel: DepRel object (7)>, <DepRel: DepRel object (8)>, <DepRel: DepRel object (9)>, <DepRel: DepRel object (10)>, <DepRel: DepRel object (11)>, <DepRel: DepRel object (12)>, <DepRel: DepRel object (13)>, <DepRel: DepRel object (14)>, <DepRel: DepRel object (15)>, <DepRel: DepRel object (16)>, <DepRel: DepRel object (17)>, <DepRel: DepRel object (18)>, <DepRel: DepRel object (19)>, <DepRel: DepRel object (20)>, '...(remaining elements truncated)...']>

In [15]:
# Print the typerel value of every stored DepRel row, one per line.
for deprel in DepRel.objects.all():
    print(deprel.typerel)

det
nsubj
cop
case
det
advmod
amod
root
case
obl
det
nsubj
acl:relcl
mark
xcomp
compound
nummod
obj
case
det
compound
obl
mark
nsubj
advcl
advmod
case
fixed
amod
compound
obl
punct
aux
nsubj
punct
det
appos
case
det
compound
nmod
punct
root
nmod:poss
compound
obj
case
det
nmod
cc
conj
mark
xcomp
case
obl
punct
nsubj
root
mark
nsubj
cop
det
advmod
amod
nsubj
case
det
nmod
case
det
nmod
cop
det
ccomp
case
det
nmod
punct
nsubj
cc
nmod:poss
compound
conj
root
mark
nsubj
aux
ccomp
det
amod
cc
conj
obj
acl
obj
advmod
nsubj
cop
advmod
parataxis
mark
advcl
det
amod
obj
nsubj
acl:relcl
case
det
advmod
amod
obl
case
nmod:poss
amod
obl
punct
nsubj
root
mark
nsubj
ccomp
det
obj
det
obl:tmod
mark
det
nsubj
aux
advcl
advmod
nsubj
compound
case
obj
acl
det
amod
obj
mark
fixed
mark
advcl
obj
case
det
amod
compound
nmod
punct
nsubj
advmod
root
nmod:poss
obj
case
obl
cc
nsubj
advmod
advmod
conj
mark
nsubj
ccomp
punct
conj
nmod:poss
obj
punct
advmod
expl
root
det
nsubj
case
det
compound
nmod
punct
nsubj
