In [18]:
from corpustool.models import *

In [19]:
import os

In [20]:
import json

In [21]:
import time

In [22]:
import re

In [23]:
import functools

In [24]:
from django.db import DataError

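Самописный кэширующий декоратор: обёрнутая функция возвращает None, если она уже вызывалась с таким же набором аргументов (то есть набор уже есть в кэше). `functools.lru_cache` здесь не подходит, так как экземпляры моделей без первичного ключа нехешируемы (Model instances without primary key value are unhashable).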

In [25]:
def cached(func, *args, **kwargs):  # extra arguments are accepted for compatibility and ignored
    """Decorator that lets ``func`` run only once per distinct argument set.

    The first call with a given ``(args, kwargs)`` combination is forwarded
    to ``func`` and its result is returned.  Every later call with an equal
    combination returns ``None`` without calling ``func`` again.

    Arguments are compared with ``==`` against a list of previous calls, so
    they do not have to be hashable (``functools.lru_cache`` would require
    that).  The price is a linear scan on every call.

    The call history is kept per decorated function, so two different
    decorated functions never affect each other's results.

    The returned wrapper exposes ``cache_clear()``, which forgets all
    previously seen calls of that wrapper.
    """
    seen_calls = []

    def cache_clear():
        # Clear in place so the closure keeps pointing at the same list.
        del seen_calls[:]

    @functools.wraps(func)
    def find_in_cache(*args, **kwargs):
        call = (args, kwargs)
        if call in seen_calls:
            return None
        seen_calls.append(call)
        return func(*args, **kwargs)

    find_in_cache.cache_clear = cache_clear
    return find_in_cache

In [26]:
def parse_rus_date(s):
    """Convert a ``DD.MM.YYYY`` date string to ISO ``YYYY-MM-DD``.

    Arguments:
    s - the date text, e.g. ``'29.03.2017'``

    Returns the ISO-formatted string, or ``None`` when ``s`` is not a string,
    does not have exactly the ``DD.MM.YYYY`` shape, contains non-digit
    characters, or does not denote a real calendar date (e.g. ``31.02.2017``).
    """
    import datetime  # local import: only needed for calendar validation

    if not isinstance(s, str):
        return None

    parts = s.split('.')
    if len(parts) != 3:
        return None

    day, month, year = parts
    if (len(day), len(month), len(year)) != (2, 2, 4):
        return None
    # Restrict to ASCII digits so the result is always a plain ISO string.
    if not all(ch in '0123456789' for ch in day + month + year):
        return None

    try:
        datetime.date(int(year), int(month), int(day))
    except ValueError:
        # Out-of-range day/month/year combination.
        return None

    return year + '-' + month + '-' + day

In [27]:
def check_int(s):
    """Tell whether ``int(s)`` succeeds.

    Returns ``True`` when ``s`` can be converted with ``int()`` and ``False``
    when the conversion raises ``TypeError``, ``ValueError`` or
    ``OverflowError``.  Anything ``int()`` accepts counts, so floats and
    bools also yield ``True``.

    Only conversion errors are caught, so unrelated exceptions such as
    ``KeyboardInterrupt`` still propagate.
    """
    try:
        int(s)
    except (TypeError, ValueError, OverflowError):
        return False
    return True

In [28]:
parse_rus_date('29.03.2017')

'2017-03-29'

In [29]:
import spacy

In [30]:
from stattool.settings import TEXT_FILE_PATH

In [31]:
# spaCy pipeline loaded by name; it is later passed to TextBaseFiller as the
# `model` argument and called on the raw text of each file.
nlp = spacy.load("en_core_web_sm")

In [32]:
def escapepath(fp):
    """Replace every path separator in ``fp`` with ``'%'``.

    Both the platform separator (``os.sep``) and the forward slash are
    replaced, so the result contains no directory separators.
    """
    escaped = fp
    for separator in (os.sep, '/'):
        escaped = escaped.replace(separator, '%')
    return escaped

In [33]:
def extension_split(x):
    """Return path ``x`` with its final file extension removed.

    Uses ``os.path.splitext``, so a path without an extension is returned
    unchanged and dots inside directory names are not treated as the start
    of an extension.
    """
    return os.path.splitext(x)[0]

In [34]:
@cached
def try_to_get(Model, *args, **kwargs):
    '''
    Returns the Model instance matching the given properties if it exists;
    otherwise builds a new, unsaved instance with those properties.

    Arguments:
    Model - a class inherited from django models.Model
    *args, **kwargs - unnamed and named properties of the needed object;
                      they are passed both to Model.objects.get() and,
                      when nothing is found, to the Model constructor

    Note: the function is wrapped with @cached, so a repeated call with the
    same arguments returns None until try_to_get.cache_clear() is called.
    A newly built instance is not saved here (its pk is None); saving is
    left to the caller.
    '''
    try:
        # An existing row must be returned as well, not only a new instance.
        return Model.objects.get(*args, **kwargs)
    except Model.DoesNotExist:
        return Model(*args, **kwargs)

**TODO:** сделать нормальную проверку всех полей метаданных (пола, даты, оценки) на валидность.

In [37]:
class TextBaseFiller(object):
    """Loads ``.txt`` files from a folder into the corpus database.

    For every text file a ``Document`` row is created.  The text is run
    through the parser passed as ``model``, and the resulting tokens are
    stored as ``Lemma``, ``Token`` and ``Occurence`` rows.  Optionally,
    metadata is read from a ``.json`` file that shares the text file's name.

    Attributes:
    parser - callable applied to the file contents; the result is iterated
             and each item must expose ``text``, ``lemma_`` and ``pos_``
    folder - the folder given to the constructor; its length is used to cut
             the prefix off each file path when building document titles
    include_metadata - whether ``process_metadata`` runs for every document
    text_filenames - paths of all discovered ``.txt`` files
    current_text_id - index into ``text_filenames`` of the next file to process
    textobj - the ``Document`` being filled (set by ``process_next``)

    NOTE(review): the notebook outputs show ``Lemma.objects.get`` raising
    ``DoesNotExist`` / ``MultipleObjectsReturned`` for lemmas that differ only
    in diacritics or zero-width characters.  That presumably comes from the
    database collation and is not handled here - confirm against the DB setup.
    """

    def __init__(self, model=None, folder='.', recursive=True, include_metadata=False):
        """Collect the ``.txt`` files to import.

        Arguments:
        model - parser callable (see class docstring)
        folder - folder to scan
        recursive - scan sub-folders too (``os.walk``) instead of only the
                    top level (``os.listdir``)
        include_metadata - read the sibling ``.json`` metadata for every text
        """
        self.parser = model

        self.include_metadata = include_metadata
        self.folder = folder
        self.text_filenames = []
        if recursive:
            for root, dirs, files in os.walk(self.folder):
                for f in files:
                    if f.endswith('.txt'):
                        self.text_filenames.append(os.path.join(root, f))
        else:
            self.text_filenames = [os.path.join(self.folder, f)
                                   for f in os.listdir(self.folder) if f.endswith('.txt')]
        self.current_text_id = 0

    def process_all(self, show_titles=False, show_time=False):
        """Process every file that has not been processed yet.

        Arguments:
        show_titles - print each file path before processing it
        show_time - print the seconds elapsed since this call started
                    before processing each file
        """
        self._process_until(len(self.text_filenames), show_titles, show_time)

    def process_next_n(self, n, show_titles=False, show_time=False):
        """Process at most ``n`` further files (fewer if the list ends first).

        ``show_titles`` and ``show_time`` behave as in ``process_all``.
        """
        limit = min(self.current_text_id + n, len(self.text_filenames))
        self._process_until(limit, show_titles, show_time)

    def _process_until(self, limit, show_titles, show_time):
        """Call ``process_next`` until ``current_text_id`` reaches ``limit``."""
        st_time = time.time()
        while self.current_text_id < limit:
            if show_titles:
                print(self.text_filenames[self.current_text_id])
            if show_time:
                print(time.time() - st_time)
            self.process_next()

    def process_next(self):
        """Import the file at ``current_text_id`` and advance the index.

        Prints a message and returns when all files are already processed.
        All rows for one file are written inside a single transaction, so a
        failure in the middle leaves no half-imported document behind.  The
        index is advanced only after the transaction succeeds.
        """
        if self.current_text_id >= len(self.text_filenames):
            print('Specified folder is fully processed')
            return

        # Local import: only this method needs it.
        from django.db import transaction

        fn = self.text_filenames[self.current_text_id]

        # Read and parse first, so an unreadable file creates no Document row.
        with open(fn, 'r', encoding='utf-8-sig') as f:
            parsed = self.parser(f.read().replace('\ufeff', ''))

        title = escapepath(fn[len(self.folder):])

        # try_to_get returns None for arguments it has already seen; reset
        # that history so every document starts fresh.
        try_to_get.cache_clear()

        # Per-document memo of database lookups: each distinct lemma/token is
        # fetched once instead of once per occurrence.
        lemma_by_key = {}
        token_by_key = {}

        def lemma_for(tok):
            key = (tok.lemma_.lower(), tok.pos_.upper())
            if key not in lemma_by_key:
                lemma_by_key[key] = Lemma.objects.get(text=key[0], pos=key[1])
            return lemma_by_key[key]

        def token_for(tok):
            key = (tok.text.lower(), tok.lemma_.lower(), tok.pos_.upper())
            if key not in token_by_key:
                token_by_key[key] = Token.objects.get(text=key[0], lemma=lemma_for(tok))
            return token_by_key[key]

        with transaction.atomic():
            self.textobj = Document(title=title)
            if self.include_metadata:
                self.process_metadata()
            self.textobj.save()

            ##saving lemmas to db:
            lemmas = [try_to_get(Lemma, text=token.lemma_.lower(), pos=token.pos_.upper())
                      for token in parsed]
            # Only brand-new (unsaved) instances are inserted.
            Lemma.objects.bulk_create([lemma for lemma in lemmas
                                       if lemma is not None and lemma.pk is None])

            ##saving tokens to db:
            tokens = [try_to_get(Token, text=token.text.lower(), lemma=lemma_for(token))
                      for token in parsed]
            Token.objects.bulk_create([token for token in tokens
                                       if token is not None and token.pk is None])

            ##saving occurences to db:
            occurences = [Occurence(document=self.textobj, token=token_for(token), index=i)
                          for i, token in enumerate(parsed)]
            Occurence.objects.bulk_create(occurences)

        self.current_text_id += 1

    def process_metadata(self):
        """Copy valid metadata from the sibling ``.json`` file onto ``textobj``.

        The metadata file has the current text file's name with the extension
        replaced by ``.json``.  A missing file, invalid JSON, or JSON that is
        not an object is ignored.  Recognised keys: ``sex`` (``'m'``/``'f'``),
        ``date`` (``DD.MM.YYYY``), ``mark`` and ``study_year`` (convertible to
        int), ``department`` (copied as is).  Invalid values are skipped.
        """
        fn = extension_split(self.text_filenames[self.current_text_id]) + '.json'
        if not os.path.exists(fn):
            return

        # utf-8-sig also accepts files that start with a BOM, which plain
        # utf-8 would hand to json.load and make it fail.
        with open(fn, 'r', encoding='utf-8-sig') as inp:
            try:
                meta = json.load(inp)
            except ValueError:
                # Covers json.JSONDecodeError and UnicodeDecodeError.
                return
        if not isinstance(meta, dict):
            return

        if 'sex' in meta:
            if meta['sex'] in ('m', 'f'):
                self.textobj.sex = meta['sex']
        if 'date' in meta and isinstance(meta['date'], str):
            parsed_date = parse_rus_date(meta['date'])
            if parsed_date is not None:
                self.textobj.date = parsed_date
        if 'mark' in meta:
            if check_int(meta['mark']):
                self.textobj.mark = meta['mark']
        if 'study_year' in meta:
            # Validate the study year itself, not the mark.
            if check_int(meta['study_year']):
                self.textobj.study_year = meta['study_year']
        if 'department' in meta:
            self.textobj.department = meta['department']
            

In [49]:
# Corpus sub-folder to import, located under TEXT_FILE_PATH from stattool.settings.
new_path = os.path.join(TEXT_FILE_PATH, r'data/exam')

In [50]:
# The trailing '/' becomes part of `folder`, whose length TextBaseFiller uses
# to strip the folder prefix from each file path when building document titles.
base_filler = TextBaseFiller(model=nlp, folder=new_path+'/', include_metadata=True)

In [51]:
len(base_filler.text_filenames)

8429

In [65]:
base_filler.process_next_n(1000, show_titles=True)

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00011.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00012.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00013.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00014.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00015.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00016.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00017.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00018.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00019.txt
C:\HP_PC\h

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00219.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00220.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00221.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00222.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00223.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00224.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00225.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00226.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00227.txt
C:\HP_PC\h

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00368.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00369.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00370.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00371.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00372.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00373.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00374.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00375.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/2012-2014\esl_00376.txt
C:\HP_PC\h

DoesNotExist: Lemma matching query does not exist.

In [42]:
base_filler.current_text_id = 1340

In [43]:
base_filler.process_next_n(1000, show_titles=True)

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student29_revised.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student2_revised.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student30.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student30_revised.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student31.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student31_revised.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\2_exercise\student35.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20

C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student24_ex.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student24_review.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student25_ex.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student25_review.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student25_review_revised.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student27_ex.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_2019_2_10_16_7_20_6_41_0\data/Academic_Writing\3_exercise\student27_review.txt
C:\HP_PC\hp_pc\Studies\3rd-year-thesis\Data\realec_dump_201

MultipleObjectsReturned: get() returned more than one Lemma -- it returned 2!

In [48]:
# Wipe the imported data: all documents, then all tokens, then all lemmas.
# Only the result of the last delete() is shown as the cell output.
# NOTE(review): Occurence rows are not deleted explicitly - presumably they are
# removed by cascade from Document/Token; confirm in corpustool.models.
Document.objects.all().delete()
Token.objects.all().delete()
Lemma.objects.all().delete()

(15675, {'corpustool.Lemma': 15675})

In [44]:
base_filler.current_text_id

1448

In [1]:
import datetime

In [6]:
datetime.date.fromisoformat('2008-06-03').month

6

In [86]:
Lemma.objects.values()

<QuerySet [{'id': 3585, 'text': 'This', 'pos': 'DET'}, {'id': 3586, 'text': 'chart', 'pos': 'NOUN'}, {'id': 3587, 'text': 'show', 'pos': 'VERB'}, {'id': 3588, 'text': '-PRON-', 'pos': 'PRON'}, {'id': 3589, 'text': 'the', 'pos': 'DET'}, {'id': 3590, 'text': 'datum', 'pos': 'NOUN'}, {'id': 3591, 'text': 'about', 'pos': 'ADP'}, {'id': 3592, 'text': 'the', 'pos': 'DET'}, {'id': 3593, 'text': 'unemployment', 'pos': 'NOUN'}, {'id': 3594, 'text': 'in', 'pos': 'ADP'}, {'id': 3595, 'text': 'a', 'pos': 'DET'}, {'id': 3596, 'text': 'few', 'pos': 'ADJ'}, {'id': 3597, 'text': 'world', 'pos': 'NOUN'}, {'id': 3598, 'text': 'area', 'pos': 'NOUN'}, {'id': 3599, 'text': '.', 'pos': 'PUNCT'}, {'id': 3600, 'text': 'the', 'pos': 'DET'}, {'id': 3601, 'text': 'datum', 'pos': 'NOUN'}, {'id': 3602, 'text': 'be', 'pos': 'VERB'}, {'id': 3603, 'text': 'divide', 'pos': 'VERB'}, {'id': 3604, 'text': 'into', 'pos': 'ADP'}, '...(remaining elements truncated)...']>

In [91]:
d = Document(title='heh')

In [92]:
d.save()

In [96]:
a = Document.objects.get(title='heh')

In [97]:
b = Document.objects.get(title='heh')

In [98]:
a == b

True

In [103]:
hash(a)

141

In [104]:
hash(b)

141

In [106]:
hash('abc')

-1583079279397609165

In [107]:
# The notebook imports the functools module, not the bare name lru_cache,
# so reference it through the module to keep the cell runnable on a fresh kernel.
help(functools.lru_cache)

Help on function lru_cache in module functools:

lru_cache(maxsize=128, typed=False)
    Least-recently-used cache decorator.
    
    If *maxsize* is set to None, the LRU features are disabled and the cache
    can grow without bound.
    
    If *typed* is True, arguments of different types will be cached separately.
    For example, f(3.0) and f(3) will be treated as distinct calls with
    distinct results.
    
    Arguments to the cached function must be hashable.
    
    View the cache statistics named tuple (hits, misses, maxsize, currsize)
    with f.cache_info().  Clear the cache and statistics with f.cache_clear().
    Access the underlying function with f.__wrapped__.
    
    See:  http://en.wikipedia.org/wiki/Cache_algorithms#Least_Recently_Used



In [114]:
{'a':1, 'b':2} in [{'a':1, 'b':2}]

True

In [201]:
# Number of Lemma rows; counted by the database instead of loading every row.
Lemma.objects.count()

89

In [200]:
# Number of distinct (text, pos) pairs, de-duplicated on the Python side.
len({(row['text'], row['pos']) for row in Lemma.objects.values('text', 'pos')})

89

In [46]:
vals = Lemma.objects.values('text', 'pos')

In [47]:
# Report every (text, pos) pair for which get() does not find exactly one row,
# together with the rows the same filter actually matches.
for val in vals:
    try:
        Lemma.objects.get(**val)
    except (Lemma.DoesNotExist, Lemma.MultipleObjectsReturned):
        print(val, Lemma.objects.filter(**val))

{'text': '\u200b1995\u200b', 'pos': 'PROPN'} <QuerySet [<Lemma: Lemma object (26525)>, <Lemma: Lemma object (26544)>]>
{'text': '\u200b1995', 'pos': 'PROPN'} <QuerySet [<Lemma: Lemma object (26525)>, <Lemma: Lemma object (26544)>]>
{'text': 'tugen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30953)>, <Lemma: Lemma object (30954)>]>
{'text': 'tügen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30953)>, <Lemma: Lemma object (30954)>]>
{'text': 'gunnen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30956)>, <Lemma: Lemma object (30957)>]>
{'text': 'günnen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30956)>, <Lemma: Lemma object (30957)>]>
{'text': 'kunnen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30959)>, <Lemma: Lemma object (30960)>]>
{'text': 'künnen', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30959)>, <Lemma: Lemma object (30960)>]>
{'text': 'turren', 'pos': 'NOUN'} <QuerySet [<Lemma: Lemma object (30968)>, <Lemma: Lemma object (30969)>]>
{'text': 'türr

Пример запроса — сколько словоупотреблений в текстах девушек:

In [295]:
# count() runs COUNT(*) in the database; len() would fetch every row first.
Occurence.objects.filter(document__sex='f').count()

51749

Сколько словоупотреблений в текстах парней:

In [296]:
# count() runs COUNT(*) in the database; len() would fetch every row first.
Occurence.objects.filter(document__sex='m').count()

39698

Средняя длина текста у девушек:

In [299]:
# Mean number of occurrences per document; both counts are computed in the database.
Occurence.objects.filter(document__sex='f').count() / Document.objects.filter(sex='f').count()

265.37948717948717

У парней:

In [300]:
# Mean number of occurrences per document; both counts are computed in the database.
Occurence.objects.filter(document__sex='m').count() / Document.objects.filter(sex='m').count()

277.6083916083916

In [53]:
'\ufeffabsurdity'.strip()

'\ufeffabsurdity'

In [70]:
# All occurrences belonging to the document at position 3 of Document.objects.all().
# NOTE(review): no order_by() is applied, so which document sits at position 3
# presumably depends on the model's default ordering / the database - confirm.
Occurences = Occurence.objects.filter(document = Document.objects.all()[3])

Попробуем восстановить текст, чтобы проверить, что всё нормально:

In [72]:
# Rebuild the text in its original token order: sort by the stored position and
# fetch the related tokens in the same query instead of one query per row.
for occ in Occurences.select_related('token').order_by('index'):
    print(occ.token.text, end=' ')

  the course of time , the means of getting from one place to another   becom more and more sophisticated . two hundred years ago no one could even imagine that it would be possible to travel flying above the earth without any difficulty , but now it 's quite    to get on a plane and to fly to another city , country or even continent . so let 's imagine how the world of transport can improve in the future . 

 firstly , some people used to joke that the only way to solve the problem of traffic congestions in moscow is to    learn how to teleport or to fly . for me , the idea of teleportation sounds quite unreal ( but not    unreal ) , but flying from one place to another looks quite nice . as far as i know , some means of   flying have already been invented - for instance , there 're some rocket packs . they look like a quite large sack with small jet engines inside .    who uses this thing should put it on their back and , having switched it on , take off . the most advanced kind of s

In [73]:
Token.objects.filter(lemma=None)

<QuerySet []>

In [None]:
'\ufeffwhen'.encode('utf')