# Название проекта по Базам Данных

Требования к нереляционным БД

2 - нереляционная БД + 

2 - красивая структура БД + 

2 - интерфейс позволяет класть+, доставать+, удалять данные (проводить операции CRUD - Create, Read, Update, Delete) + 

2 - два действия помимо CRUD (сортировка, группировка, агрегация, ...)

2 - зависит от БД: Redis - использование ключей, хешей, ...; Neo4j - нахождение путей не только к соседним вершинам, операции с графами; MongoDB, ElasticSearch - работа с текстами или географией; остальные БД - будем договариваться.

In [2]:
from elasticsearch import Elasticsearch
import json
from tqdm import tqdm

In [None]:
# https://www.elastic.co/downloads/elasticsearch

В базе данных хранятся статьи из датасета Cornell Newsroom Summarization (https://summari.es/), который используется в задаче саммаризации. Для каждой статьи мы храним URL, ссылку на архивную копию, заголовок, дату, полный текст и саммари текста.

# Структура

In [11]:
def clean_tqdm():
    """Call ``tqdm._decr_instances`` on every bar currently registered in tqdm.

    The contents of ``tqdm._instances`` are copied into a list before the
    loop starts, presumably because ``_decr_instances`` modifies that
    collection while it runs.
    """
    registered_bars = list(tqdm._instances)
    for bar in registered_bars:
        tqdm._decr_instances(bar)

In [12]:
# Elasticsearch client created with the library's default connection settings
# (no hosts are passed) - presumably a locally running node; confirm for your setup.
es = Elasticsearch()

In [13]:
# Class that stores the information about one article.
class Article:
    """Plain record describing a single article.

    Every field starts as an empty string and is expected to be filled in by
    the caller after construction.
    """

    def __init__(self):
        self.url = ""
        self.archive = ""
        self.title = ""
        self.date = ""
        self.text = ""
        self.summary = ""

    # Conversion to JSON.
    def toJSON(self):
        """Return the article serialized as a JSON object string.

        The keys appear in the order url, archive, title, date, text, summary.
        Serialization goes through ``json.dumps`` so that quotes, backslashes
        and newlines inside the field values are escaped and the result is
        always valid JSON. Non-ASCII characters are kept as-is.
        """
        # separators=(", ", ":") keeps the layout '{"url":"...", "archive":"..."}'.
        return json.dumps(self.toDict(), ensure_ascii=False, separators=(", ", ":"))

    # Conversion to a dictionary.
    def toDict(self):
        """Return the article fields as a new dictionary keyed by field name."""
        res = {
            "url": self.url,
            "archive": self.archive,
            "title": self.title,
            "date": self.date,
            "text": self.text,
            "summary": self.summary,
        }
        return res

In [14]:
def getArticles(json_file, N): # e.g. test_stats.jsonl
    """
    Read up to N articles from the beginning of a JSON Lines file.

    json_file: path to a file holding one JSON object per line. Each object
        must provide the keys 'url', 'archive', 'title', 'date', 'text' and
        'summary'.
    N: number of leading lines to read. ``None`` or a negative value is
        applied as a list slice bound (``lines[:N]``), as before.
    return: list of Article objects with all fields filled in. Lines that
        contain only whitespace are skipped.

    Raises KeyError if a record lacks one of the expected keys and
    json.JSONDecodeError if a line is not valid JSON.
    """
    from itertools import islice

    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(json_file, 'r', encoding='utf-8') as lines_file:
        if N is None or N < 0:
            # Slice semantics for these bounds need the whole file.
            json_list = list(lines_file)[:N]
        else:
            # Stop after N lines instead of loading the entire file.
            json_list = list(islice(lines_file, N))

    articles = []
    for json_str in json_list:
        if not json_str.strip():
            continue
        result = json.loads(json_str)
        a = Article()
        a.url = result['url']
        a.archive = result['archive']
        a.title = result['title']
        a.date = result['date']
        a.text = result['text']
        a.summary = result['summary']
        articles.append(a)
    return articles

## Создаем индекс и маппинг к нему

In [47]:
# es.indices.delete("cornell")

{'acknowledged': True}

In [48]:
# Create the "cornell" index; its field mapping is applied in a separate call.
es.indices.create(index="cornell")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'cornell'}

In [49]:
# Mapping for the "article" type: every field is stored as text, and the
# title, text and summary fields additionally use the "english" analyzer.
mapit = {
    "article": {
        "properties": {
            "url": {"type": "text"},
            "archive": {"type": "text"},
            "title": {"type": "text", "analyzer": "english"},
            "date": {"type": "text"},
            "text": {"type": "text", "analyzer": "english"},
            "summary": {"type": "text", "analyzer": "english"},
        }
    }
}

# Attach the mapping to the "cornell" index.
es.indices.put_mapping(
    index="cornell",
    doc_type='article',
    body=mapit,
    include_type_name=True,
)

{'acknowledged': True}

## Create

In [50]:
def add_items(N, file="test-stats.jsonl", index="cornell", es=es):
    """
    Load up to N articles from ``file`` and index them into ``index``.

    N: number of articles requested from the file.
    file: path to the JSON Lines file handed to getArticles.
    index: name of the target index.
    es: client object whose ``index`` method is called once per article.
    return: status string stating how many articles were actually indexed,
        which is lower than N when the file yields fewer articles.

    Each article is indexed under an id equal to its position in the loaded
    list (0, 1, 2, ...).
    """
    articles = getArticles(file, N=N)
    # Wrapping the list (not the enumerate object) lets tqdm show a total.
    for ind, article in enumerate(tqdm(articles)):
        es.index(index=index, id=ind, doc_type='article', body=article.toDict())
    return "Added {} articles successfully".format(len(articles))

In [51]:
%%time 

# clean_tqdm()
# Index the first 10 articles, using add_items() defaults for file and index.
add_items(10)

10it [00:00, 61.72it/s]


Wall time: 4.39 s


'Added 10 articles successfully'

## Read

In [52]:
def read_items(index="cornell", body=None, es=es):
    """
    Run a search against ``index`` and return the matching hits.

    index: name of the index to search.
    body: dict with the search request; when omitted (None) a ``match_all``
        query is sent.
    es: client object whose ``search`` method is called.
    return: list taken from ``["hits"]["hits"]`` of the search response.

    NOTE: the number of hits is limited by the search request's ``size``
    (Elasticsearch defaults to 10), so the default call does not necessarily
    return every document in the index.
    """
    # Build the default query per call instead of sharing one mutable dict
    # between all calls through a default argument.
    if body is None:
        body = {"query": {"match_all": {}}}
    res = es.search(index=index, body=body)
    return res["hits"]["hits"]

In [53]:
# Look at the first hit returned by read_items() with its default query.
read_items()[0]

{'_index': 'cornell',
 '_type': 'article',
 '_id': '0',
 '_score': 1.0,
 '_source': {'url': 'http://www.nydailynews.com/archives/news/1995/10/14/1995-10-14_selena_s_last_cries___shot_s.html',
  'archive': 'http://web.archive.org/web/20090428161725id_/http://www.nydailynews.com:80/archives/news/1995/10/14/1995-10-14_selena_s_last_cries___shot_s.html',
  'title': "SELENA'S LAST CRIES SHOT SINGER BEGGED HELP, NAMED SUSPECT",
  'date': '20090428161725',
  'text': 'By MATT SCHWARTZ in Houston and WENDELL JAMIESON in New York Daily News Writers\n\nSaturday, October 14th 1995, 4:22AM\n\nBleeding from a massive chest wound, Tejano star Selena cried, "Help me! Help me! I\'ve been shot!" and then named her killer with her dying breath.\n\nShaken witnesses yesterday told a spellbound Houston courtroom how the blood-covered, mortally wounded 23-year-old Hispanic singing sensation burst into the lobby of the Corpus Christi Days Inn last March 31.\n\nGasping for breath, Selena told motel workers tha

In [54]:
# Fetch the articles whose title matches "Mexican".
read_items(body={"query": {"match": {"title": "Mexican"}}})

[{'_index': 'cornell',
  '_type': 'article',
  '_id': '5',
  '_score': 2.5153728,
  '_source': {'url': 'http://www.economist.com/blogs/americasview/2012/06/mexico-election-diary',
   'archive': 'http://web.archive.org/web/20120604112028id_/http://www.economist.com/blogs/americasview/2012/06/mexico-election-diary',
   'title': 'What’s on Mexican minds?',
   'date': '20120604112028',
   'text': 'Dear TW: There are serious numerical anomalies in the debate you are moderating.You may want to check it out.\n\nI will state no opinion on who should or should not, will or will not become President of Mexico in four weeks´ time.\n\nI will nevertheless tell you that there is ample evidence to point out that the vote in this debate has been tampered with.\n\nI am sorry and certainly I believe the political operative who quite possibly has been behind this vote-rigging has to be at least privately reprimanded by his party;The Economist´s staff may want to do a quick investigation with the ample me

## Update

In [55]:
# Update the document with id 0, sending a new value for its "title" field.
es.update(index='cornell',doc_type='article',id=0,
                body={"doc": {"title": "I love dogs"}})

{'_index': 'cornell',
 '_type': 'article',
 '_id': '0',
 '_version': 2,
 'result': 'updated',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 10,
 '_primary_term': 1}

In [57]:
def update_item(id, body, index="cornell", doc_type="article", es=es):
    """
    Update a single document identified by its id.

    id: identifier of the document to update.
    body: update request body passed through to the client unchanged.
    index: name of the index that holds the document.
    doc_type: document type passed to the client.
    es: client object whose ``update`` method is called.
    return: whatever the client's ``update`` call returns.
    """
    request = dict(index=index, doc_type=doc_type, id=id, body=body)
    response = es.update(**request)
    return response

In [58]:
# Send the same title update for document 0 through the update_item() wrapper.
update_item(0, body={"doc": {"title": "I love dogs"}})

{'_index': 'cornell',
 '_type': 'article',
 '_id': '0',
 '_version': 2,
 'result': 'noop',
 '_shards': {'total': 0, 'successful': 0, 'failed': 0},
 '_seq_no': 10,
 '_primary_term': 1}

In [60]:
# Fetch the document with id 0 to inspect its current contents.
es.get(index="cornell", id=0)

{'_index': 'cornell',
 '_type': '_doc',
 '_id': '0',
 '_version': 2,
 '_seq_no': 10,
 '_primary_term': 1,
 'found': True,
 '_source': {'url': 'http://www.nydailynews.com/archives/news/1995/10/14/1995-10-14_selena_s_last_cries___shot_s.html',
  'archive': 'http://web.archive.org/web/20090428161725id_/http://www.nydailynews.com:80/archives/news/1995/10/14/1995-10-14_selena_s_last_cries___shot_s.html',
  'title': 'I love dogs',
  'date': '20090428161725',
  'text': 'By MATT SCHWARTZ in Houston and WENDELL JAMIESON in New York Daily News Writers\n\nSaturday, October 14th 1995, 4:22AM\n\nBleeding from a massive chest wound, Tejano star Selena cried, "Help me! Help me! I\'ve been shot!" and then named her killer with her dying breath.\n\nShaken witnesses yesterday told a spellbound Houston courtroom how the blood-covered, mortally wounded 23-year-old Hispanic singing sensation burst into the lobby of the Corpus Christi Days Inn last March 31.\n\nGasping for breath, Selena told motel workers 

## Delete

In [64]:
def delete_item(id, doc_type="article", index="cornell",es=es):
    """
    Delete a single document identified by its id.

    id: identifier of the document to delete.
    doc_type: document type passed to the client.
    index: name of the index that holds the document.
    es: client object whose ``delete`` method is called.
    return: whatever the client's ``delete`` call returns.
    """
    request = dict(index=index, doc_type=doc_type, id=id)
    response = es.delete(**request)
    return response

In [65]:
# Delete the document with id 0, using delete_item() defaults for type and index.
delete_item(0)

{'_index': 'cornell',
 '_type': 'article',
 '_id': '0',
 '_version': 3,
 'result': 'deleted',
 '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 '_seq_no': 11,
 '_primary_term': 1}

In [66]:
# Request the document with id 0 again, now that it has been deleted.
es.get(index="cornell", id=0)

NotFoundError: NotFoundError(404, '{"_index":"cornell","_type":"_doc","_id":"0","found":false}')

Теперь в БД нет документа с таким id. 

In [None]:
# Class that stores the information about one generated text.
class GeneratedArticle:
    """Plain record describing a single generated text.

    Every field starts as an empty string and is expected to be filled in by
    the caller after construction.
    """

    def __init__(self):
        self.text = ""
        self.model = ""
        self.sampling_type = ""
        self.parameter = ""

    # Conversion to JSON.
    def toJSON(self):
        """Return the record serialized as a JSON object string.

        The keys appear in the order text, model, sampling_type, parameter.
        Serialization goes through ``json.dumps`` so that quotes, backslashes
        and newlines inside the field values are escaped and the result is
        always valid JSON. Non-ASCII characters are kept as-is.
        """
        # separators=(", ", ":") keeps the layout '{"text":"...", "model":"..."}'.
        return json.dumps(self.toDict(), ensure_ascii=False, separators=(", ", ":"))

    # Conversion to a dictionary.
    def toDict(self):
        """Return the record fields as a new dictionary keyed by field name."""
        res = {
            "text": self.text,
            "model": self.model,
            "sampling_type": self.sampling_type,
            "parameter": self.parameter,
        }
        return res