From dee124f878a78c6961722f03a5d8593943780c0d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:16:58 +0200 Subject: [PATCH 1/8] Keep a reference on the content in the blob. The blob is mostly a pointer to a char array. We must keep a reference on the `bytes` to be sure that the data is not gc while we are using it. (Assuming the python ZimBlob is not gc either) --- libzim/libzim.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index 6361ac92..c1a5f999 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -17,15 +17,14 @@ from collections import defaultdict cdef class ZimBlob: cdef Blob* c_blob - - def __init__(self, content): + cdef bytes ref_content + def __cinit__(self, content): if isinstance(content, str): - ref_content = content.encode('UTF-8') + self.ref_content = content.encode('UTF-8') else: - ref_content = content - - self.c_blob = new Blob( ref_content, len(ref_content)) + self.ref_content = content + self.c_blob = new Blob( self.ref_content, len(self.ref_content)) def __dealloc__(self): if self.c_blob != NULL: From 14f3ecf14469fcbe7a67d61151442413702c339f Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:21:41 +0200 Subject: [PATCH 2/8] Compute the ZimBlob only once. If we already have a ZimBlob we don't need to get the data a second time. Reuse the one we have. More that a performance point, this is needed to avoid to loose the reference to the existing blob (and so gc internal data). --- libzim/libzim.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index c1a5f999..61c66fa8 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -67,7 +67,8 @@ cdef class ZimArticle: raise NotImplementedError def _get_data(self): - self.blob = self.get_data() + if self.blob is None: + self.blob = self.get_data() return self.blob def get_data(self): From 16ca69dc3b6a0acf501b73a2b1347f12723dc4aa Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:27:19 +0200 Subject: [PATCH 3/8] =?UTF-8?q?Release=20and=20acquire=20the=20GIL=C2=A0as?= =?UTF-8?q?=20needed.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `*_cy_call_fct` will be called by the C++ code in a different thread. To avoid race condition, we need to acquire the GIL before doing python operation. However, those function will also be called from the same thread (in addArticle). To avoid deadlock we need to release the GIL when we enter C++ function. --- libzim/libzim.pxd | 6 +++--- libzim/libzim.pyx | 14 ++++++++------ 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libzim/libzim.pxd b/libzim/libzim.pxd index abaaef7a..e9ef8c14 100644 --- a/libzim/libzim.pxd +++ b/libzim/libzim.pxd @@ -39,8 +39,8 @@ cdef extern from "lib.h": cdef cppclass ZimCreatorWrapper: @staticmethod - ZimCreatorWrapper *create(string fileName, string mainPage, string fullTextIndexLanguage, int minChunkSize) except + - void addArticle(shared_ptr[ZimArticleWrapper] article) except + - void finalize() except + + ZimCreatorWrapper *create(string fileName, string mainPage, string fullTextIndexLanguage, int minChunkSize) nogil except + + void addArticle(shared_ptr[ZimArticleWrapper] article) nogil except + + void finalize() nogil except + Url getMainUrl() except + void setMainUrl(string) except + diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index 61c66fa8..767466f8 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -94,13 +94,13 @@ cdef get_article_method_from_object_ptr(void *ptr, string method, int *error): #------- ZimArticle pure virtual methods -------- cdef public api: - string string_cy_call_fct(void *ptr, string method, int *error): + string string_cy_call_fct(void *ptr, string method, int *error) with gil: """Lookup and execute a pure virtual method on ZimArticle returning a string""" func = get_article_method_from_object_ptr(ptr, method, error) ret_str = func() return ret_str.encode('UTF-8') - Blob blob_cy_call_fct(void *ptr, string method, int *error): + Blob blob_cy_call_fct(void *ptr, string method, int *error) with gil: """Lookup and execute a pure virtual method on ZimArticle returning a Blob""" cdef ZimBlob blob @@ -108,12 +108,12 @@ cdef public api: blob = func() return dereference(blob.c_blob) - bool bool_cy_call_fct(void *ptr, string method, int *error): + bool bool_cy_call_fct(void *ptr, string method, int *error) with gil: """Lookup and execute a pure virtual method on ZimArticle returning a bool""" func = get_article_method_from_object_ptr(ptr, method, error) return func() - uint64_t int_cy_call_fct(void *ptr, string method, int *error): + uint64_t int_cy_call_fct(void *ptr, string method, int *error) with gil: """Lookup and execute a pure virtual method on ZimArticle returning an int""" func = get_article_method_from_object_ptr(ptr, method, error) return func() @@ -313,7 +313,8 @@ cdef class ZimCreator: # Make a shared pointer to ZimArticleWrapper from the ZimArticle object (dereference internal c_article) cdef shared_ptr[ZimArticleWrapper] art = make_shared[ZimArticleWrapper](dereference(article.c_article)); try: - self.c_creator.addArticle(art) + with nogil: + self.c_creator.addArticle(art) except: raise else: @@ -337,7 +338,8 @@ cdef class ZimCreator: raise RuntimeError("ZimCreator already finalized") self.write_metadata(self._get_metadata()) - self.c_creator.finalize() + with nogil: + self.c_creator.finalize() self._finalized = True def __repr__(self): From 218d541839af3c9d392248b0992b5627cf9fda0c Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:30:52 +0200 Subject: [PATCH 4/8] ZimArticle must not contain its own wrapper. This ZimArticleWrapper that contains ZimArticle. We create the wrapper "on the fly" when we need it. The wrapper will increment the ref to ZimArticle to be sure that ZimArticle is alive as long as the wrapper is. Even if the ZimArticle is not referenced anymore in the python world. --- libzim/libzim.pyx | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index 767466f8..f54246d7 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -36,12 +36,8 @@ cdef class ZimBlob: ######################### cdef class ZimArticle: - cdef ZimArticleWrapper* c_article cdef ZimBlob blob - def __init__(self): - self.c_article = new ZimArticleWrapper(self) - def get_url(self): raise NotImplementedError @@ -74,9 +70,6 @@ cdef class ZimArticle: def get_data(self): raise NotImplementedError - @property - def mimetype(self): - return self.c_article.getMimeType().decode('UTF-8') #------ Helper for pure virtual methods -------- @@ -293,7 +286,7 @@ cdef class ZimCreator: # default dict update self._article_counter[article.get_mime_type().strip()] += 1 - def add_article(self, ZimArticle article not None): + def add_article(self, article not None): """Add a ZimArticle to the Creator object. Parameters @@ -311,7 +304,8 @@ cdef class ZimCreator: raise RuntimeError("ZimCreator already finalized") # Make a shared pointer to ZimArticleWrapper from the ZimArticle object (dereference internal c_article) - cdef shared_ptr[ZimArticleWrapper] art = make_shared[ZimArticleWrapper](dereference(article.c_article)); + cdef shared_ptr[ZimArticleWrapper] art = shared_ptr[ZimArticleWrapper]( + new ZimArticleWrapper(article)); try: with nogil: self.c_creator.addArticle(art) From aaa01a49b988464c9bb8614daa35bcb72366a66d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:37:39 +0200 Subject: [PATCH 5/8] =?UTF-8?q?Acquire=20the=20GIL=C2=A0before=20decrement?= =?UTF-8?q?ing=20the=20article=20ref.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the ZimArticle is not referenced in the python world after the call to add_article[1], the only reference to the zimArticle is the one the wrapper has. The wrapper is "stored" in a std::shared_ptr. When libzim ends using it, it will decrement the reference in the std::shared_ptr, and if it comes to 0, it will deleted the Wrapper, which in turn, will decrement the ref to the article, and potentially makes the ZimArticle to be deleted. However, everything is done in a different thread, so we must protect the decrement by acquiring the GIL first. [1] As this : ``` for i in range(N): article = ZimArticle(...) zimCreator.add_article(article) ``` --- libzim/lib.cxx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libzim/lib.cxx b/libzim/lib.cxx index 5903833e..3c7194d4 100644 --- a/libzim/lib.cxx +++ b/libzim/lib.cxx @@ -29,7 +29,10 @@ ZimArticleWrapper::ZimArticleWrapper(PyObject *obj) : m_obj(obj) ZimArticleWrapper::~ZimArticleWrapper() { + PyGILState_STATE gstate; + gstate = PyGILState_Ensure(); Py_XDECREF(this->m_obj); + PyGILState_Release(gstate); } std::string ZimArticleWrapper::callCythonReturnString(std::string methodName) const From 72f804e65d91b800a8b6113297cc490e04e8b204 Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:39:09 +0200 Subject: [PATCH 6/8] Delete the c ZimCreator when we dealloc the python ZimCreator. --- libzim/libzim.pyx | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index f54246d7..a097327b 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -224,6 +224,8 @@ cdef class ZimCreator: self._article_counter = defaultdict(int) self.update_metadata(date=datetime.date.today(), language= index_language) + def __dealloc__(self): + del self.c_creator @property def filename(self): From 32ba902aa38fc9c638aceb8bd73ffa1ccddb4f4d Mon Sep 17 00:00:00 2001 From: Matthieu Gautier Date: Wed, 22 Apr 2020 20:41:27 +0200 Subject: [PATCH 7/8] Use a empty bytes instead of None as default metadata. None cannot be use to create a ZimBlob. --- libzim/libzim.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index a097327b..bfa18470 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -219,7 +219,7 @@ cdef class ZimCreator: self._main_page = self.c_creator.getMainUrl().getLongUrl().decode("UTF-8", "strict") self._index_language = index_language self._min_chunk_size = min_chunk_size - self._metadata = {k:None for k in MANDATORY_METADATA_KEYS} + self._metadata = {k:b"" for k in MANDATORY_METADATA_KEYS} self._article_counter = defaultdict(int) self.update_metadata(date=datetime.date.today(), language= index_language) From e15448ef2ffae1ef1e888c56fac9a07bdb16caf0 Mon Sep 17 00:00:00 2001 From: Juan Diego Caballero Date: Thu, 23 Apr 2020 00:04:56 -0500 Subject: [PATCH 8/8] Acquire the GIL at the helper function --- libzim/libzim.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libzim/libzim.pyx b/libzim/libzim.pyx index bfa18470..7cf47e75 100644 --- a/libzim/libzim.pyx +++ b/libzim/libzim.pyx @@ -73,7 +73,7 @@ cdef class ZimArticle: #------ Helper for pure virtual methods -------- -cdef get_article_method_from_object_ptr(void *ptr, string method, int *error): +cdef get_article_method_from_object_ptr(void *ptr, string method, int *error) with gil: cdef ZimArticle art = (ptr) try: func = getattr(art, method.decode('UTF-8'))