small edits

markomanninen · Apr 5, 2018 · b84dc26 · b84dc26
1 parent 235b9a4
commit b84dc26
Show file tree

Hide file tree

Showing 5 changed files with 57 additions and 66 deletions.
diff --git a/doc/appendix1.rst b/doc/appendix1.rst
@@ -2,17 +2,26 @@ Appendix 1 - Store database
 ===========================
 
 Minimum code to create a unique word database for the riddle solver. Download,
-preprocess, and store Greek corpora, then same word database.
+preprocess, and store Greek corpora, then save and retrieve the word database as
+a `DataFrame` object.
+
+.. code-block:: bash
+
+  pip install grcriddles
 
 .. code-block:: python
 
   # import download and preprocess function
-  from grcriddles import download_and_preprocess_corpora, save_database, get_database
+  from grcriddles import download_and_preprocess_corpora, save_database
   # call function to create Greek file directories and retrieve corpora data
   greek_corpora = download_and_preprocess_corpora()
-  # save word database
-  save_database(greek_corpora)
-  # retrieve word database
-  df = get_database()
+  # save and retrieve word database
+  df = save_database(greek_corpora)
   # how many records there are in the database?
   print("Total records: %s" % len(df))
+
+Output:
+
+.. code-block:: text
+
+  Total records: 1708
diff --git a/doc/appendix2.rst b/doc/appendix2.rst
@@ -1,7 +1,8 @@
 Appendix 2 - Solve riddles
 ==========================
 
-Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
+Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles. Get
+the word database and filter it by different columns.
 
 .. code-block:: bash
 
@@ -17,11 +18,11 @@ Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
   a = words[words['Isopsephy'] == 1697]
   a = a[a['Chars'] == 9]
   a = a[a['Mutes'] == 5]
-  a = a[a.apply(lambda x: len(x['Syllables'][0]) == 2 and \
-                          len(x['Syllables'][1]) == 2 and \
-                          len(x['Syllables'][2]) == 2, axis=1)]
-  # output words ordered alphabetically
-  a.sort_index()
+  a[a.apply(lambda x: len(x['Syllables'][0]) == 2 and \
+                      len(x['Syllables'][1]) == 2 and \
+                      len(x['Syllables'][2]) == 2, axis=1)]
+
+Output:
 
 .. code-block:: text
 
@@ -35,8 +36,9 @@ Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
 .. code-block:: python
 
   # get words containing ΑΜΦΕΚΑΛΥ stem word
-  b = words.filter(like="ΑΜΦΕΚΑΛΥ", axis=0)
-  b.sort_index()
+  words.filter(like="ΑΜΦΕΚΑΛΥ", axis=0)
+
+Output:
 
 .. code-block:: text
 

diff --git a/doc/appendix3.rst b/doc/appendix3.rst
@@ -3,8 +3,12 @@ Appendix 3 - Search results
 
 Minimum code to search words from the Greek corpora. `download_and_preprocess_corpora`
 should be run at least once in the working directory to make search
-functionality to work because it will create necessary Greek text directories
-from the original `Perseus` and `First1K` repositories.
+functionality work because it will create all necessary Greek text files and
+directories from the original `Perseus` and `First1K` repositories.
+
+.. code-block:: bash
+
+  pip install grcriddles
 
 .. code-block:: python
 
@@ -17,5 +21,19 @@ from the original `Perseus` and `First1K` repositories.
   from grcriddles import search_words_from_corpora, perseus_dir, first1k_dir
   search_words_from_corpora(["ΑΜΦΕΚΑΛΥΨ"], [perseus_dir, first1k_dir], None, True)
 
+Output:
+
+.. code-block:: text
+
+  ..
+
+.. code-block:: python
+
   # search partial match(es) for the word from both perseus and first1k corpora
   search_words_from_corpora(["ΑΜΦΕΚΑΛΥΨ"], [perseus_dir, first1k_dir], None, False)
+
+Output:
+
+.. code-block:: text
+
+  ..
diff --git a/functions.py b/functions.py
@@ -46,10 +46,6 @@
 # ϒ not needed?
 vowels = "ΩΗΥΕΙΟΑ"
 
-# roman letters, big and small
-roman_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-roman_letters += "ABCDEFGHIJKLMNOPQRSTUVWXYZ".lower()
-
 # main database entry point
 database = None
 
@@ -302,21 +298,19 @@ def download_and_preprocess_corpora():
     download_with_indicator(fs, perseus_zip_file)
     fs = "https://github.com/OpenGreekAndLatin/First1KGreek/archive/master.zip"
     download_with_indicator(fs, first1k_zip_file)
-
     # extract zip files
     print("Extracting zip files...")
     unzip(perseus_zip_file, perseus_zip_dir)
     unzip(first1k_zip_file, first1k_zip_dir)
-
     # copy greek text files from repository
     print("Copying Greek text files from repository...")
     for item in [[joinpaths(perseus_zip_dir, ["canonical-greekLit-master", "data"]), perseus_tmp_dir],
                  [joinpaths(first1k_zip_dir, ["First1KGreek-master", "data"]), first1k_tmp_dir]]:
         copy_corpora(*item)
-
-    # process files
+    # init files
     print("Initializing corpora...")
     greek_corpora_x = init_corpora([[perseus_tmp_dir, perseus_dir], [first1k_tmp_dir, first1k_dir]])
+    # process files
     print("Processing files...")
     return process_greek_corpora(greek_corpora_x)
 
@@ -355,6 +349,9 @@ def save_database(greek_corpora):
     df[8] = df[0].apply(lambda x: len(x)-sum(list(x.count(c) for c in vowels)))
     # save dataframe to CSV file
     df.to_csv(csv_file_name, header=False, index=False, encoding='utf-8')
+    # set global database variable
+    database = df
+    return get_database()
 
 # get word database
 def get_database(cols = None):
@@ -382,6 +379,7 @@ def get_database(cols = None):
         words.columns = list(cols.values())
         if 0 in cols:
             words.set_index(cols[0], inplace=True)
+            return words.sort_index()
         return words
     else:
         return database.copy()
@@ -395,23 +393,6 @@ def display_side_by_side(**kwargs):
                   .replace("<thead>", "<caption style='text-align:center'>%s</caption><thead>" % caption)
     display_html(html.replace('table', 'table style="display:inline"'), raw=True)
 
-# is there roman letters in data?
-def has_roman_letters(data):
-    a = {}
-    for x in roman_letters:
-        if x in data:
-            a[x] = data.count(x)
-    return a
-
-# number of vowels in text
-def nvowels(x, n):
-    word, tot = x[0], 0
-    for c in vowels:
-        tot += word.count(c)
-        if tot > n:
-            return False
-    return tot == n
-
 # find the string (s) from the search source text (t) and match with the original source text (u)
 # search and original source texts should have same character indices for keywords
 # based on that assumption the location of the matches +- threshold (l)

diff --git a/grcriddles/functions.py b/grcriddles/functions.py
@@ -46,10 +46,6 @@
 # ϒ not needed?
 vowels = "ΩΗΥΕΙΟΑ"
 
-# roman letters, big and small
-roman_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-roman_letters += "ABCDEFGHIJKLMNOPQRSTUVWXYZ".lower()
-
 # main database entry point
 database = None
 
@@ -302,21 +298,19 @@ def download_and_preprocess_corpora():
     download_with_indicator(fs, perseus_zip_file)
     fs = "https://github.com/OpenGreekAndLatin/First1KGreek/archive/master.zip"
     download_with_indicator(fs, first1k_zip_file)
-
     # extract zip files
     print("Extracting zip files...")
     unzip(perseus_zip_file, perseus_zip_dir)
     unzip(first1k_zip_file, first1k_zip_dir)
-
     # copy greek text files from repository
     print("Copying Greek text files from repository...")
     for item in [[joinpaths(perseus_zip_dir, ["canonical-greekLit-master", "data"]), perseus_tmp_dir],
                  [joinpaths(first1k_zip_dir, ["First1KGreek-master", "data"]), first1k_tmp_dir]]:
         copy_corpora(*item)
-
-    # process files
+    # init files
     print("Initializing corpora...")
     greek_corpora_x = init_corpora([[perseus_tmp_dir, perseus_dir], [first1k_tmp_dir, first1k_dir]])
+    # process files
     print("Processing files...")
     return process_greek_corpora(greek_corpora_x)
 
@@ -355,6 +349,9 @@ def save_database(greek_corpora):
     df[8] = df[0].apply(lambda x: len(x)-sum(list(x.count(c) for c in vowels)))
     # save dataframe to CSV file
     df.to_csv(csv_file_name, header=False, index=False, encoding='utf-8')
+    # set global database variable
+    database = df
+    return get_database()
 
 # get word database
 def get_database(cols = None):
@@ -382,6 +379,7 @@ def get_database(cols = None):
         words.columns = list(cols.values())
         if 0 in cols:
             words.set_index(cols[0], inplace=True)
+            return words.sort_index()
         return words
     else:
         return database.copy()
@@ -395,23 +393,6 @@ def display_side_by_side(**kwargs):
                   .replace("<thead>", "<caption style='text-align:center'>%s</caption><thead>" % caption)
     display_html(html.replace('table', 'table style="display:inline"'), raw=True)
 
-# is there roman letters in data?
-def has_roman_letters(data):
-    a = {}
-    for x in roman_letters:
-        if x in data:
-            a[x] = data.count(x)
-    return a
-
-# number of vowels in text
-def nvowels(x, n):
-    word, tot = x[0], 0
-    for c in vowels:
-        tot += word.count(c)
-        if tot > n:
-            return False
-    return tot == n
-
 # find the string (s) from the search source text (t) and match with the original source text (u)
 # search and original source texts should have same character indices for keywords
 # based on that assumption the location of the matches +- threshold (l)