Skip to content

Commit

Permalink
small edits
Browse files Browse the repository at this point in the history
  • Loading branch information
markomanninen committed Apr 5, 2018
1 parent 235b9a4 commit b84dc26
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 66 deletions.
21 changes: 15 additions & 6 deletions doc/appendix1.rst
Expand Up @@ -2,17 +2,26 @@ Appendix 1 - Store database
===========================

Minimum code to create a unique word database for the riddle solver. Download,
preprocess, and store Greek corpora, then same word database.
preprocess, and store Greek corpora, then save and retrieve word database as
a `DataFrame` object.

.. code-block:: bash
pip install grcriddles
.. code-block:: python
# import download and preprocess function
from grcriddles import download_and_preprocess_corpora, save_database, get_database
from grcriddles import download_and_preprocess_corpora, save_database
# call function to create Greek file directories and retrieve corpora data
greek_corpora = download_and_preprocess_corpora()
# save word database
save_database(greek_corpora)
# retrieve word database
df = get_database()
# save and retrieve word database
df = save_database(greek_corpora)
# how many records are there in the database?
print("Total records: %s" % len(df))
Output:

.. code-block:: text
Total records: 1708
18 changes: 10 additions & 8 deletions doc/appendix2.rst
@@ -1,7 +1,8 @@
Appendix 2 - Solve riddles
==========================

Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles. Get
word database and filter by different columns.

.. code-block:: bash
Expand All @@ -17,11 +18,11 @@ Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
a = words[words['Isopsephy'] == 1697]
a = a[a['Chars'] == 9]
a = a[a['Mutes'] == 5]
a = a[a.apply(lambda x: len(x['Syllables'][0]) == 2 and \
len(x['Syllables'][1]) == 2 and \
len(x['Syllables'][2]) == 2, axis=1)]
# output words ordered alphabetically
a.sort_index()
a[a.apply(lambda x: len(x['Syllables'][0]) == 2 and \
len(x['Syllables'][1]) == 2 and \
len(x['Syllables'][2]) == 2, axis=1)]
Output:

.. code-block:: text
Expand All @@ -35,8 +36,9 @@ Minimum code to solve isopsephical riddles in the Pseudo-Sibylline oracles.
.. code-block:: python
# get words containing ΑΜΦΕΚΑΛΥ stem word
b = words.filter(like="ΑΜΦΕΚΑΛΥ", axis=0)
b.sort_index()
words.filter(like="ΑΜΦΕΚΑΛΥ", axis=0)
Output:

.. code-block:: text
Expand Down
22 changes: 20 additions & 2 deletions doc/appendix3.rst
Expand Up @@ -3,8 +3,12 @@ Appendix 3 - Search results

Minimum code to search words from the Greek corpora. `download_and_preprocess_corpora`
should be run at least once in the working directory to make search
functionality to work because it will create necessary Greek text directories
from the original `Perseus` and `First1K` repositories.
functionality work because it will create all necessary Greek text files and
directories from the original `Perseus` and `First1K` repositories.

.. code-block:: bash
pip install grcriddles
.. code-block:: python
Expand All @@ -17,5 +21,19 @@ from the original `Perseus` and `First1K` repositories.
from grcriddles import search_words_from_corpora, perseus_dir, first1k_dir
search_words_from_corpora(["ΑΜΦΕΚΑΛΥΨ"], [perseus_dir, first1k_dir], None, True)
Output:

.. code-block:: text
..
.. code-block:: python
# search partial match(es) for the word from both perseus and first1k corpora
search_words_from_corpora(["ΑΜΦΕΚΑΛΥΨ"], [perseus_dir, first1k_dir], None, False)
Output:

.. code-block:: text
..
31 changes: 6 additions & 25 deletions functions.py
Expand Up @@ -46,10 +46,6 @@
# ϒ not needed?
vowels = "ΩΗΥΕΙΟΑ"

# roman letters, big and small
roman_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
roman_letters += "ABCDEFGHIJKLMNOPQRSTUVWXYZ".lower()

# main database entry point
database = None

Expand Down Expand Up @@ -302,21 +298,19 @@ def download_and_preprocess_corpora():
download_with_indicator(fs, perseus_zip_file)
fs = "https://github.com/OpenGreekAndLatin/First1KGreek/archive/master.zip"
download_with_indicator(fs, first1k_zip_file)

# extract zip files
print("Extracting zip files...")
unzip(perseus_zip_file, perseus_zip_dir)
unzip(first1k_zip_file, first1k_zip_dir)

# copy greek text files from repository
print("Copying Greek text files from repository...")
for item in [[joinpaths(perseus_zip_dir, ["canonical-greekLit-master", "data"]), perseus_tmp_dir],
[joinpaths(first1k_zip_dir, ["First1KGreek-master", "data"]), first1k_tmp_dir]]:
copy_corpora(*item)

# process files
# init files
print("Initializing corpora...")
greek_corpora_x = init_corpora([[perseus_tmp_dir, perseus_dir], [first1k_tmp_dir, first1k_dir]])
# process files
print("Processing files...")
return process_greek_corpora(greek_corpora_x)

Expand Down Expand Up @@ -355,6 +349,9 @@ def save_database(greek_corpora):
df[8] = df[0].apply(lambda x: len(x)-sum(list(x.count(c) for c in vowels)))
# save dataframe to CSV file
df.to_csv(csv_file_name, header=False, index=False, encoding='utf-8')
# set global database variable
database = df
return get_database()

# get word database
def get_database(cols = None):
Expand Down Expand Up @@ -382,6 +379,7 @@ def get_database(cols = None):
words.columns = list(cols.values())
if 0 in cols:
words.set_index(cols[0], inplace=True)
return words.sort_index()
return words
else:
return database.copy()
Expand All @@ -395,23 +393,6 @@ def display_side_by_side(**kwargs):
.replace("<thead>", "<caption style='text-align:center'>%s</caption><thead>" % caption)
display_html(html.replace('table', 'table style="display:inline"'), raw=True)

# are there roman letters in data?
def has_roman_letters(data):
    """Count the roman letters that occur in ``data``.

    Every character of the module-level ``roman_letters`` string that is
    found in ``data`` becomes a key of the returned dict, mapped to
    ``data.count`` of that character. The dict is empty when none of the
    characters occur.
    """
    return {
        letter: data.count(letter)
        for letter in roman_letters
        if letter in data
    }

# number of vowels in text
def nvowels(x, n):
    """Tell whether the word stored in ``x[0]`` has exactly ``n`` vowels.

    Occurrences of each character of the module-level ``vowels`` string are
    summed; counting stops as soon as the running sum exceeds ``n``.
    """
    word = x[0]
    running = 0
    for vowel in vowels:
        running += word.count(vowel)
        if running > n:
            # already too many vowels, no need to count the rest
            break
    return running == n

# find the string (s) from the search source text (t) and match with the original source text (u)
# search and original source texts should have same character indices for keywords
# based on that assumption the location of the matches +- threshold (l)
Expand Down
31 changes: 6 additions & 25 deletions grcriddles/functions.py
Expand Up @@ -46,10 +46,6 @@
# ϒ not needed?
vowels = "ΩΗΥΕΙΟΑ"

# roman letters, big and small
roman_letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
roman_letters += "ABCDEFGHIJKLMNOPQRSTUVWXYZ".lower()

# main database entry point
database = None

Expand Down Expand Up @@ -302,21 +298,19 @@ def download_and_preprocess_corpora():
download_with_indicator(fs, perseus_zip_file)
fs = "https://github.com/OpenGreekAndLatin/First1KGreek/archive/master.zip"
download_with_indicator(fs, first1k_zip_file)

# extract zip files
print("Extracting zip files...")
unzip(perseus_zip_file, perseus_zip_dir)
unzip(first1k_zip_file, first1k_zip_dir)

# copy greek text files from repository
print("Copying Greek text files from repository...")
for item in [[joinpaths(perseus_zip_dir, ["canonical-greekLit-master", "data"]), perseus_tmp_dir],
[joinpaths(first1k_zip_dir, ["First1KGreek-master", "data"]), first1k_tmp_dir]]:
copy_corpora(*item)

# process files
# init files
print("Initializing corpora...")
greek_corpora_x = init_corpora([[perseus_tmp_dir, perseus_dir], [first1k_tmp_dir, first1k_dir]])
# process files
print("Processing files...")
return process_greek_corpora(greek_corpora_x)

Expand Down Expand Up @@ -355,6 +349,9 @@ def save_database(greek_corpora):
df[8] = df[0].apply(lambda x: len(x)-sum(list(x.count(c) for c in vowels)))
# save dataframe to CSV file
df.to_csv(csv_file_name, header=False, index=False, encoding='utf-8')
# set global database variable
database = df
return get_database()

# get word database
def get_database(cols = None):
Expand Down Expand Up @@ -382,6 +379,7 @@ def get_database(cols = None):
words.columns = list(cols.values())
if 0 in cols:
words.set_index(cols[0], inplace=True)
return words.sort_index()
return words
else:
return database.copy()
Expand All @@ -395,23 +393,6 @@ def display_side_by_side(**kwargs):
.replace("<thead>", "<caption style='text-align:center'>%s</caption><thead>" % caption)
display_html(html.replace('table', 'table style="display:inline"'), raw=True)

# are there roman letters in data?
def has_roman_letters(data):
    """Count the roman letters that occur in ``data``.

    Every character of the module-level ``roman_letters`` string that is
    found in ``data`` becomes a key of the returned dict, mapped to
    ``data.count`` of that character. The dict is empty when none of the
    characters occur.
    """
    return {
        letter: data.count(letter)
        for letter in roman_letters
        if letter in data
    }

# number of vowels in text
def nvowels(x, n):
    """Tell whether the word stored in ``x[0]`` has exactly ``n`` vowels.

    Occurrences of each character of the module-level ``vowels`` string are
    summed; counting stops as soon as the running sum exceeds ``n``.
    """
    word = x[0]
    running = 0
    for vowel in vowels:
        running += word.count(vowel)
        if running > n:
            # already too many vowels, no need to count the rest
            break
    return running == n

# find the string (s) from the search source text (t) and match with the original source text (u)
# search and original source texts should have same character indices for keywords
# based on that assumption the location of the matches +- threshold (l)
Expand Down

0 comments on commit b84dc26

Please sign in to comment.