Skip to content

Commit

Permalink
Merge branch 'master' into mq13
Browse files Browse the repository at this point in the history
  • Loading branch information
sveinbjornt committed Oct 27, 2020
2 parents 6a548c3 + c93b670 commit 82521ac
Show file tree
Hide file tree
Showing 3 changed files with 105 additions and 1 deletion.
13 changes: 13 additions & 0 deletions queries/special.py
Expand Up @@ -448,7 +448,9 @@ def _play_film(qs: str, q: Query) -> AnswerType:
"veistu nokkuð": {"answer": "Ég veit eitt og annað. Spurðu mig!"},
"veistu svarið": {"answer": "Spurðu mig!"},
"veistu ekki neitt": {"answer": "Ég veit nú eitt og annað. Spurðu mig!"},
"veistu bara ekki neitt": {"answer": "Ég veit nú eitt og annað. Spurðu mig!"},
"veistu ekkert": {"answer": "Ég veit nú eitt og annað. Spurðu mig!"},
"veistu bara ekkert": {"answer": "Ég veit nú eitt og annað. Spurðu mig!"},
"hver er flottastur": {"answer": "Teymið hjá Miðeind."},
"hverjir eru flottastir": {"answer": "Teymið hjá Miðeind."},
"hver eru flottust": {"answer": "Teymið hjá Miðeind."},
Expand Down Expand Up @@ -1070,6 +1072,10 @@ def _play_film(qs: str, q: Query) -> AnswerType:
"tilgangur heimsins": _MEANING_OF_LIFE,
"hver er tilgangur heimsins": _MEANING_OF_LIFE,
"tilgangur lífsins": _MEANING_OF_LIFE,
"af hverju er ég til": _MEANING_OF_LIFE,
"af hverju er ég eiginlega til": _MEANING_OF_LIFE,
"af hverju erum við til": _MEANING_OF_LIFE,
"af hverju erum við eiginlega til": _MEANING_OF_LIFE,
"hver er tilgangurinn": _MEANING_OF_LIFE,
"hver er tilgangur lífsins": _MEANING_OF_LIFE,
"hvað er tilgangur lífsins": _MEANING_OF_LIFE,
Expand All @@ -1084,6 +1090,7 @@ def _play_film(qs: str, q: Query) -> AnswerType:
"hvert er leyndarmál lífsins": _MEANING_OF_LIFE,
"hver er sannleikurinn": _MEANING_OF_LIFE,
"hvað er 42": {"answer": "Sex sinnum sjö"}, # :)
"hvað meinarðu með 42": {"answer": "Sex sinnum sjö"},
# What is best in life? https://www.youtube.com/watch?v=Oo9buo9Mtos
"hvað er best": {"answer": "Að horfa á kvikmynd um villimanninn Kónan."},
"hvað er best í lífinu": {"answer": "Að horfa á kvikmynd um villimanninn Kónan."},
Expand Down Expand Up @@ -1151,6 +1158,9 @@ def _play_film(qs: str, q: Query) -> AnswerType:
"hvar ertu stödd": _LOC_ANSWER,
"ertu til": _LOC_ANSWER,
"í hverju ertu": _LOC_ANSWER,
"hvar er best að búa": {
"answer": "Í stafrænu skýjunum, eins og ég."
},
# Name explained
"hvers vegna heitir þú embla": _NAME_EXPL,
"hvers vegna heitirðu embla": _NAME_EXPL,
Expand Down Expand Up @@ -1786,6 +1796,9 @@ def _play_film(qs: str, q: Query) -> AnswerType:
"hver er sveinbjörn þórðarson": {
"answer": "Sveinbjörn Þórðarson er hugbúnaðarsmiður. Hann átti þátt í að skapa mig."
},
"hver forritar emblu": {
"answer": "Aðallega Sveinbjörn Þórðarson."
}
}


Expand Down
2 changes: 1 addition & 1 deletion tools/dbclean.py
Expand Up @@ -40,7 +40,7 @@
basepath = basepath[0 : -len(_TOOLS)]
sys.path.append(basepath)

from settings import Settings
from settings import Settings, ConfigError
# from article import Article
from db import SessionContext
from db.models import Article as ArticleModel
Expand Down
91 changes: 91 additions & 0 deletions tools/wmt.py
@@ -0,0 +1,91 @@
#!/usr/bin/env python
"""
Greynir: Natural language processing for Icelandic
Copyright (C) 2020 Miðeind ehf.
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see http://www.gnu.org/licenses/.
This utility extracts the text of all articles on a given day, with
associated metadata such as URL, title, timestamp, etc.
"""

import os
import sys
import json
from datetime import datetime


# Hack to make this Python program executable from the tools subdirectory:
# resolve the directory holding this script, drop a trailing "tools"
# path component from it if there is one, and add the resulting
# directory to sys.path.
basepath, _ = os.path.split(os.path.realpath(__file__))
_TOOLS = os.sep + "tools"
if basepath.endswith(_TOOLS):
basepath = basepath[0 : -len(_TOOLS)]
sys.path.append(basepath)


from settings import Settings, ConfigError
from db import SessionContext
from db.models import Article
from tokenizer import correct_spaces

def main():
    """Print all articles from a fixed one-day window as JSON on stdout.

    Reads the GreynirSimple configuration file, selects the articles whose
    timestamp lies strictly between 2020-07-26 00:00:01 and
    2020-07-27 00:00:01 (ordered by timestamp), rebuilds each article's
    text from its stored token JSON and prints a JSON array of objects
    with the keys "url", "timestamp", "title" and "text".

    Articles with no stored tokens (NULL, empty string or an empty list)
    are skipped.

    Exits with status 1 if the configuration file cannot be read.
    """
    try:
        # Read configuration file
        Settings.read(os.path.join(basepath, "config", "GreynirSimple.conf"))
    except ConfigError as e:
        # Report on stderr so that stdout only ever carries the JSON output,
        # and signal the failure to the caller through the exit status
        print("Configuration error: {0}".format(e), file=sys.stderr)
        sys.exit(1)

    with SessionContext(commit=False) as session:
        bef = datetime(2020, 7, 26, 0, 0, 1)
        aft = datetime(2020, 7, 27, 0, 0, 1)
        q = (
            session.query(
                Article.url, Article.timestamp, Article.heading, Article.tokens
            )
            .filter(Article.timestamp > bef)
            .filter(Article.timestamp < aft)
            .order_by(Article.timestamp)
        )
        items = []
        for url, ts, title, raw_tokens in q.all():
            # The tokens column may be empty for articles that have not
            # been tokenized; json.loads(None) would raise a TypeError,
            # so check before decoding
            if not raw_tokens:
                continue
            paragraphs = json.loads(raw_tokens)
            if not paragraphs:
                continue
            # Token JSON is nested as paragraphs -> sentences -> tokens,
            # with the original text of each token stored under "x"
            text = " ".join(
                tok["x"]
                for paragraph in paragraphs
                for sentence in paragraph
                for tok in sentence
            )
            items.append(
                dict(
                    url=url,
                    timestamp=ts.isoformat(),
                    title=title,
                    text=correct_spaces(text),
                )
            )

        print(json.dumps(items, ensure_ascii=False, sort_keys=True, indent=4))


# Run the export only when this file is executed as a script
if __name__ == "__main__":
main()

0 comments on commit 82521ac

Please sign in to comment.