From ad2ec51e825a3a80e307a5c8dbc40aa9d0ef28b6 Mon Sep 17 00:00:00 2001
From: damien
Date: Sun, 15 May 2016 19:42:37 +0200
Subject: [PATCH] first take at #2 (search code base) via Elasticsearch

I mainly wonder whether I should do this in MariaDB/Postgres instead,
since we want precise search here and fuzzy searches are not that useful.

I was also hoping that using ES would make search feel as fast as Algolia,
but it is hard to tune.
---
 elastic.py                   | 45 +++++++++++++++++++++++++
 extstats/source_extractor.py | 65 ++++++++++++++++++++++++++++++++++++
 req.txt                      |  1 +
 3 files changed, 111 insertions(+)
 create mode 100644 elastic.py
 create mode 100644 extstats/source_extractor.py

diff --git a/elastic.py b/elastic.py
new file mode 100644
index 0000000..678d449
--- /dev/null
+++ b/elastic.py
@@ -0,0 +1,45 @@
+from datetime import datetime
+from elasticsearch_dsl import DocType, String, Date, Integer, Float
+from elasticsearch_dsl.connections import connections
+
+# Define a default Elasticsearch client
+connections.create_connection(hosts=['localhost'])
+
+class Extension(DocType):
+    name = String()
+    url = String()
+    description = String()
+    user_count = Integer()
+    review_count = Float()
+    review_score = Float()
+
+    class Meta:
+        index = 'exts'
+
+# create the mappings in elasticsearch
+Extension.init()
+
+import json
+exts = json.load(open('data/PAGES.json'))
+
+# TODO: extract_sources() is not written yet (source code extraction)
+
+# Rob's query: all extensions with a given permission in the manifest and a given regex in the source code
+# https://www.elastic.co/guide/en/elasticsearch/guide/current/nested-query.html
+
+for ext in exts:
+    print(ext['name'])
+    sources = extract_sources(ext['id'])
+    # create and save the document
+    doc = Extension(meta={'id': ext['ext_id']},
+                    name=ext['name'],
+                    sources=sources,
+                    url=ext['url'],
+                    review_count=ext['aggregateRating.properties.ratingCount'],
+                    review_score=ext['aggregateRating.properties.ratingValue'],
+                    description=ext['full_description'],
+                    user_count=int(ext['user_count']))
+    doc.save()
+
+# Display cluster health
+print(connections.get_connection().cluster.health())
\ No newline at end of file
diff --git a/extstats/source_extractor.py b/extstats/source_extractor.py
new file mode 100644
index 0000000..d3aa7ae
--- /dev/null
+++ b/extstats/source_extractor.py
@@ -0,0 +1,65 @@
+import os
+import json
+import shlex
+import shutil
+import subprocess
+from subprocess import STDOUT, check_output
+from distutils.version import LooseVersion
+from random import shuffle
+
+from .CONSTS import CRX_DIRECTORY
+
+# TODO: code quality is very low (chdir games, duplication)
+
+def attrget(item, key):
+    keys = key.split('.')
+    for key in keys:
+        if item:
+            item = item.get(key, '')
+    return item
+
+
+def sort_semverfiles(files):
+    def keyfunc(filename):
+        return LooseVersion(filename.replace('.zip', ''))
+    return sorted(files, key=keyfunc)
+
+
+FNULL = open(os.devnull, 'w')
+
+def get_latest_version(ext_id):
+    return sort_semverfiles(os.listdir(CRX_DIRECTORY + ext_id))[-1]
+
+def extract(ext_id):
+    crx_dir = CRX_DIRECTORY + ext_id
+    if os.path.exists(crx_dir):
+        tmp_dir = 'tmp/history/' + ext_id
+        if os.path.exists(tmp_dir):
+            return
+        shutil.copytree(crx_dir, tmp_dir)
+        os.chdir(tmp_dir)
+        files = os.listdir()
+        files = sort_semverfiles(files)
+        prevdir = None
+        for i, file in enumerate(files):
+            print('doing', file)
+            print('dtrx..')
+            check_output('dtrx {} 2>&1 > /dev/null'.format(shlex.quote(file)), timeout=60, shell=True)
+            dirname = file.replace('.zip', '')  # = version_name
+            os.chdir(dirname)  # security is crazy low: we cd into an untrusted archive
+            # os.system('grep -Ri -C 2 "API_KEY" .')
+            # os.system('ack-grep -R "UA\-[0-9]+\-[0-9]+" .')
-R "UA\-[0-9]+\-[0-9]+" .') + os.system('ack-grep -Ri "API_*KEY" .') + os.chdir('..') + prevdir = dirname + try: + shutil.rmtree(final_target_dir) + except: + pass + print(ext_id, 'done') + os.chdir('../../..') + #shutil.rmtree(tmp_dir) + else: + print('no crx, so saddddd', crx_dir) + print() + print() \ No newline at end of file diff --git a/req.txt b/req.txt index 4c26783..069951c 100644 --- a/req.txt +++ b/req.txt @@ -6,3 +6,4 @@ tqdm jinja2 arrow beautifulsoup4 +elasticsearch_dsl \ No newline at end of file