Skip to content

Commit

Permalink
first take at #2 (search code base) via elastic search
Browse files Browse the repository at this point in the history
Now I wonder mainly if I should do it in mariadb/postgres instead
since we want precise search and fuzzy searches are not that useful
here

Also, I was hoping that using ES would make search efficient like
with Algolia but it's hard to tune the thing
  • Loading branch information
mdamien committed May 15, 2016
1 parent 89a4c2b commit ad2ec51
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
45 changes: 45 additions & 0 deletions elastic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from datetime import datetime
from elasticsearch_dsl import DocType, String, Date, Integer, Float
from elasticsearch_dsl.connections import connections

# Define a default Elasticsearch client for all DocType operations below.
# NOTE(review): assumes a cluster is reachable on localhost (default port) — confirm.
connections.create_connection(hosts=['localhost'])

class Extension(DocType):
    # Elasticsearch document mapping for one browser extension.
    # NOTE(review): the indexing loop later in this file also passes a
    # `sources` field that is not declared here — Elasticsearch will map
    # it dynamically; declare it explicitly if its mapping matters.
    name = String()
    url = String()
    description = String()
    user_count = Integer()
    review_count = Float()  # NOTE(review): a count stored as Float — Integer seems intended; confirm
    review_score = Float()

    class Meta:
        # Name of the Elasticsearch index the documents are stored in.
        index = 'exts'

# create the mappings in elasticsearch (idempotent; pushes the Extension
# field definitions above to the 'exts' index)
Extension.init()

import json
# Scraped extension metadata; presumably a list of dicts keyed like the
# fields used in the loop below — confirm schema against the scraper.
exts = json.load(open('data/PAGES.json'))

# TODO source code extract

# rob query: all ext with this permission in manifest and this regex in source code
# https://www.elastic.co/guide/en/elasticsearch/guide/current/nested-query.html

# Index every scraped extension into Elasticsearch.
# NOTE(review): `extract_sources` is not defined or imported in this file —
# confirm which module provides it.
for ext in exts:
    print(ext['name'])
    sources = extract_sources(ext['id'])
    # Fix: the original rebound the loop variable `ext` to the new document
    # (`ext = Extension(...)`), shadowing the source dict. It only worked
    # because the keyword arguments are evaluated before the rebinding;
    # a distinct name removes the trap.
    # NOTE(review): document id uses ext['ext_id'] while extract_sources got
    # ext['id'] — confirm these are intentionally different keys.
    doc = Extension(meta={'id': ext['ext_id']},
                    name=ext['name'],
                    sources=sources,
                    url=ext['url'],
                    review_count=ext['aggregateRating.properties.ratingCount'],
                    review_score=ext['aggregateRating.properties.ratingValue'],
                    description=ext['full_description'],
                    user_count=int(ext['user_count']))
    doc.save()

# Display cluster health
print(connections.get_connection().cluster.health())
65 changes: 65 additions & 0 deletions extstats/source_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import json
import shlex
import shutil
import subprocess
from subprocess import STDOUT, check_output
from distutils.version import LooseVersion
from random import shuffle

from .CONSTS import CRX_DIRECTORY

# TODO code quality is veryyy low (chdir, duplication)

def attrget(item, key):
    """Fetch a value along a dotted key path, e.g. 'a.b' -> item['a']['b'].

    A falsy value at any step (including the initial `item`) stops further
    lookups and is returned as-is; a missing key yields ''.
    """
    current = item
    for part in key.split('.'):
        if not current:
            continue
        current = current.get(part, '')
    return current


def sort_semverfiles(files):
    """Return zip filenames sorted by ascending version number.

    Each name is expected to look like '<version>.zip' (e.g. '1.10.2.zip');
    the '.zip' suffix is stripped and the remainder compared component-wise
    so that '1.10' sorts after '1.2'.

    Fix: the original keyed on distutils.version.LooseVersion, which is
    deprecated (PEP 632) and gone entirely in Python 3.12. For the purely
    numeric versions used here, a tuple of integer components orders
    identically; non-numeric components fall back to string comparison.
    """
    def keyfunc(filename):
        version = filename.replace('.zip', '')
        # (0, int) tags numeric parts, (1, str) tags the rest, so mixed
        # components compare without raising TypeError.
        return [(0, int(part)) if part.isdigit() else (1, part)
                for part in version.split('.')]
    return sorted(files, key=keyfunc)


FNULL = open(os.devnull, 'w')

def get_latest_version(ext_id):
    """Return the highest-versioned .zip filename archived for `ext_id`.

    Fix: the original returned `filename`, a name that was never bound, so
    every call raised NameError. The evident intent — judging by the
    version-sorted directory listing used in extract() — is the newest
    archive in CRX_DIRECTORY/<ext_id>. Returns None when the extension has
    no stored archives or no crx directory.
    """
    version_dir = CRX_DIRECTORY + ext_id
    if not os.path.exists(version_dir):
        return None
    files = sort_semverfiles(os.listdir(version_dir))
    return files[-1] if files else None

def extract(ext_id):
    """Unpack every archived version of an extension and grep its sources.

    Copies CRX_DIRECTORY/<ext_id> into tmp/history/<ext_id>, extracts each
    versioned .zip with `dtrx` (oldest first), and runs ack-grep over each
    extracted tree looking for API-key-like strings. Purely side-effecting:
    prints matches to stdout, returns nothing.
    """
    crx_dir = CRX_DIRECTORY+ext_id
    if os.path.exists(crx_dir):
        tmp_dir = 'tmp/history/'+ext_id
        # Skip extensions already extracted by a previous run.
        if os.path.exists(tmp_dir):
            return
        shutil.copytree(crx_dir, tmp_dir)
        # NOTE(review): os.chdir mutates process-global state — unsafe if
        # anything else in the process depends on the cwd.
        os.chdir(tmp_dir)
        files = os.listdir()
        # Process versions oldest -> newest.
        files = sort_semverfiles(files)
        prevdir = None
        for i, file in enumerate(files):
            print('doing', file)
            print('dtrx..')
            # shell=True, but shlex.quote guards against hostile filenames.
            check_output('dtrx {} 2>&1 > /dev/null'.format(shlex.quote(file)), timeout=60, shell=True)
            dirname = file.replace('.zip', '') # = version_name
            os.chdir(dirname) # security is crazy low
            # os.system('grep -Ri -C 2 "API_KEY" .')
            # os.system('ack-grep -R "UA\-[0-9]+\-[0-9]+" .')
            os.system('ack-grep -Ri "API_*KEY" .')
            os.chdir('..')
            prevdir = dirname
            try:
                # BUG(review): `final_target_dir` is never defined, so this
                # always raises NameError, which the bare except silently
                # swallows — the cleanup (presumably of `prevdir`) never runs.
                shutil.rmtree(final_target_dir)
            except:
                pass
        print(ext_id, 'done')
        # Undo the chdir into tmp/history/<ext_id> (three levels deep).
        os.chdir('../../..')
        #shutil.rmtree(tmp_dir)
    else:
        print('no crx, so saddddd', crx_dir)
    print()
    print()
1 change: 1 addition & 0 deletions req.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ tqdm
jinja2
arrow
beautifulsoup4
elasticsearch_dsl

0 comments on commit ad2ec51

Please sign in to comment.