-
Notifications
You must be signed in to change notification settings - Fork 70
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
first take at #2 (search code base) via elastic search
Now I mainly wonder whether I should do it in MariaDB/Postgres instead, since we want precise search and fuzzy searches are not that useful here. Also, I was hoping that using ES would make search efficient, like with Algolia, but it's hard to tune.
- Loading branch information
Showing
3 changed files
with
111 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
from datetime import datetime | ||
from elasticsearch_dsl import DocType, String, Date, Integer, Float | ||
from elasticsearch_dsl.connections import connections | ||
|
||
# Define a default Elasticsearch client; this runs at import time and targets localhost.
connections.create_connection(hosts=['localhost'])
|
||
class Extension(DocType):
    """Elasticsearch document for one extension, stored in the ``exts`` index."""

    name = String()
    url = String()
    description = String()
    user_count = Integer()
    # NOTE(review): declared as Float although the name suggests a count - confirm.
    review_count = Float()
    review_score = Float()
    # NOTE(review): the indexing loop in this file also passes `sources=`, which
    # has no declared field here - confirm the intended mapping.

    class Meta:
        # Name of the Elasticsearch index these documents are stored in.
        index = 'exts'
|
||
# Create the mappings in Elasticsearch.
Extension.init()

import json

# Load the extension records; the context manager closes the file as soon as
# parsing is done, and the encoding is pinned so the platform default is not used.
with open('data/PAGES.json', encoding='utf-8') as pages_file:
    exts = json.load(pages_file)

# TODO source code extract
# NOTE(review): `extract_sources` is neither defined nor imported in this file,
# so the loop below raises NameError until the TODO above is done.

# rob query: all ext with this permission in manifest and this regex in source code
# https://www.elastic.co/guide/en/elasticsearch/guide/current/nested-query.html

for ext in exts:
    print(ext['name'])
    sources = extract_sources(ext['id'])
    # Build and save one document per record, under a separate name so the
    # source record is not rebound.
    # NOTE(review): the record is read through both 'id' and 'ext_id' - confirm
    # that both keys exist in data/PAGES.json.
    doc = Extension(
        meta={'id': ext['ext_id']},
        name=ext['name'],
        sources=sources,
        url=ext['url'],
        review_count=ext['aggregateRating.properties.ratingCount'],
        review_score=ext['aggregateRating.properties.ratingValue'],
        description=ext['full_description'],
        user_count=int(ext['user_count']),
    )
    doc.save()

# Display cluster health
print(connections.get_connection().cluster.health())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
import os | ||
import json | ||
import shlex | ||
import shutil | ||
import subprocess | ||
from subprocess import STDOUT, check_output | ||
from distutils.version import LooseVersion | ||
from random import shuffle | ||
|
||
from .CONSTS import CRX_DIRECTORY | ||
|
||
# TODO code quality is veryyy low (chdir, duplication) | ||
|
||
def attrget(item, key):
    """Look up a dotted path such as ``'a.b.c'`` in nested mappings.

    Each dot-separated segment of *key* is fetched with ``.get(segment, '')``
    from the value reached so far. As soon as that value is falsy the walk
    stops and the falsy value is returned unchanged.
    """
    current = item
    for segment in key.split('.'):
        if not current:
            # Nothing left to descend into; later segments cannot change it.
            break
        current = current.get(segment, '')
    return current
|
||
|
||
def sort_semverfiles(files):
    """Return *files* as a new list ordered by version number.

    Every occurrence of ``'.zip'`` is removed from a name and the remainder is
    compared as a ``LooseVersion``, so ``'1.10.zip'`` sorts after ``'1.9.zip'``.
    """
    return sorted(files, key=lambda name: LooseVersion(name.replace('.zip', '')))
|
||
|
||
# Write-only handle on the null device, opened at import time and never closed
# in this file. NOTE(review): no code in this file references FNULL.
FNULL = open(os.devnull, 'w')
|
||
def get_latest_version(ext_id):
    """Return the file name of the highest-versioned archive for *ext_id*.

    Archives are looked up in ``CRX_DIRECTORY + ext_id`` and ordered with
    ``sort_semverfiles``. Returns ``None`` when that directory does not exist
    or contains no entries.
    """
    crx_dir = CRX_DIRECTORY + ext_id
    if not os.path.isdir(crx_dir):
        return None
    versions = sort_semverfiles(os.listdir(crx_dir))
    return versions[-1] if versions else None
|
||
def extract(ext_id):
    """Unpack every stored archive of *ext_id* and search it for API keys.

    The archives in ``CRX_DIRECTORY + ext_id`` are copied to
    ``tmp/history/<ext_id>``, unpacked one by one with ``dtrx`` in the order
    given by ``sort_semverfiles``, and each unpacked tree is searched with
    ``ack-grep`` for ``API_*KEY``. The ``ack-grep`` output is not captured.
    Nothing is done when the working copy already exists, and the working
    copy is left on disk afterwards.

    The process working directory is changed while archives are handled and is
    always restored before returning, including when ``dtrx`` fails.

    Raises:
        subprocess.CalledProcessError: ``dtrx`` exited with a non-zero status.
        subprocess.TimeoutExpired: ``dtrx`` ran for more than 60 seconds.
    """
    crx_dir = CRX_DIRECTORY + ext_id
    if os.path.exists(crx_dir):
        tmp_dir = 'tmp/history/' + ext_id
        if os.path.exists(tmp_dir):
            return
        shutil.copytree(crx_dir, tmp_dir)
        # Remember where we started so the directory can be restored exactly,
        # instead of assuming tmp_dir is three levels deep.
        start_dir = os.getcwd()
        try:
            os.chdir(tmp_dir)
            for file in sort_semverfiles(os.listdir()):
                print('doing', file)
                print('dtrx..')
                check_output(
                    'dtrx {} 2>&1 > /dev/null'.format(shlex.quote(file)),
                    timeout=60, shell=True)
                dirname = file.replace('.zip', '')  # = version_name
                # NOTE(review): this enters a directory produced from archive
                # contents; the original author flagged security as very low.
                os.chdir(dirname)
                # Alternative searches kept for reference:
                #   grep -Ri -C 2 "API_KEY" .
                #   ack-grep -R "UA\-[0-9]+\-[0-9]+" .
                os.system('ack-grep -Ri "API_*KEY" .')
                os.chdir('..')
            print(ext_id, 'done')
        finally:
            os.chdir(start_dir)
        # Cleanup of tmp_dir is intentionally disabled: shutil.rmtree(tmp_dir)
    else:
        print('no crx, so saddddd', crx_dir)
    # Blank separator lines between extensions.
    print()
    print()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ tqdm | |
jinja2 | ||
arrow | ||
beautifulsoup4 | ||
elasticsearch_dsl |