Skip to content

Commit

Permalink
first take at #2 (search code base) via elastic search
Browse files Browse the repository at this point in the history
Now I wonder mainly if I should do it in mariadb/postgres instead
since we want precise search and fuzzy searches are not that useful
here

Also, I was hoping that using ES would make search efficient like
with Algolia but it's hard to tune the thing
  • Loading branch information
mdamien committed May 15, 2016
1 parent 89a4c2b commit ad2ec51
Show file tree
Hide file tree
Showing 3 changed files with 111 additions and 0 deletions.
45 changes: 45 additions & 0 deletions elastic.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from datetime import datetime
from elasticsearch_dsl import DocType, String, Date, Integer, Float
from elasticsearch_dsl.connections import connections

# Define a default Elasticsearch client for all DocType operations below.
# NOTE(review): assumes a cluster is reachable on localhost (default port) — confirm.
connections.create_connection(hosts=['localhost'])

class Extension(DocType):
    # Elasticsearch document mapping for one browser extension.
    # NOTE(review): the indexing loop later in this file also passes a
    # `sources` field that is not declared here — Elasticsearch will map
    # it dynamically; declare it explicitly if its mapping matters.
    name = String()
    url = String()
    description = String()
    user_count = Integer()
    review_count = Float()  # NOTE(review): a count stored as Float — Integer seems intended; confirm
    review_score = Float()

    class Meta:
        # Name of the Elasticsearch index the documents are stored in.
        index = 'exts'

# create the mappings in elasticsearch (idempotent; pushes the Extension
# field definitions above to the 'exts' index)
Extension.init()

import json
# Scraped extension metadata; presumably a list of dicts keyed like the
# fields used in the loop below — confirm schema against the scraper.
exts = json.load(open('data/PAGES.json'))

# TODO source code extract

# rob query: all ext with this permission in manifest and this regex in source code
# https://www.elastic.co/guide/en/elasticsearch/guide/current/nested-query.html

# Index every scraped extension into Elasticsearch.
# NOTE(review): `extract_sources` is not defined or imported in this file —
# confirm which module provides it.
for ext in exts:
    print(ext['name'])
    sources = extract_sources(ext['id'])
    # Fix: the original rebound the loop variable `ext` to the new document
    # (`ext = Extension(...)`), shadowing the source dict. It only worked
    # because the keyword arguments are evaluated before the rebinding;
    # a distinct name removes the trap.
    # NOTE(review): document id uses ext['ext_id'] while extract_sources got
    # ext['id'] — confirm these are intentionally different keys.
    doc = Extension(meta={'id': ext['ext_id']},
                    name=ext['name'],
                    sources=sources,
                    url=ext['url'],
                    review_count=ext['aggregateRating.properties.ratingCount'],
                    review_score=ext['aggregateRating.properties.ratingValue'],
                    description=ext['full_description'],
                    user_count=int(ext['user_count']))
    doc.save()

# Display cluster health
print(connections.get_connection().cluster.health())
65 changes: 65 additions & 0 deletions extstats/source_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
import os
import json
import shlex
import shutil
import subprocess
from subprocess import STDOUT, check_output
from distutils.version import LooseVersion
from random import shuffle

from .CONSTS import CRX_DIRECTORY

# TODO code quality is veryyy low (chdir, duplication)

def attrget(item, key):
    """Fetch a value along a dotted key path, e.g. 'a.b' -> item['a']['b'].

    A falsy value at any step (including the initial `item`) stops further
    lookups and is returned as-is; a missing key yields ''.
    """
    current = item
    for part in key.split('.'):
        if not current:
            continue
        current = current.get(part, '')
    return current


def sort_semverfiles(files):
    """Return zip filenames sorted by ascending version number.

    Each name is expected to look like '<version>.zip' (e.g. '1.10.2.zip');
    the '.zip' suffix is stripped and the remainder compared component-wise
    so that '1.10' sorts after '1.2'.

    Fix: the original keyed on distutils.version.LooseVersion, which is
    deprecated (PEP 632) and gone entirely in Python 3.12. For the purely
    numeric versions used here, a tuple of integer components orders
    identically; non-numeric components fall back to string comparison.
    """
    def keyfunc(filename):
        version = filename.replace('.zip', '')
        # (0, int) tags numeric parts, (1, str) tags the rest, so mixed
        # components compare without raising TypeError.
        return [(0, int(part)) if part.isdigit() else (1, part)
                for part in version.split('.')]
    return sorted(files, key=keyfunc)


FNULL = open(os.devnull, 'w')

def get_latest_version(ext_id):
    """Return the highest-versioned .zip filename archived for `ext_id`.

    Fix: the original returned `filename`, a name that was never bound, so
    every call raised NameError. The evident intent — judging by the
    version-sorted directory listing used in extract() — is the newest
    archive in CRX_DIRECTORY/<ext_id>. Returns None when the extension has
    no stored archives or no crx directory.
    """
    version_dir = CRX_DIRECTORY + ext_id
    if not os.path.exists(version_dir):
        return None
    files = sort_semverfiles(os.listdir(version_dir))
    return files[-1] if files else None

def extract(ext_id):
    """Unpack every archived version of an extension and grep its sources.

    Copies CRX_DIRECTORY/<ext_id> into tmp/history/<ext_id>, extracts each
    versioned .zip with `dtrx` (oldest first), and runs ack-grep over each
    extracted tree looking for API-key-like strings. Purely side-effecting:
    prints matches to stdout, returns nothing.
    """
    crx_dir = CRX_DIRECTORY+ext_id
    if os.path.exists(crx_dir):
        tmp_dir = 'tmp/history/'+ext_id
        # Skip extensions already extracted by a previous run.
        if os.path.exists(tmp_dir):
            return
        shutil.copytree(crx_dir, tmp_dir)
        # NOTE(review): os.chdir mutates process-global state — unsafe if
        # anything else in the process depends on the cwd.
        os.chdir(tmp_dir)
        files = os.listdir()
        # Process versions oldest -> newest.
        files = sort_semverfiles(files)
        prevdir = None
        for i, file in enumerate(files):
            print('doing', file)
            print('dtrx..')
            # shell=True, but shlex.quote guards against hostile filenames.
            check_output('dtrx {} 2>&1 > /dev/null'.format(shlex.quote(file)), timeout=60, shell=True)
            dirname = file.replace('.zip', '') # = version_name
            os.chdir(dirname) # security is crazy low
            # os.system('grep -Ri -C 2 "API_KEY" .')
            # os.system('ack-grep -R "UA\-[0-9]+\-[0-9]+" .')
            os.system('ack-grep -Ri "API_*KEY" .')
            os.chdir('..')
            prevdir = dirname
            try:
                # BUG(review): `final_target_dir` is never defined, so this
                # always raises NameError, which the bare except silently
                # swallows — the cleanup (presumably of `prevdir`) never runs.
                shutil.rmtree(final_target_dir)
            except:
                pass
        print(ext_id, 'done')
        # Undo the chdir into tmp/history/<ext_id> (three levels deep).
        os.chdir('../../..')
        #shutil.rmtree(tmp_dir)
    else:
        print('no crx, so saddddd', crx_dir)
    print()
    print()
1 change: 1 addition & 0 deletions req.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ tqdm
jinja2
arrow
beautifulsoup4
elasticsearch_dsl

0 comments on commit ad2ec51

Please sign in to comment.