Support customizing default values in the Run Spider page (fix #55)
my8100 committed Jun 22, 2019
1 parent 9a161c8 commit 30b39b7
Showing 8 changed files with 118 additions and 19 deletions.
31 changes: 31 additions & 0 deletions scrapydweb/default_settings.py
@@ -119,6 +119,37 @@
JOBS_SNAPSHOT_INTERVAL = 300


############################## Run Spider #####################################
# The default is False, set it to True to automatically
# expand the 'settings & arguments' section in the Run Spider page.
SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = False

# The default is 'Mozilla/5.0', set it to a non-empty string to customize the default value of `custom`
# in the drop-down list of `USER_AGENT`.
SCHEDULE_CUSTOM_USER_AGENT = 'Mozilla/5.0'

# The default is None, set it to one of ['custom', 'Chrome', 'iPhone', 'iPad', 'Android']
# to customize the default value of `USER_AGENT`.
SCHEDULE_USER_AGENT = None

# The default is None, set it to True or False to customize the default value of `ROBOTSTXT_OBEY`.
SCHEDULE_ROBOTSTXT_OBEY = None

# The default is None, set it to True or False to customize the default value of `COOKIES_ENABLED`.
SCHEDULE_COOKIES_ENABLED = None

# The default is None, set it to a positive integer to customize the default value of `CONCURRENT_REQUESTS`.
SCHEDULE_CONCURRENT_REQUESTS = None

# The default is None, set it to a non-negative number to customize the default value of `DOWNLOAD_DELAY`.
SCHEDULE_DOWNLOAD_DELAY = None

# The default is "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1",
# set it to '' or any other string to customize the default value of `additional`.
# Use '\r\n' as the line separator.
SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"


############################## Page Display ###################################
# The default is True, set it to False to hide the Items page, as well as
# the Items column in the Jobs page.
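
All of these are plain module-level constants, so overriding them works like any other ScrapydWeb option. A minimal sketch of a user override, assuming the settings are placed in your local ScrapydWeb settings file (the User-Agent string and values below are made up for illustration):

# Hypothetical overrides in a local ScrapydWeb settings file.
SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = True  # expand 'settings & arguments' by default

# Preselect a custom User-Agent in the USER_AGENT drop-down list.
SCHEDULE_CUSTOM_USER_AGENT = 'MyBot/1.0 (+https://example.com/bot)'
SCHEDULE_USER_AGENT = 'custom'

# Conservative crawling defaults for the Run Spider form.
SCHEDULE_ROBOTSTXT_OBEY = True
SCHEDULE_COOKIES_ENABLED = False
SCHEDULE_CONCURRENT_REQUESTS = 8
SCHEDULE_DOWNLOAD_DELAY = 0.5

# Prefill the 'additional' textarea; '\r\n' is the line separator.
SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=120\r\n-d arg1=val1"
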
11 changes: 7 additions & 4 deletions scrapydweb/templates/scrapydweb/schedule.html
@@ -159,6 +159,7 @@ <h4>HELP</h4>
<el-form-item label="USER_AGENT">
<el-col :span="9">
<el-select v-model="form.USER_AGENT" placeholder="the default User-Agent for crawling" clearable>
<el-option :label="form.CUSTOM_USER_AGENT" value="custom"></el-option>
<el-option label="Mozilla/5.0 Windows NT Chrome..." value="Chrome"></el-option>
<el-option label="Mozilla/5.0 iPhone Safari..." value="iPhone"></el-option>
<el-option label="Mozilla/5.0 iPad Safari..." value="iPad"></el-option>
@@ -187,13 +188,13 @@ <h4>HELP</h4>

<el-form-item label="CONCURRENT_REQUESTS">
<el-col :span="9">
<el-input v-model="form.CONCURRENT_REQUESTS" placeholder="defaults to 16" clearable></el-input>
<el-input v-model="form.CONCURRENT_REQUESTS" placeholder="defaults to 16 in Scrapy" clearable></el-input>
</el-col>
</el-form-item>

<el-form-item label="DOWNLOAD_DELAY">
<el-col :span="9">
<el-input v-model="form.DOWNLOAD_DELAY" placeholder="defaults to 0" clearable></el-input>
<el-input v-model="form.DOWNLOAD_DELAY" placeholder="defaults to 0 in Scrapy" clearable></el-input>
</el-col>
</el-form-item>

@@ -214,7 +215,7 @@ <h4>HELP</h4>
<el-form-item class="main_settings" label="target">
<el-col :span="9">
<el-select v-model="form.replace_existing" @change='timerTaskReplaceExisting'>
<el-option label="Update task #{{ task_id }}" value="True"></el-option>
<el-option :label="`Update task #` + form.task_id" value="True"></el-option>
<el-option label="Save as a new task" value="False"></el-option>
</el-select>
</el-col>
@@ -301,7 +302,7 @@ <h4>HELP</h4>
</el-form-item>
<el-form-item label="timezone" v-show="form.expandTimerTaskMoreSettings">
<el-col :span="9">
<el-input v-model="form.timezone" placeholder="defaults to {{ timezone }}, (via: from tzlocal import get_localzone)" clearable></el-input>
<el-input v-model="form.timezone" :placeholder="`defaults to ` + form._timezone + `, (via: from tzlocal import get_localzone)`" clearable></el-input>
</el-col>
</el-form-item>
<el-form-item label="jitter (0)" v-show="form.expandTimerTaskMoreSettings">
@@ -520,6 +521,7 @@ <h4>HELP</h4>

// jobid: now.toISOString().slice(0,19).replace(/:/g, "_"),
jobid: '{{ jobid }}',
CUSTOM_USER_AGENT: '{{ CUSTOM_USER_AGENT }}',
USER_AGENT: '{{ USER_AGENT }}',
ROBOTSTXT_OBEY: '{{ ROBOTSTXT_OBEY }}',
COOKIES_ENABLED: '{{ COOKIES_ENABLED }}',
@@ -546,6 +548,7 @@ <h4>HELP</h4>
end_date: '{{ end_date }}',

timezone: '{{ timezone }}',
_timezone: '{{ timezone }}',
jitter: {{ jitter }}, // 0
misfire_grace_time: {{ misfire_grace_time }}, // 600
coalesce: '{{ coalesce }}',
12 changes: 11 additions & 1 deletion scrapydweb/templates/scrapydweb/settings.html
@@ -97,7 +97,17 @@ <h3>Timer tasks</h3>
<li><div class="title"><h4>JOBS_SNAPSHOT_INTERVAL = {{ JOBS_SNAPSHOT_INTERVAL }}</h4></div></li>
</ul>
</div>


<div class="wrap collapse-wrap">
<h3>Run Spider</h3>
<ul class="collapse">
<li>
<div class="title"><h4>details</h4><i class="iconfont icon-right"></i></div>
<pre>{{ run_spider_details }}</pre>
</li>
</ul>
</div>

<div class="wrap collapse-wrap">
<h3>Page Display</h3>
<ul class="collapse">
32 changes: 29 additions & 3 deletions scrapydweb/utils/check_app_config.py
@@ -7,7 +7,8 @@
from ..common import handle_metadata, handle_slash, json_dumps, session
from ..utils.scheduler import scheduler
from ..vars import (ALLOWED_SCRAPYD_LOG_EXTENSIONS, EMAIL_TRIGGER_KEYS,
SCHEDULER_STATE_DICT, STATE_PAUSED, STATE_RUNNING)
SCHEDULER_STATE_DICT, STATE_PAUSED, STATE_RUNNING,
SCHEDULE_ADDITIONAL, UA_DICT)
from .send_email import send_email
from .sub_process import init_logparser, init_poll

@@ -42,7 +43,7 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co
else:
should_be = "an instance of %s%s" % (is_instance, ' and not empty' if non_empty else '')

value = config.get(key, default)
value = config.setdefault(key, default)
kws = dict(
key=key,
should_be=should_be,
@@ -61,7 +62,7 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co

# ScrapydWeb
check_assert('SCRAPYDWEB_BIND', '0.0.0.0', str, non_empty=True)
SCRAPYDWEB_PORT = config.get('SCRAPYDWEB_PORT', 5000)
SCRAPYDWEB_PORT = config.setdefault('SCRAPYDWEB_PORT', 5000)
try:
assert not isinstance(SCRAPYDWEB_PORT, bool)
SCRAPYDWEB_PORT = int(SCRAPYDWEB_PORT)
@@ -148,6 +149,31 @@ def check_assert(key, default, is_instance, allow_zero=True, non_empty=False, co
"via command 'logparser' as you like. ")
check_assert('BACKUP_STATS_JSON_FILE', True, bool)

# Run Spider
check_assert('SCHEDULE_EXPAND_SETTINGS_ARGUMENTS', False, bool)
check_assert('SCHEDULE_CUSTOM_USER_AGENT', '', str)
config['SCHEDULE_CUSTOM_USER_AGENT'] = config['SCHEDULE_CUSTOM_USER_AGENT'] or 'Mozilla/5.0'
UA_DICT.update(custom=config['SCHEDULE_CUSTOM_USER_AGENT'])
if config.get('SCHEDULE_USER_AGENT', None) is not None:
check_assert('SCHEDULE_USER_AGENT', '', str)
user_agent = config['SCHEDULE_USER_AGENT']
assert user_agent in UA_DICT.keys(), \
"SCHEDULE_USER_AGENT should be any value of %s. Current value: %s" % (UA_DICT.keys(), user_agent)
if config.get('SCHEDULE_ROBOTSTXT_OBEY', None) is not None:
check_assert('SCHEDULE_ROBOTSTXT_OBEY', False, bool)
if config.get('SCHEDULE_COOKIES_ENABLED', None) is not None:
check_assert('SCHEDULE_COOKIES_ENABLED', False, bool)
if config.get('SCHEDULE_CONCURRENT_REQUESTS', None) is not None:
check_assert('SCHEDULE_CONCURRENT_REQUESTS', 16, int, allow_zero=False)
if config.get('SCHEDULE_DOWNLOAD_DELAY', None) is not None:
download_delay = config['SCHEDULE_DOWNLOAD_DELAY']
if isinstance(download_delay, float):
assert download_delay >= 0.0, \
"SCHEDULE_DOWNLOAD_DELAY should a non-negative number. Current value: %s" % download_delay
else:
check_assert('SCHEDULE_DOWNLOAD_DELAY', 0, int)
check_assert('SCHEDULE_ADDITIONAL', SCHEDULE_ADDITIONAL, str)

# Page Display
check_assert('SHOW_SCRAPYD_ITEMS', True, bool)
check_assert('SHOW_JOBS_JOB_COLUMN', False, bool)
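
Note the switch from config.get(key, default) to config.setdefault(key, default) in check_assert (and for SCRAPYDWEB_PORT): setdefault() writes the default back into the config mapping on first access, which is presumably what lets later code index config['SCHEDULE_CUSTOM_USER_AGENT'] directly instead of repeating the default. A minimal standalone sketch of the difference:

config = {}

port = config.get('SCRAPYDWEB_PORT', 5000)  # returns 5000 but leaves config empty
assert 'SCRAPYDWEB_PORT' not in config

port = config.setdefault('SCRAPYDWEB_PORT', 5000)  # returns 5000 AND stores it
assert config['SCRAPYDWEB_PORT'] == 5000
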
2 changes: 2 additions & 0 deletions scrapydweb/vars.py
@@ -42,7 +42,9 @@
LEGAL_NAME_PATTERN = re.compile(r'[^0-9A-Za-z_-]')

# For schedule.py
SCHEDULE_ADDITIONAL = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
UA_DICT = {
'custom': "Mozilla/5.0",
'Chrome': ("Mozilla/5.0 (Windows NT 10.0; WOW64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36"),
'iPhone': ("Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) "
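
The new 'custom' entry ships as a placeholder that check_app_config.py (above) overwrites at startup with SCHEDULE_CUSTOM_USER_AGENT. A minimal sketch of that flow, standalone rather than the actual module wiring:

UA_DICT = {'custom': "Mozilla/5.0"}  # shipped placeholder
config = {'SCHEDULE_CUSTOM_USER_AGENT': 'MyBot/1.0'}  # hypothetical user setting

# Mirrors check_app_config.py: fall back to 'Mozilla/5.0' when the setting is empty.
UA_DICT.update(custom=config['SCHEDULE_CUSTOM_USER_AGENT'] or 'Mozilla/5.0')
assert UA_DICT['custom'] == 'MyBot/1.0'
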
18 changes: 15 additions & 3 deletions scrapydweb/views/myview.py
@@ -9,9 +9,11 @@
from logparser import __version__ as LOGPARSER_VERSION

from ..__version__ import __version__ as SCRAPYDWEB_VERSION
from ..common import get_now_string, get_response_from_view, handle_metadata, handle_slash, json_dumps, session
from ..vars import (ALLOWED_SCRAPYD_LOG_EXTENSIONS, DEMO_PROJECTS_PATH, DEPLOY_PATH, EMAIL_TRIGGER_KEYS, PARSE_PATH,
LEGAL_NAME_PATTERN, SCHEDULE_PATH, STATE_PAUSED, STATE_RUNNING, STATS_PATH, STRICT_NAME_PATTERN)
from ..common import (get_now_string, get_response_from_view, handle_metadata,
handle_slash, json_dumps, session)
from ..vars import (ALLOWED_SCRAPYD_LOG_EXTENSIONS, DEMO_PROJECTS_PATH, DEPLOY_PATH,
EMAIL_TRIGGER_KEYS, PARSE_PATH, LEGAL_NAME_PATTERN, SCHEDULE_ADDITIONAL,
SCHEDULE_PATH, STATE_PAUSED, STATE_RUNNING, STATS_PATH, STRICT_NAME_PATTERN)
from ..utils.scheduler import scheduler


@@ -100,6 +102,16 @@ def __init__(self, *args, **kwargs):
self.scheduler = scheduler
self.JOBS_SNAPSHOT_INTERVAL = app.config.get('JOBS_SNAPSHOT_INTERVAL', 300)

# Run Spider
self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS = app.config.get('SCHEDULE_EXPAND_SETTINGS_ARGUMENTS', False)
self.SCHEDULE_CUSTOM_USER_AGENT = app.config.get('SCHEDULE_CUSTOM_USER_AGENT', 'Mozilla/5.0')
self.SCHEDULE_USER_AGENT = app.config.get('SCHEDULE_USER_AGENT', None)
self.SCHEDULE_ROBOTSTXT_OBEY = app.config.get('SCHEDULE_ROBOTSTXT_OBEY', None)
self.SCHEDULE_COOKIES_ENABLED = app.config.get('SCHEDULE_COOKIES_ENABLED', None)
self.SCHEDULE_CONCURRENT_REQUESTS = app.config.get('SCHEDULE_CONCURRENT_REQUESTS', None)
self.SCHEDULE_DOWNLOAD_DELAY = app.config.get('SCHEDULE_DOWNLOAD_DELAY', None)
self.SCHEDULE_ADDITIONAL = app.config.get('SCHEDULE_ADDITIONAL', SCHEDULE_ADDITIONAL)

# Page Display
self.SHOW_SCRAPYD_ITEMS = app.config.get('SHOW_SCRAPYD_ITEMS', True)
self.SHOW_JOBS_JOB_COLUMN = app.config.get('SHOW_JOBS_JOB_COLUMN', False)
19 changes: 11 additions & 8 deletions scrapydweb/views/operations/schedule.py
@@ -170,15 +170,18 @@ def update_kwargs(self):
version_spider_job='VERSION_PLACEHOLDER'),
url_schedule_check=url_for('schedule.check', node=self.node)
))
self.kwargs.setdefault('expand_settings_arguments', False)
self.kwargs.setdefault('expand_settings_arguments', self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS)
self.kwargs.setdefault('jobid', '')
self.kwargs.setdefault('USER_AGENT', '') # Chrome|iPhone|iPad|Android
self.kwargs.setdefault('ROBOTSTXT_OBEY', '')
self.kwargs.setdefault('COOKIES_ENABLED', '')
self.kwargs.setdefault('CONCURRENT_REQUESTS', '')
self.kwargs.setdefault('DOWNLOAD_DELAY', '')
_additional = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
self.kwargs.setdefault('additional', _additional)
# self.kwargs.setdefault('UA_DICT', UA_DICT)
self.kwargs.setdefault('CUSTOM_USER_AGENT', self.SCHEDULE_CUSTOM_USER_AGENT)
# custom|Chrome|iPhone|iPad|Android
self.kwargs.setdefault('USER_AGENT', '' if self.SCHEDULE_USER_AGENT is None else self.SCHEDULE_USER_AGENT)
self.kwargs.setdefault('ROBOTSTXT_OBEY', '' if self.SCHEDULE_ROBOTSTXT_OBEY is None else self.SCHEDULE_ROBOTSTXT_OBEY)
self.kwargs.setdefault('COOKIES_ENABLED', '' if self.SCHEDULE_COOKIES_ENABLED is None else self.SCHEDULE_COOKIES_ENABLED)
self.kwargs.setdefault('CONCURRENT_REQUESTS', '' if self.SCHEDULE_CONCURRENT_REQUESTS is None else self.SCHEDULE_CONCURRENT_REQUESTS)
self.kwargs.setdefault('DOWNLOAD_DELAY', '' if self.SCHEDULE_DOWNLOAD_DELAY is None else self.SCHEDULE_DOWNLOAD_DELAY)
# additional = "-d setting=CLOSESPIDER_TIMEOUT=60\r\n-d setting=CLOSESPIDER_PAGECOUNT=10\r\n-d arg1=val1"
self.kwargs.setdefault('additional', self.SCHEDULE_ADDITIONAL)

self.kwargs.setdefault('expand_timer_task', 'add_task' in request.args) # '+' button in the TimeTasks page
self.kwargs.setdefault('task_id', 0)
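
The repeated `'' if ... is None else ...` pattern is deliberate: None means "not customized" and should render as an empty form field, whereas False and 0 are legitimate user-chosen defaults that a shortcut like `value or ''` would silently discard. A minimal standalone sketch:

def form_default(value):
    """Map an unset (None) setting to an empty form field, keeping falsy values."""
    return '' if value is None else value

assert form_default(None) == ''      # not customized -> empty input
assert form_default(False) is False  # False survives (e.g. SCHEDULE_ROBOTSTXT_OBEY = False)
assert form_default(0) == 0          # 0 survives (e.g. SCHEDULE_DOWNLOAD_DELAY = 0)
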
12 changes: 12 additions & 0 deletions scrapydweb/views/system/settings.py
@@ -89,6 +89,18 @@ def update_kwargs(self):
self.kwargs['scheduler_state'] = SCHEDULER_STATE_DICT[self.scheduler.state]
self.kwargs['JOBS_SNAPSHOT_INTERVAL'] = self.JOBS_SNAPSHOT_INTERVAL

# Run Spider
self.kwargs['run_spider_details'] = self.json_dumps(dict(
SCHEDULE_EXPAND_SETTINGS_ARGUMENTS=self.SCHEDULE_EXPAND_SETTINGS_ARGUMENTS,
SCHEDULE_CUSTOM_USER_AGENT=self.SCHEDULE_CUSTOM_USER_AGENT,
SCHEDULE_USER_AGENT=self.SCHEDULE_USER_AGENT,
SCHEDULE_ROBOTSTXT_OBEY=self.SCHEDULE_ROBOTSTXT_OBEY,
SCHEDULE_COOKIES_ENABLED=self.SCHEDULE_COOKIES_ENABLED,
SCHEDULE_CONCURRENT_REQUESTS=self.SCHEDULE_CONCURRENT_REQUESTS,
SCHEDULE_DOWNLOAD_DELAY=self.SCHEDULE_DOWNLOAD_DELAY,
SCHEDULE_ADDITIONAL=self.SCHEDULE_ADDITIONAL
))

# Page Display
self.kwargs['page_display_details'] = self.json_dumps(dict(
SHOW_SCRAPYD_ITEMS=self.SHOW_SCRAPYD_ITEMS,
