Merge branch 'master' of github.com:mikegagnon/nginx-overload-handler

commit 3339e7fd4e689f0e08fdedf0f4d402346ba4104f (2 parents: 04c624e + 7f5d624), Mike Gagnon committed Sep 2, 2012
@@ -0,0 +1,60 @@
+Copyright 2012 github.com/one10, Hellasec
+http://www.apache.org/licenses/LICENSE-2.0
+################################################################################
+
+MediaWiki "realistic" traffic replay toolkit
+
+The best setup is to run the MW export-import script on the target MediaWiki server,
+while the trace and traffic-generation scripts run from a second, "attacking" server.
+
+* Assumes MediaWiki has been installed using the scripts in the directory above
+
+The flow and the files are as follows:
+
+=====================
+1. Download a big pageview count file, e.g. from:
+http://dumps.wikimedia.org/other/pagecounts-raw/2012/2012-03/
+and gunzip it into ./data
+
+2. Script filters out garbage pages and generates a pageview-count JSON file
+Files:
+./prepare_inputs.sh
+Output example:
+./data/pagecounts-20120301-040000.json
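+
+Illustrative sketch of the per-line transformation in step 2, assuming the
+standard pagecounts-raw line format "project page_title view_count bytes"
+(the values below are made up):
+
+    import json
+    raw_line = "en Some_Page 42 9001"       # project, title, views, bytes
+    project, title, views, size = raw_line.split()
+    print(json.dumps({title: int(views)}))  # -> {"Some_Page": 42}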
+
+3. Trace-generation .sh generates a trace from the JSON (eventually using the new variates algorithm)
+4. Collapses the trace into an importable Wiki title list
+Files:
+./trace_and_list_gen.sh
+./trace_gen.py (invoked by .sh)
+Output example:
+./data/pagecounts-20120301-040000.trace
+./data/pagecounts-20120301-040000.titles
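+
+A minimal illustrative sketch of step 3's weighted sampling (the actual
+trace_gen.py and its "new variates" algorithm may differ; the file name is
+reused from the example above):
+
+    import json, random
+    counts = json.load(open("data/pagecounts-20120301-040000.json"))
+    titles = list(counts.keys())
+    weights = [counts[t] for t in titles]
+    total = float(sum(weights))
+    def pick():                              # sample a title proportionally to its view count
+        r = random.uniform(0, total)
+        for t, w in zip(titles, weights):
+            r -= w
+            if r <= 0:
+                return t
+        return titles[-1]
+    trace = [pick() for _ in range(1000)]    # one title per simulated request (.trace)
+    titles_list = sorted(set(trace))         # collapsed, importable title list (.titles)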
+
+5. Script to export-import WP pages (the import script and data must be run on the target MW server)
+Files:
+./export_wp_pages.sh
+(calls MW's importDump.php, so run it on the MW server with the title list from the prior step)
+Output example:
+./data/pagecounts-20120301-040000.0.wpimport.xml
+./data/pagecounts-20120301-040000.15.wpimport.xml
+
+6. Test or puzzle_solver: loops through the trace and GETs/POSTs MW page data according to the trace
+Files:
+./test_run_trace.sh (calls postToMW.py)
+(just to test the generated trace; you may need to re-run all of steps 1-6 after testing)
+
+For now, edits simply append a few random chars to the old page body.
+
+For the real trace runs, read the documentation for doorman_test, e.g.:
+nginx-overload-handler/doorman_test/puzzle_solver.py
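+
+A rough sketch of what a replay loop does (test_run_trace.sh / puzzle_solver.py
+are authoritative; the trace format of one URL-encoded title per line, the
+localhost target, and the 95/5 view-to-edit split are assumptions here):
+
+    import random, string, urllib2
+    from postToMW import postToMW            # assumes postToMW.py is importable
+    BASE = "http://localhost/index.php?title="
+    for title in open("data/pagecounts-20120301-040000.trace"):
+        title = title.strip()
+        if not title:
+            continue
+        if random.random() < 0.95:           # mostly plain page views
+            urllib2.urlopen(BASE + title).read()
+        else:                                # occasional edit posting a few random chars
+            junk = "".join(random.choice(string.ascii_letters) for _ in range(8))
+            postToMW(title, junk)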
+
+Misc:
+A script to erase all data from a Mediawiki instance for re-import (CLI php on MW server):
+recreate_mw_db.sh
+
+=====================
+Notes/Todo/Limitations:
+* Once a trace is generated, the export-import of some pages might still fail. Such
+pages, though present in the trace, will not contribute to a realistic traffic pattern.
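+
+One possible way to spot such pages (not part of the toolkit; assumes the
+standard <title> elements of MediaWiki export XML and the file names from the
+examples above):
+
+    import glob, re, urllib2
+    imported = set()
+    for xml in glob.glob("data/pagecounts-20120301-040000.*.wpimport.xml"):
+        for m in re.finditer(r"<title>(.*?)</title>", open(xml).read()):
+            imported.add(m.group(1).replace(" ", "_"))
+    missing = set()
+    for line in open("data/pagecounts-20120301-040000.trace"):
+        title = urllib2.unquote(line.strip())
+        if title and title not in imported:
+            missing.add(title)
+    print("trace titles that failed export-import: %d" % len(missing))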
+
@@ -0,0 +1,58 @@
+{
+ "verbs": [
+ {
+ "desc": "normal view",
+ "freq": "51",
+ "needsNoun": "yes",
+ "urlPrefix": "/index.php?title="
+ },
+ {
+ "desc": "NoticeLocal",
+ "freq": "12.32",
+ "urlPrefix": "/index.php?title=Special:NoticeLocal/anon&action=raw"
+ },
+ {
+ "desc": "search",
+ "freq": "5.88",
+ "needsNoun": "yes",
+ "urlPrefix": "/index.php?title=Special%3ASearch&search="
+ },
+ {
+ "desc": "rand",
+ "freq": "3",
+ "urlPrefix": "/index.php?title=Special:Random"
+ },
+ {
+ "desc": "all recent changes",
+ "freq": "2.31",
+ "urlPrefix": "/index.php?title=Special:RecentChanges"
+ },
+ {
+ "desc": "export a page",
+ "freq": ".21",
+ "needsNoun": "yes",
+ "urlPrefix": "/index.php?title=Special:Export/"
+ },
+ {
+ "desc": "sample load.php for css",
+ "freq": "8",
+ "urlPrefix": "/load.php?debug=false&lang=en&modules=mediawiki.legacy.commonPrint%2Cshared%7Cskins.vector&only=styles&skin=vector&*"
+ },
+ {
+ "desc": "sample load.php for js",
+ "freq": "8",
+ "urlPrefix": "/load.php?debug=false&lang=en&modules=startup&only=scripts&skin=vector&*"
+ },
+ {
+ "desc": "opensearch_desc.php",
+ "freq": "7",
+ "urlPrefix": "/opensearch_desc.php"
+ },
+ {
+ "desc": "action edit - not submitting",
+ "freq": "1",
+ "needsNoun": "yes",
+ "urlPrefix": "/index.php?action=edit&title="
+ }
+ ]
+}
@@ -0,0 +1,90 @@
+#!/bin/bash
+#
+# Copyright 2012 github.com/one10, Hellasec
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+#
+# This script takes a list of WP page titles and exports them from Wikipedia
+# in batches of up to 20 pages at a time into multiple local files;
+# it then also attempts to import these local files into a local MediaWiki.
+#
+# Usage: ./export_wp_pages.sh <title-list-file> [import-only]
+#
+# Set LOCAL_MW_INSTALL_DIR below to a proper path for a successful import.
+#
+# Note: there is no guarantee how many of the supplied titles will actually be
+# downloaded from Wikipedia; many pages fail (deleted since, special, errors, etc.),
+# so watch the "verified downloaded" stats at the end of this script.
+
+PAGES_PER_CURBATCH=20
+SLEEP=3
+TITLE_LIST_FILE=$1
+if [ -z "$TITLE_LIST_FILE" ]; then
+ echo "Usage: ./export_wp_pages.sh <title-list-file> [import-only]"
+ echo "Also, don't forget to update LOCAL_MW_INSTALL_DIR to your local MediaWiki path for a proper import"
+ # TITLE_LIST_FILE="data/sample-pagecounts-input00.titles" # test only, remove
+ exit
+fi
+EXP_SERVER="http://en.wikipedia.org"
+EXP_URL="/w/index.php?title=Special:Export&action=submit&curonly=1&wpDownload=1&pages="
+LOCAL_MW_INSTALL_DIR="/home/fcgi_user/mediawiki-1.18.2/"
+
+# if we wish to import an existing bunch of files only...
+IMPORT_ONLY=$2
+if [ -n "$IMPORT_ONLY" ]; then
+ # pagecounts-20120301-210000.json.
+ for i in `ls $TITLE_LIST_FILE.*.xml`
+ do
+ echo "importing $i..."
+ php ${LOCAL_MW_INSTALL_DIR}/maintenance/importDump.php $i
+ done
+ exit
+fi
+
+data=`cat $TITLE_LIST_FILE`
+datasize=`echo "$data" | wc -l`
+
+actual_downloaded=0
+count=0
+
+# after the titles are prepared, export them from Wikipedia in batches
+for i in `echo "$data"`
+do
+  # TODO: for early termination before some limit N for whatever reason...
+  # if [[ (( "$count" -ge "$N" )) || (( "$actual_downloaded" -ge "$N" )) ]]; then
+  #   break
+  # fi
+  count=`expr $count + 1`
+  # accumulate the current title first, so the final title is not dropped
+  curbatch=$curbatch"%0D%0A"$i
+  if (( count % PAGES_PER_CURBATCH == 0 || count == datasize )); then
+    # export the current batch of titles from Wikipedia into a local file
+    postval=$curbatch
+    #postval=$(python -c "import urllib; print urllib.quote('''$postval''')")
+    echo $postval
+    out_file=`echo $TITLE_LIST_FILE | sed s'/.titles$//'`".$actual_downloaded.wpimport.xml"
+    curl -o $out_file ${EXP_SERVER}${EXP_URL}${postval}
+    sleep $SLEEP
+    verify_num_pages=`grep -o "text.*bytes=[^>]*" $out_file | wc -l`
+    actual_downloaded=`expr $actual_downloaded + $verify_num_pages`
+    echo "number of pages downloaded: $verify_num_pages, total: $actual_downloaded"
+
+    # import this batch into the local MW
+    php ${LOCAL_MW_INSTALL_DIR}/maintenance/importDump.php $out_file
+    curbatch=""
+  fi
+done
+
+echo "Totals: input list size: $datasize, verified downloaded: $actual_downloaded"
@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+#
+# Copyright 2012 github.com/one10, Hellasec
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+#
+# Convert space-separated Wikipedia pageview count dump files into JSON
+#
+import sys
+import json
+import urllib2
+
+output = {}
+for line in sys.stdin:
+    # pagecounts-raw line format: project page_title view_count bytes_transferred
+    parts = line.split()
+    assert(len(parts) == 4)
+    page = parts[1]
+    try:
+        # non-ASCII bytes make the implicit ascii decode fail; keep the raw string then
+        page_unicode = page.encode("utf8")
+    except UnicodeDecodeError:
+        page_unicode = page
+    # normalize the title to a consistently URL-quoted form
+    page = urllib2.quote(urllib2.unquote(page_unicode))
+    output[page] = int(parts[2])
+
+json.dump(output, sys.stdout, indent=2)
+
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+#
+# Copyright 2012 github.com/one10, Hellasec
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+################################################################################
+#
+# view, view edit form, post edit form for a Mediawiki page
+#
+
+from BeautifulSoup import BeautifulSoup
+import urllib
+import urllib2
+import time # only for test
+
+# SERVER_NAME = 'bg2.hellasec.com'
+SERVER_NAME = 'localhost'
+
+MW_EDIT_TOKEN_NAME = 'wpEdittime'
+LASTMOD_ID = 't-permalink'
+VIEW_URL = 'http://' + SERVER_NAME + '/index.php?title='
+EDIT_URL = 'http://' + SERVER_NAME + '/index.php?action=edit&title='
+EDIT_TARGET_URL = 'http://' + SERVER_NAME + '/index.php?action=submit&title='
+
+# this function assumes a URL-encoded page title;
+# it also takes a non-URL-encoded page replacement text
+def postToMW(urlEncodedPageTitle, newPagePlainText):
+    viewUrl = VIEW_URL + urlEncodedPageTitle
+    editUrl = EDIT_URL + urlEncodedPageTitle
+    editTargetUrl = EDIT_TARGET_URL + urlEncodedPageTitle
+
+    # get the last-mod string from a normal page view: footer-info-lastmod
+    doc = urllib2.urlopen(viewUrl)
+    soup = BeautifulSoup(doc.read())
+    lastmod = soup.find(attrs={'id': LASTMOD_ID})
+    # print "oldlastmod: " + str(lastmod)
+
+    # fetch the wpEdittime value from the edit form
+    doc = urllib2.urlopen(editUrl)
+    soup = BeautifulSoup(doc.read())
+    token = soup.find(attrs={'name': MW_EDIT_TOKEN_NAME})
+    # print "token: " + token["value"]
+
+    # build the edit POST data ('+\\' is MediaWiki's anonymous edit token)
+    values = {'wpEdittime': token["value"],
+              'wpEditToken': '+\\',
+              'wpTextbox1': newPagePlainText,
+              'wpSave': 'Save page'}
+
+    data = urllib.urlencode(values)
+    req = urllib2.Request(editTargetUrl, data)
+
+    # submit the edit and validate the response
+    response = urllib2.urlopen(req)
+    soup = BeautifulSoup(response.read())
+    newlastmod = soup.find(attrs={'id': LASTMOD_ID})
+    # print "newlastmod: " + str(newlastmod.__class__)
+
+    if lastmod == None or lastmod == "" or \
+       newlastmod == None or newlastmod == "" or \
+       lastmod == newlastmod:
+        # the page hasn't changed after the post, or we can't tell = we have problems
+        raise ValueError("post had problems and most likely didn't succeed")
+
+################# a few test cases
+# e.g.: S%CC%88%C3%B6m%CC%88%C3%AB_crazy_%C7%98mla%E1%B9%B3ts_%28and_some_%D0%A0%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9_text_for_a_good_measure%29
+## pageTitle = 'S%CC%88%C3%B6m%CC%88%C3%AB_crazy_%C7%98mla%E1%B9%B3ts_%28and_some_%D0%A0%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9_text_for_a_good_measure%29'
+#pageTitle = 'S%CC%88%C3%B6m%CC%88%C3%AB_crazy_%C7%98mla%E1%B9%B3ts_(and_some_%D0%A0%D1%83%D1%81%D1%81%D0%BA%D0%B8%D0%B9_text_for_a_good_measure)'
+#pageTitle = "Test00"
+##pageTitle = "Test00"
+##pageText = time.asctime(time.localtime(time.time()))
+##postToMW(pageTitle, pageText)
+