Permalink
Browse files

initial picurls.com import

  • Loading branch information...
0 parents commit b667ee308b3eb6b892e0b350752d7915a87a2d8b @pkrumins committed Dec 5, 2009
Showing with 27,639 additions and 0 deletions.
  1. BIN db/picurls.db
  2. +88 −0 db/picurls.db.sql.txt
  3. +15 −0 readme.txt
  4. +46 −0 scraper/picurls.sb.txt
  5. +52 −0 scraper/picurls.sn.txt
  6. +203 −0 scraper/scraper.pl
  7. +112 −0 scraper/sites/boingboing.pm
  8. +100 −0 scraper/sites/delicious.pm
  9. +117 −0 scraper/sites/digg.pm
  10. +125 −0 scraper/sites/flickr.pm
  11. +93 −0 scraper/sites/furl.pm
  12. +180 −0 scraper/sites/reddit.pm
  13. +241 −0 scraper/sites/scraper.pm
  14. +109 −0 scraper/sites/simpy.pm
  15. +140 −0 scraper/sites/stumbleupon.pm
  16. +94 −0 scraper/sites/wired.pm
  17. +243 −0 scripts/ImageFinder.pm
  18. +339 −0 scripts/NetPbm.pm
  19. +349 −0 scripts/ThumbExtractor.pm
  20. +206 −0 scripts/ThumbMaker.pm
  21. +51 −0 scripts/cronjob.sh
  22. +321 −0 scripts/pic_mover.pl
  23. +122 −0 scripts/picurls_db_inserter.pl
  24. +4 −0 templates/content-error.tpl.html
  25. +46 −0 templates/content-index.tpl.html
  26. +63 −0 templates/content-item.tpl.html
  27. +52 −0 templates/content-login-register.tpl.html
  28. +43 −0 templates/content-my-comments.tpl.html
  29. +54 −0 templates/content-my-profile.tpl.html
  30. +49 −0 templates/content-site.tpl.html
  31. +126 −0 templates/index.tpl.html
  32. +7 −0 www/.htaccess
  33. +65 −0 www/config.php
  34. BIN www/favicon.ico
  35. +7 −0 www/ie-hacks-hello-world.css
  36. BIN www/img/bookmark.gif
  37. BIN www/img/comments.gif
  38. BIN www/img/date.gif
  39. BIN www/img/delicious-small.gif
  40. BIN www/img/delicious.gif
  41. BIN www/img/dotter.gif
  42. BIN www/img/link.gif
  43. BIN www/img/local.gif
  44. BIN www/img/more.gif
  45. BIN www/img/next.gif
  46. BIN www/img/oops.gif
  47. BIN www/img/pics.gif
  48. BIN www/img/previous.gif
  49. BIN www/img/register.gif
  50. BIN www/img/user.gif
  51. BIN www/img/vbulletin.gif
  52. +97 −0 www/index.php
  53. +60 −0 www/mysmarty.php
  54. +42 −0 www/page-error.php
  55. +148 −0 www/page-index.php
  56. +232 −0 www/page-item.php
  57. +124 −0 www/page-login.php
  58. +46 −0 www/page-logout.php
  59. +137 −0 www/page-my-comments.php
  60. +146 −0 www/page-my-profile.php
  61. +133 −0 www/page-register.php
  62. +165 −0 www/page-site.php
  63. BIN www/picurls.gif
  64. +389 −0 www/smarty/Config_File.class.php
  65. +1,944 −0 www/smarty/Smarty.class.php
  66. +2,327 −0 www/smarty/Smarty_Compiler.class.php
  67. +157 −0 www/smarty/debug.tpl
  68. +67 −0 www/smarty/internals/core.assemble_plugin_filepath.php
  69. +43 −0 www/smarty/internals/core.assign_smarty_interface.php
  70. +79 −0 www/smarty/internals/core.create_dir_structure.php
  71. +61 −0 www/smarty/internals/core.display_debug_console.php
  72. +44 −0 www/smarty/internals/core.get_include_path.php
  73. +23 −0 www/smarty/internals/core.get_microtime.php
  74. +80 −0 www/smarty/internals/core.get_php_resource.php
  75. +59 −0 www/smarty/internals/core.is_secure.php
  76. +47 −0 www/smarty/internals/core.is_trusted.php
  77. +125 −0 www/smarty/internals/core.load_plugins.php
  78. +74 −0 www/smarty/internals/core.load_resource_plugin.php
  79. +71 −0 www/smarty/internals/core.process_cached_inserts.php
  80. +37 −0 www/smarty/internals/core.process_compiled_include.php
  81. +101 −0 www/smarty/internals/core.read_cache_file.php
  82. +71 −0 www/smarty/internals/core.rm_auto.php
  83. +54 −0 www/smarty/internals/core.rmdir.php
  84. +71 −0 www/smarty/internals/core.run_insert_handler.php
  85. +50 −0 www/smarty/internals/core.smarty_include_php.php
  86. +96 −0 www/smarty/internals/core.write_cache_file.php
  87. +91 −0 www/smarty/internals/core.write_compiled_include.php
  88. +35 −0 www/smarty/internals/core.write_compiled_resource.php
  89. +54 −0 www/smarty/internals/core.write_file.php
  90. +7 −0 www/smarty/not-used/BUGS
  91. +458 −0 www/smarty/not-used/COPYING.lib
  92. +8,667 −0 www/smarty/not-used/ChangeLog
  93. +284 −0 www/smarty/not-used/FAQ
  94. +29 −0 www/smarty/not-used/INSTALL
  95. +1,013 −0 www/smarty/not-used/NEWS
  96. +103 −0 www/smarty/not-used/QUICK_START
  97. +80 −0 www/smarty/not-used/README
  98. +428 −0 www/smarty/not-used/RELEASE_NOTES
  99. +10 −0 www/smarty/not-used/TODO
  100. +5 −0 www/smarty/not-used/demo/configs/test.conf
  101. +25 −0 www/smarty/not-used/demo/index.php
  102. +2 −0 www/smarty/not-used/demo/templates/footer.tpl
  103. +6 −0 www/smarty/not-used/demo/templates/header.tpl
  104. +81 −0 www/smarty/not-used/demo/templates/index.tpl
  105. +6 −0 www/smarty/not-used/misc/smarty_icon.README
  106. BIN www/smarty/not-used/misc/smarty_icon.gif
  107. +32 −0 www/smarty/not-used/unit_test/README
  108. +5 −0 www/smarty/not-used/unit_test/config.php
  109. +1 −0 www/smarty/not-used/unit_test/configs/globals_double_quotes.conf
  110. +1 −0 www/smarty/not-used/unit_test/configs/globals_single_quotes.conf
  111. +10 −0 www/smarty/not-used/unit_test/smarty_unit_test.php
  112. +10 −0 www/smarty/not-used/unit_test/smarty_unit_test_gui.php
  113. +1 −0 www/smarty/not-used/unit_test/templates/assign_var.tpl
  114. +1 −0 www/smarty/not-used/unit_test/templates/constant.tpl
  115. +1 −0 www/smarty/not-used/unit_test/templates/index.tpl
  116. +12 −0 www/smarty/not-used/unit_test/templates/parse_math.tpl
  117. +8 −0 www/smarty/not-used/unit_test/templates/parse_obj_meth.tpl
  118. +450 −0 www/smarty/not-used/unit_test/test_cases.php
  119. +103 −0 www/smarty/plugins/block.textformat.php
  120. +40 −0 www/smarty/plugins/compiler.assign.php
  121. +40 −0 www/smarty/plugins/function.assign_debug_info.php
  122. +142 −0 www/smarty/plugins/function.config_load.php
  123. +80 −0 www/smarty/plugins/function.counter.php
  124. +102 −0 www/smarty/plugins/function.cycle.php
  125. +35 −0 www/smarty/plugins/function.debug.php
  126. +49 −0 www/smarty/plugins/function.eval.php
  127. +221 −0 www/smarty/plugins/function.fetch.php
  128. +143 −0 www/smarty/plugins/function.html_checkboxes.php
  129. +142 −0 www/smarty/plugins/function.html_image.php
  130. +122 −0 www/smarty/plugins/function.html_options.php
  131. +156 −0 www/smarty/plugins/function.html_radios.php
  132. +331 −0 www/smarty/plugins/function.html_select_date.php
  133. +194 −0 www/smarty/plugins/function.html_select_time.php
  134. +177 −0 www/smarty/plugins/function.html_table.php
  135. +165 −0 www/smarty/plugins/function.mailto.php
  136. +84 −0 www/smarty/plugins/function.math.php
  137. +119 −0 www/smarty/plugins/function.popup.php
  138. +40 −0 www/smarty/plugins/function.popup_init.php
  139. +43 −0 www/smarty/plugins/modifier.capitalize.php
  140. +33 −0 www/smarty/plugins/modifier.cat.php
  141. +32 −0 www/smarty/plugins/modifier.count_characters.php
  142. +29 −0 www/smarty/plugins/modifier.count_paragraphs.php
  143. +29 −0 www/smarty/plugins/modifier.count_sentences.php
  144. +33 −0 www/smarty/plugins/modifier.count_words.php
  145. +58 −0 www/smarty/plugins/modifier.date_format.php
  146. +90 −0 www/smarty/plugins/modifier.debug_print_var.php
  147. +32 −0 www/smarty/plugins/modifier.default.php
  148. +93 −0 www/smarty/plugins/modifier.escape.php
  149. +28 −0 www/smarty/plugins/modifier.indent.php
  150. +26 −0 www/smarty/plugins/modifier.lower.php
  151. +35 −0 www/smarty/plugins/modifier.nl2br.php
  152. +35 −0 www/smarty/plugins/modifier.regex_replace.php
  153. +30 −0 www/smarty/plugins/modifier.replace.php
  154. +30 −0 www/smarty/plugins/modifier.spacify.php
  155. +29 −0 www/smarty/plugins/modifier.string_format.php
  156. +33 −0 www/smarty/plugins/modifier.strip.php
  157. +32 −0 www/smarty/plugins/modifier.strip_tags.php
  158. +50 −0 www/smarty/plugins/modifier.truncate.php
  159. +26 −0 www/smarty/plugins/modifier.upper.php
  160. +29 −0 www/smarty/plugins/modifier.wordwrap.php
  161. +75 −0 www/smarty/plugins/outputfilter.trimwhitespace.php
  162. +31 −0 www/smarty/plugins/shared.escape_special_chars.php
  163. +46 −0 www/smarty/plugins/shared.make_timestamp.php
  164. +141 −0 www/style-index.css
  165. +183 −0 www/style-item.css
  166. +58 −0 www/style-login.css
  167. +68 −0 www/style-my-comments.css
  168. +65 −0 www/style-my-profile.css
  169. +132 −0 www/style-page.css
  170. +202 −0 www/style.css
  171. +120 −0 www/system/db.sqlite.php
  172. BIN www/thumbnails/0/a-hole-in-the-wall.jpg
  173. BIN www/thumbnails/0/a-sharing-moment.jpg
  174. BIN ...-com-american-shelter-an-illustrated-encyclopedia-of-the-american-homes-books-lester-r-walker.jpg
  175. BIN www/thumbnails/0/an-evil-exists-pic.jpg
  176. BIN www/thumbnails/0/ancient-flying-vehicles-pics.jpg
  177. BIN www/thumbnails/0/animals-really-can-get-along-best-pics-around.jpg
  178. BIN www/thumbnails/0/animals-really-can-get-along-pictures.jpg
  179. BIN www/thumbnails/0/anyone-remember-this-headline-from-a-few-years-back-pic.jpg
  180. BIN www/thumbnails/0/bezprzewodowe-cyfrowe-ramki-od-photovu.jpg
  181. BIN www/thumbnails/0/buycostumes-com-image-zoom.jpg
  182. BIN www/thumbnails/0/calvin-hobbes-not-elected-you-mean-you-can-govern-with-dictatorial-impunity.jpg
  183. BIN www/thumbnails/0/carmen-sandiego-found-in-airport-lounge.jpg
  184. BIN www/thumbnails/0/cedec-2007-nvidia.jpg
  185. BIN www/thumbnails/0/cheese-that-i-like-photo.jpg
  186. BIN www/thumbnails/0/chocolate-craving-its-not-just-your-sweet-tooth-talking.jpg
  187. BIN www/thumbnails/0/clipart-etc-homepage.jpg
  188. BIN www/thumbnails/0/cool-artistic-lunch-boxes-pictures.jpg
  189. BIN www/thumbnails/0/cool-tool-blurb-lulu.jpg
  190. BIN www/thumbnails/0/cool-tool-pictopia.jpg
  191. BIN www/thumbnails/0/cool-tool-scancafe.jpg
  192. BIN www/thumbnails/0/czy-to-ju-czas-na-lustrzank.jpg
  193. BIN www/thumbnails/0/dead-frozen-deer-random-images-moonbuggy.jpg
  194. BIN www/thumbnails/0/display-photo.jpg
  195. BIN www/thumbnails/0/driving-a-russian-lada-underwater-pics.jpg
  196. BIN www/thumbnails/0/egg-inspired-designs-pics.jpg
  197. BIN www/thumbnails/0/empty-41.jpg
  198. BIN www/thumbnails/0/empty-42.jpg
  199. BIN www/thumbnails/0/empty-43.jpg
  200. BIN www/thumbnails/0/face-research-demos-make-an-average.jpg
  201. BIN www/thumbnails/0/flickr-pool-di-s-u-m-setups-mandatory.jpg
  202. BIN ...humbnails/0/free-movie-film-motivational-inspirational-demotivational-wanted-poster-generator.jpg
  203. BIN www/thumbnails/0/galeries-dimages-en-flash.jpg
  204. BIN www/thumbnails/0/gallery-19-30-v-clav-jir-sek.jpg
  205. BIN www/thumbnails/0/gave-herself-to-the-sky.jpg
  206. BIN www/thumbnails/0/gifts-for-kids-digital-camera-as-a-wonderful-gift-for-your-kids.jpg
  207. BIN www/thumbnails/0/gore-u-n-climate-panel-share-nobel-peace-prize.jpg
  208. BIN www/thumbnails/0/greetings-from-the-white-stripes.jpg
  209. BIN www/thumbnails/0/how-to-photograph-white-object-on-white-background.jpg
  210. BIN www/thumbnails/0/ice-geysers-erupt-on-enceladus-pic.jpg
  211. BIN www/thumbnails/0/irfanview32-v3-99-from-gigaloads-com.jpg
  212. BIN www/thumbnails/0/is-this-what-reddit-has-come-to.jpg
  213. BIN www/thumbnails/0/jeff-vandermeer-and-the-weird-art-he-inspires.jpg
  214. BIN www/thumbnails/0/jose-a-gallego-61-12-98mb-2photo-ru.jpg
  215. BIN www/thumbnails/0/konkurs-fotograficzny-czekolada.jpg
  216. BIN www/thumbnails/0/lawyer-represents-unborn-embryo-in-federal-court-tuesday.jpg
  217. BIN www/thumbnails/0/learn-how-to-draw-with-bob-weber-jr-and-slylock-fox-and-comics-for-kids.jpg
  218. BIN www/thumbnails/0/lessigs-anti-corruption-lecture-alpha-version.jpg
  219. BIN www/thumbnails/0/m-i-r-r-o-r-w-o-r-l-d.jpg
  220. BIN www/thumbnails/0/marketing-mogul-bad-day-2.jpg
  221. BIN www/thumbnails/0/mayangs-free-texture-library.jpg
  222. BIN www/thumbnails/0/multi-colored-water-rose-best-pic-ever.jpg
  223. BIN www/thumbnails/0/netherlands-bans-magic-mushrooms.jpg
  224. BIN www/thumbnails/0/nikons-small-world-contest-a-gallery-of-beautiful-tiny-things.jpg
  225. BIN www/thumbnails/0/oct-12-1928-iron-lung-savior-to-a-generation.jpg
  226. BIN www/thumbnails/0/one-fish-hiding-inside-another-fish-picture.jpg
  227. BIN www/thumbnails/0/orange-box-and-more-keep-the-xbox-hits-coming.jpg
  228. BIN www/thumbnails/0/petegoldlust-com-images-carvedcrayons-06-carvedc-16-jpg.jpg
  229. BIN www/thumbnails/0/pic-half-wet-elephant.jpg
  230. BIN www/thumbnails/0/pic-if-youre-having-trouble-going-just-look-down.jpg
  231. BIN www/thumbnails/0/please-dont-steal-bikes-from-here-sign.jpg
  232. BIN www/thumbnails/0/pop-geek-jonathan-coulton-succeeds-by-giving-music-away.jpg
  233. BIN www/thumbnails/0/q-a-foul-mouthed-blogger-ted-dziuba-tells-why-most-startups-fail.jpg
  234. BIN www/thumbnails/0/rip-roy-rosenzweig-digital-historian.jpg
  235. BIN www/thumbnails/0/root-coffee-bento-bizarro-lunch-boxes.jpg
  236. BIN www/thumbnails/0/russias-culture-minister-bans-photo-of-kissing-policemen.jpg
  237. BIN www/thumbnails/0/scotch-maverick-reinvents-a-once-conservative-drink.jpg
  238. BIN www/thumbnails/0/screenshots-cite-orange-box-eternal-sonata-csi-hard-evidence-cite.jpg
  239. BIN www/thumbnails/0/sharpest-image-of-pluto-ever-taken.jpg
  240. BIN www/thumbnails/0/shipping-containers-as-housing.jpg
  241. BIN www/thumbnails/0/special-review-tel-aviv-art-and-culture.jpg
  242. BIN www/thumbnails/0/stop-and-freeze-pic.jpg
  243. BIN www/thumbnails/0/strange-but-fitting-pairing-of-signs.jpg
  244. BIN www/thumbnails/0/the-axe-effect-pics-caution-use-axe-responsibly.jpg
  245. BIN www/thumbnails/0/the-morning-commute-in-bokeh.jpg
  246. BIN www/thumbnails/0/the-most-amazing-trees-in-the-world-pics.jpg
  247. BIN www/thumbnails/0/the-new-shelton-wet-dry.jpg
  248. BIN www/thumbnails/0/the-ugc-old-drivel.jpg
  249. BIN www/thumbnails/0/tom-hunter-photography-the-saatchi-gallery.jpg
  250. BIN www/thumbnails/0/using-the-internet-to-ruin-someones-life.jpg
  251. BIN www/thumbnails/0/uta-fi-hashem-al-sayadi-cats-cool4-jpg-2.jpg
  252. BIN www/thumbnails/0/vii-chinas-wii-first-live-shots-pics.jpg
  253. BIN www/thumbnails/0/vote-the-most-underappreciated-scientists-of-all-time.jpg
  254. BIN www/thumbnails/0/water-hobo-sprays-yard-cutters-with-water.jpg
  255. BIN www/thumbnails/0/whimisical-repetitive-random-oh-yeah-and-modern.jpg
Binary file not shown.
@@ -0,0 +1,88 @@
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# picurls.com website database structure for SQLite database
+#
+# Read how picurls.com was designed at:
+# http://www.catonmat.net/blog/making-of-picurls-popurls-for-pictures-part-one/
+#
+
+# AUTOINCREMENT removed because SQLite2 does not support it!
+
+BEGIN TRANSACTION;
+
+CREATE TABLE items ( -- items with their link, thumbnail and source site
+ id INTEGER PRIMARY KEY, -- AUTOINCREMENT omitted: not supported by SQLite2
+ title STRING NOT NULL,
+ sane_title STRING NOT NULL, -- presumably a URL/filename-safe form of title; confirm against the inserter script
+ url STRING NOT NULL,
+ thumb STRING NOT NULL, -- presumably identifies the thumbnail image of the item; confirm
+ site_id INTEGER NOT NULL, -- presumably refers to sites.id; no foreign key is declared
+ date_added DATE NOT NULL,
+ visible BOOL NOT NULL DEFAULT 1 -- defaults to 1 when not given
+);
+
+CREATE TABLE tmp_items ( -- NOTE(review): looks like a staging table for entries not yet turned into items; confirm
+ id INTEGER PRIMARY KEY, -- AUTOINCREMENT omitted: not supported by SQLite2
+ title STRING NOT NULL,
+ url STRING NOT NULL,
+ date_added DATE NOT NULL,
+ site_id INTEGER NOT NULL, -- presumably refers to sites.id; no foreign key is declared
+ tries INTEGER NOT NULL DEFAULT 0 -- starts at 0; presumably counts processing attempts, confirm
+);
+
+CREATE TABLE comments ( -- comments attached to an item and a user
+ id INTEGER PRIMARY KEY, -- AUTOINCREMENT omitted: not supported by SQLite2
+ comment STRING NOT NULL,
+ item_id INTEGER NOT NULL, -- presumably refers to items.id; no foreign key is declared
+ user_id STRING NOT NULL, -- NOTE(review): declared STRING although users.id is an INTEGER primary key
+ anonymous_name STRING, -- nullable; presumably the display name for comments by the 'anonymous' user, confirm
+ ip_address STRING NOT NULL,
+ date_added DATE NOT NULL
+);
+
+CREATE TABLE sites ( -- source sites; seeded by the INSERT statements further down
+ id INTEGER PRIMARY KEY,
+ name STRING NOT NULL UNIQUE,
+ sane_name STRING NOT NULL UNIQUE, -- lowercase, punctuation-free form of name in the seed rows (e.g. 'delicious')
+ url STRING NOT NULL UNIQUE,
+ visible BOOL NOT NULL DEFAULT 1, -- defaults to 1 when not given
+ priority INTEGER NOT NULL -- seed rows use 1..9; presumably the display order, confirm
+);
+
+CREATE TABLE users ( -- user accounts, including the seeded 'anonymous' row with id 0
+ id INTEGER PRIMARY KEY, -- AUTOINCREMENT omitted: not supported by SQLite2
+ username STRING NOT NULL UNIQUE,
+ password STRING NOT NULL, -- NOTE(review): storage format (hashed or plain) is not visible in the schema; verify
+ data STRING, -- nullable; contents are not evident from the schema
+ ip_address STRING NOT NULL,
+ date_regged DATE NOT NULL, -- presumably the registration time
+ date_access DATE NOT NULL, -- presumably the last access time
+ can_login BOOL NOT NULL DEFAULT 1 -- defaults to 1; the seeded 'anonymous' row sets it to 0
+);
+
+CREATE INDEX IDX_sites_sane_name on sites(sane_name); -- NOTE(review): sane_name is already declared UNIQUE; check whether this index is still needed
+CREATE INDEX IDX_sites_priority on sites(priority);
+CREATE INDEX IDX_items_site_id on items(site_id);
+CREATE INDEX IDX_items_date_added on items(date_added);
+CREATE INDEX IDX_items_sane_title on items(sane_title);
+CREATE INDEX IDX_comments_item_id on comments(item_id);
+CREATE INDEX IDX_comments_user_id on comments(user_id);
+CREATE INDEX IDX_comments_date_added on comments(date_added);
+CREATE INDEX IDX_comments_item_user_ip on comments(item_id, user_id, ip_address); -- composite index over three columns
+CREATE INDEX IDX_users_username on users(username); -- NOTE(review): username is already declared UNIQUE; check whether this index is still needed
+
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Digg', 'digg', 'http://www.digg.com', 1); -- seed rows: nine sites with priorities 1 to 9
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Reddit', 'reddit', 'http://reddit.com', 2);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('del.icio.us', 'delicious', 'http://del.icio.us', 3);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('StumbleUpon', 'stumbleupon', 'http://www.stumbleupon.com', 4);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Flickr', 'flickr', 'http://www.flickr.com', 5);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Simpy', 'simpy', 'http://www.simpy.com', 6);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Furl', 'furl', 'http://www.furl.net', 7);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Boing Boing', 'boingboing', 'http://www.boingboing.net', 8);
+INSERT INTO sites (name, sane_name, url, priority) VALUES('Wired', 'wired', 'http://www.wired.com', 9);
+
+INSERT INTO users (id, username, password, ip_address, date_regged, date_access, can_login) VALUES (0, 'anonymous', 'x', '0.0.0.0', '1970-01-01 00:00:00', '1970-01-01 00:00:00', 0); -- placeholder 'anonymous' account with id 0 and can_login = 0
+
+COMMIT;
@@ -0,0 +1,15 @@
+This is the full source code of the http://picurls.com website.
+
+Read how it was designed at:
+http://catonmat.net/blog/making-of-picurls-popurls-for-pictures-part-one/
+and
+http://catonmat.net/blog/making-of-picurls-popurls-for-pictures-part-two/
+
+The website is currently down. I will put it back online as soon as I can and
+then write a better readme.txt.
+
+------------------------------------------------------------------------------
+
+Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+http://www.catonmat.net - good coders code, great reuse
+
@@ -0,0 +1,46 @@
+#
+# Peteris Krumins (peter@catonmat.net), 2007.10.10
+# http://www.catonmat.net - good coders code, great reuse
+#
+# Scraper pattern file for picurls.com website
+#
+# This is another pattern filtering format that the scraper accepts.
+#
+# Here we can specify a list of predicates as perl subroutines which
+# get called on each item found on the site being scraped.
+#
+# If a predicate subroutine returns true, the item is accepted.
+# If it returns false, the next subroutine is called, and so on until one
+# of them accepts the item or all of them have failed (item is discarded).
+#
+# WARNING: code must be correctly indented, otherwise program will fail
+# to extract the perl subroutine.
+#
+
+#
+# These patterns are mostly for social bookmarking (sb) sites, where people
+# do not write "[PIC]" or "(Pic)" in title.
+#
+
+# Reject items whose URL points at a front page (empty path, bare slashes,
+# or a root-level home/index file); accept everything else.
+#
+perl: sub {
+ use URI;
+ my ($post) = @_;
+
+ my $path = URI->new($post->{url})->path;
+
+ return 0 unless length $path; # no path at all
+ return 0 if $path =~ m!^/+$!; # nothing but slashes
+ return 0 if $path =~ m!^/(home|index)\.(php|html|htm|aspx?)$!i; # root-level index file
+
+ return 1; # anything else is accepted
+}
+
@@ -0,0 +1,52 @@
+#
+# Peteris Krumins (peter@catonmat.net), 2007.09.08
+# http://www.catonmat.net - good coders code, great reuse
+#
+# Scraper pattern file for picurls.com website
+#
+# The format of the file is the following:
+# [url:|title:|desc:] regex pattern
+#
+# url:, title: and desc: are optional. They specify whether the entry
+# on a website should be matched against its url, title or description.
+#
+# If none of url:, title: or desc: is specified, the pattern is matched
+# against the title and the description.
+#
+
+# match picture urls (links ending in .jpg, .gif or .png)
+#
+url: \.jpg$
+url: \.gif$
+url: \.png$
+
+# match common patterns describing posts having pictures in them:
+# a keyword anywhere between an opening [ or ( and a closing ] or )
+#
+[[(].*picture.*[])]
+[[(].*pic.*[])]
+[[(].*image.*[])]
+[[(].*photo.*[])]
+[[(].*comic.*[])]
+[[(].*chart.*[])]
+[[(].*graph.*[])]
+
+# match phrases announcing pictures, such as "photos of" or "these pics"
+#
+photos? of
+pics? of
+images? of
+pictures? of
+comics? of
+charts? of
+graphs? of
+graphics? of
+(this|these|those) photos?
+(this|these|those) pics?
+(this|these|those) images?
+photosets? (on|of)
+
+# match domains containing just pics
+url: xkcd\.com
+url: flickr\.com
+url: photobucket\.com
+url: imageshack\.us
+url: bestpicever\.com
+
Oops, something went wrong.

0 comments on commit b667ee3

Please sign in to comment.