initial digpicz import

commit 467231d5030787a154fc7259540042b8c87b2ea2 0 parents
@pkrumins authored
Showing with 6,938 additions and 0 deletions.
  1. +33 −0 bin/readme.txt
  2. +16 −0 compiled.entries/readme.txt
  3. +16 −0 compiled.pages/readme.txt
  4. BIN  db/media.db
  5. +23 −0 db/readme.txt
  6. +17 −0 locks/readme.txt
  7. +46 −0 readme.txt
  8. +175 −0 scripts/ImageFinder.pm
  9. +339 −0 scripts/NetPbm.pm
  10. +349 −0 scripts/ThumbExtractor.pm
  11. +206 −0 scripts/ThumbMaker.pm
  12. +146 −0 scripts/db_inserter.pl
  13. +228 −0 scripts/digg_extractor.pl
  14. +961 −0 scripts/page_gen.pl
  15. +302 −0 scripts/reddit_extractor.pl
  16. +61 −0 scripts/test_insert_entries.pl
  17. +35 −0 templates/footer.html
  18. +42 −0 templates/header.html
  19. +15 −0 templates/index.html
  20. +25 −0 templates/index_entry.html
  21. +9 −0 templates/link.html
  22. +25 −0 templates/link_entry.html
  23. +5 −0 templates/navigation.html
  24. +38 −0 templates/sidebar.html
  25. +17 −0 tmp.www/readme.txt
  26. BIN  www/allpics.gif
  27. BIN  www/bookmark.gif
  28. BIN  www/catonmat.gif
  29. BIN  www/comment.gif
  30. BIN  www/email.gif
  31. BIN  www/favicon.ico
  32. BIN  www/icons/delicious.gif
  33. BIN  www/icons/feed.gif
  34. BIN  www/icons/local.gif
  35. +33 −0 www/icons/master/bin/readme.txt
  36. +16 −0 www/icons/master/compiled.entries/readme.txt
  37. +16 −0 www/icons/master/compiled.pages/readme.txt
  38. BIN  www/icons/master/db/media.db
  39. +23 −0 www/icons/master/db/readme.txt
  40. +17 −0 www/icons/master/locks/readme.txt
  41. +46 −0 www/icons/master/readme.txt
  42. +175 −0 www/icons/master/scripts/ImageFinder.pm
  43. +339 −0 www/icons/master/scripts/NetPbm.pm
  44. +349 −0 www/icons/master/scripts/ThumbExtractor.pm
  45. +206 −0 www/icons/master/scripts/ThumbMaker.pm
  46. +146 −0 www/icons/master/scripts/db_inserter.pl
  47. +228 −0 www/icons/master/scripts/digg_extractor.pl
  48. +961 −0 www/icons/master/scripts/page_gen.pl
  49. +302 −0 www/icons/master/scripts/reddit_extractor.pl
  50. +61 −0 www/icons/master/scripts/test_insert_entries.pl
  51. +35 −0 www/icons/master/templates/footer.html
  52. +42 −0 www/icons/master/templates/header.html
  53. +15 −0 www/icons/master/templates/index.html
  54. +25 −0 www/icons/master/templates/index_entry.html
  55. +9 −0 www/icons/master/templates/link.html
  56. +25 −0 www/icons/master/templates/link_entry.html
  57. +5 −0 www/icons/master/templates/navigation.html
  58. +38 −0 www/icons/master/templates/sidebar.html
  59. +17 −0 www/icons/master/tmp.www/readme.txt
  60. BIN  www/icons/master/www/allpics.gif
  61. BIN  www/icons/master/www/bookmark.gif
  62. BIN  www/icons/master/www/catonmat.gif
  63. BIN  www/icons/master/www/comment.gif
  64. BIN  www/icons/master/www/email.gif
  65. BIN  www/icons/master/www/favicon.ico
  66. BIN  www/icons/master/www/icons/delicious.gif
  67. BIN  www/icons/master/www/icons/feed.gif
  68. BIN  www/icons/master/www/icons/local.gif
  69. BIN  www/icons/master/www/icons/picture-big.gif
  70. BIN  www/icons/master/www/icons/picture.gif
  71. BIN  www/icons/master/www/icons/pictures-big.gif
  72. BIN  www/icons/master/www/icons/pictures.gif
  73. BIN  www/icons/master/www/icons/video-big.gif
  74. BIN  www/icons/master/www/icons/video.gif
  75. BIN  www/icons/master/www/icons/videos-big.gif
  76. BIN  www/icons/master/www/icons/videos.gif
  77. +17 −0 www/icons/master/www/image.cache/readme.txt
  78. BIN  www/icons/master/www/link-line.gif
  79. BIN  www/icons/master/www/logo-cat.gif
  80. BIN  www/icons/master/www/logo.gif
  81. +20 −0 www/icons/master/www/readme.txt
  82. +303 −0 www/icons/master/www/style.css
  83. BIN  www/icons/master/www/subscribe.gif
  84. BIN  www/icons/master/www/time.gif
  85. BIN  www/icons/picture-big.gif
  86. BIN  www/icons/picture.gif
  87. BIN  www/icons/pictures-big.gif
  88. BIN  www/icons/pictures.gif
  89. BIN  www/icons/video-big.gif
  90. BIN  www/icons/video.gif
  91. BIN  www/icons/videos-big.gif
  92. BIN  www/icons/videos.gif
  93. +17 −0 www/image.cache/readme.txt
  94. BIN  www/link-line.gif
  95. BIN  www/logo-cat.gif
  96. BIN  www/logo.gif
  97. +20 −0 www/readme.txt
  98. +303 −0 www/style.css
  99. BIN  www/subscribe.gif
  100. BIN  www/time.gif
33 bin/readme.txt
@@ -0,0 +1,33 @@
+(c) Peteris Krumins (peter@catonmat.net), 2007.
+http://www.catonmat.net - good coders code, great reuse
+
+digg's missing picture section website generator:
+http://digpicz.com
+-----------------------------------------------------------------
+/bin
+
+This directory should contain the following executables from
+Netpbm tools:
+
+pamfile
+ppmmake
+pamflip
+pnmcat
+pamscale
+jpegtopnm
+giftopnm
+pngtopnm
+pnmtojpeg
+pamcut
+
+These tools are used by the /scripts/NetPbm.pm module, which in
+turn is used by the /scripts/ThumbMaker.pm module, which creates
+small thumbnails for media links on the digpicz website.
+
+Compile these tools yourself.
+
+They can be downloaded at:
+http://netpbm.sourceforge.net/
+
+-----------------------------------------------------------------
+http://www.catonmat.net/blog/designing-digg-picture-website
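
For illustration, here is a minimal sketch of how these binaries are driven
through the /scripts/NetPbm.pm wrapper included in this commit (the bin path
and the image file name are placeholders):

    use NetPbm;

    # point the wrapper at the directory holding the compiled Netpbm binaries
    my $netpbm = NetPbm->new(netpbm => '/path/to/digpicz/bin');

    # convert an image to PNM, then ask pamfile for its dimensions
    my $pnm = $netpbm->img2pnm('photo.jpg');
    my %info = $netpbm->get_img_info($pnm);
    print "$info{width} x $info{height}\n" unless $netpbm->is_error;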
16 compiled.entries/readme.txt
@@ -0,0 +1,16 @@
+(c) Peteris Krumins (peter@catonmat.net), 2007.
+http://www.catonmat.net - good coders code, great reuse
+
+digg's missing picture section website generator:
+http://digpicz.com
+-----------------------------------------------------------------
+/compiled.entries
+
+This directory should contain the cached (compiled) HTML
+templates for each digg link.
+
+The entries get cached to minimize page regeneration.
+
+-----------------------------------------------------------------
+http://www.catonmat.net/blog/designing-digg-picture-website
+
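
page_gen.pl (later in this commit) names these cached entries $id-$type.html
and shards them into numbered subdirectories holding ITEMS_PER_CACHE_DIR
entries each; a sketch of the path computation, with illustrative values:

    # entry 1832 of type 'pictures', with ITEMS_PER_CACHE_DIR = 1000
    my ($id, $type) = (1832, 'pictures');
    my $subdir = int($id / 1000) * 1000;  # 1000
    my $path = "compiled.entries/$subdir/$id-$type.html";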
16 compiled.pages/readme.txt
@@ -0,0 +1,16 @@
+(c) Peteris Krumins (peter@catonmat.net), 2007.
+http://www.catonmat.net - good coders code, great reuse
+
+digg's missing picture section website generator:
+http://digpicz.com
+-----------------------------------------------------------------
+/compiled.pages
+
+This directory should contain the cached (compiled) HTML
+index pages, which once generated never change.
+
+The pages get cached to minimize page regeneration.
+
+-----------------------------------------------------------------
+http://www.catonmat.net/blog/designing-digg-picture-website
+
BIN  db/media.db
Binary file not shown
23 db/readme.txt
@@ -0,0 +1,23 @@
+(c) Peteris Krumins (peter@catonmat.net), 2007.
+http://www.catonmat.net - good coders code, great reuse
+
+digg's missing picture section website generator
+http://digpicz.com
+-----------------------------------------------------------------
+/db
+
+This directory contains the sqlite database with digg's stories
+linking to pictures.
+
+I use SQLite Database Browser GUI tool for quickly viewing and
+editing the database.
+
+It is available at:
+http://sqlitebrowser.sourceforge.net/
+
+I included a database with 12 entries so you can get an idea
+of how it looked.
+
+-----------------------------------------------------------------
+http://www.catonmat.net/blog/designing-digg-picture-website
+
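
The database can also be inspected from Perl; a minimal sketch against the
digg table that scripts/db_inserter.pl creates:

    use DBI;

    my $dbh = DBI->connect("dbi:SQLite:db/media.db", '', '', { RaiseError => 1 });
    my $rows = $dbh->selectall_arrayref(
        "SELECT date_added, title FROM digg ORDER BY id DESC"
    );
    print "$_->[0]  $_->[1]\n" for @$rows;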
17 locks/readme.txt
@@ -0,0 +1,17 @@
+(c) Peteris Krumins (peter@catonmat.net), 2007.
+http://www.catonmat.net - good coders code, great reuse
+
+digg's missing picture section website generator:
+http://digpicz.com
+-----------------------------------------------------------------
+/locks
+
+This directory contains exclusive locks created by the scripts to
+make sure two copies of the same script do not get executed at
+the same time.
+
+Actually the only script creating a lock file is page_gen.pl ;)
+
+-----------------------------------------------------------------
+http://www.catonmat.net/blog/designing-digg-picture-website
+
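
The pattern page_gen.pl uses for this (see its lock_script subroutine) is a
non-blocking exclusive lock via File::Flock:

    use File::Flock;

    # quit if another copy of the script already holds the lock
    unless (lock('locks/page_gen.lock', undef, 'nonblocking')) {
        print "Script already running. Quitting.\n";
        exit 1;
    }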
46 readme.txt
@@ -0,0 +1,46 @@
+This is the digpicz.com website that I created back in 2007. It got massive
+attention at the time because Digg didn't have a picture section yet.
+
+One day I had to close it because I got an email from Digg's lawyers saying
+that I was abusing Digg's trademarks.
+
+The site was completely static and was generated by a bunch of Perl scripts.
+
+Please visit http://digg.picurls.com/ to see how it looked. I saved a
+screenshot for history.
+
+Also see these two articles on how it was created and how it received 100,000
+visitors on the first day (it got massively popular!):
+
+ http://www.catonmat.net/blog/designing-digg-picture-website/
+ http://www.catonmat.net/blog/few-words-about-digpicz-dot-com/
+
+Digpicz.com was created by Peteris Krumins (peter@catonmat.net).
+His blog is at http://www.catonmat.net - good coders code, great reuse
+
+------------------------------------------------------------------------------
+
+/
+
+This is a complete package which generates digpicz.com website.
+
+Each directory except /templates and /scripts contains a readme
+with a basic description of what the directory is for.
+
+/templates directory contains HTML template fragments which
+Perl's Template::Toolkit puts together to create pages seen on
+http://digpicz.com
+
+/scripts directory contains all the Perl scripts and modules
+which together do all the generation and data mining.
+
+See the article on how it was designed for much more detailed information:
+
+ http://www.catonmat.net/blog/designing-digg-picture-website/
+
+------------------------------------------------------------------------------
+
+Sincerely,
+Peteris Krumins
+http://www.catonmat.net
+
175 scripts/ImageFinder.pm
@@ -0,0 +1,175 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+package ImageFinder;
+
+#
+# This package was written as a part of "reddit media: intelligent fun online"
+# website generator.
+# This website can be viewed here: http://redditmedia.com
+#
+# See http://www.catonmat.net/designing-reddit-media-website for more info.
+#
+
+use warnings;
+use strict;
+
+#
+# This module finds the "best" image on a web page.
+# Since this package was written for purpose of redditmedia.com website,
+# the "best" means a picture which is most likely to be posted on the site
+# for others to enjoy.
+#
+# People enjoy big pictures, so this package finds the image on a webpage which
+# has the biggest area (width * height).
+#
+
+use File::Temp 'mktemp';
+use LWP::UserAgent;
+use HTML::TreeBuilder;
+use URI;
+
+use NetPbm;
+
+sub new {
+ my $this = shift;
+ my $class = ref($this) || $this;
+ my %args = @_;
+
+ my $self = {};
+ $self->{netpbm} = NetPbm->new(netpbm => $args{'netpbm'});
+ $self->{ua} = LWP::UserAgent->new(
+ agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4',
+ timeout => 5
+ );
+
+ bless $self, $class;
+}
+
+#
+# find_best_image
+#
+# Given a URL address to a website, the function gets all the images on the page
+# and figures out which one is the best.
+#
+sub find_best_image {
+ my ($self, $url) = @_;
+
+ my $content = $self->_get_page($url);
+ return undef unless defined $content;
+
+ my $tree = HTML::TreeBuilder->new;
+ $tree->parse($content);
+
+ # find all img tags
+ my @imgs = $tree->look_down(_tag => 'img');
+ unless (@imgs) {
+ $tree->delete;
+ return undef;
+ }
+
+ # download all images
+ my @downloaded_images;
+ foreach my $img (@imgs) {
+ my $src = $img->attr('src'); # could be relative path, fix
+ next unless $src;
+
+ my $abs_src = URI->new_abs($src, $url)->as_string;
+
+ my $tmp_file = $self->_get_temp_file;
+ my $resp = $self->{ua}->get($abs_src, ":content_file" => $tmp_file);
+ next unless $resp->is_success;
+
+ unless (-s $tmp_file) { # skip empty files
+ unlink $tmp_file;
+ next;
+ }
+ push @downloaded_images, $tmp_file;
+ }
+
+ return undef unless @downloaded_images; # huh, no images?
+
+ return $self->_biggest_image(@downloaded_images);
+}
+
+#
+# _biggest_image
+#
+# Given a list of images, finds the biggest image (width * height maximum)
+#
+sub _biggest_image {
+ my ($self, @images) = @_;
+
+ my $netpbm = $self->{netpbm};
+ # convert all images to PNM format
+ my @pnms;
+ foreach (@images) {
+ my $pnm_file = $netpbm->img2pnm($_);
+ unlink $_;
+ if ($netpbm->is_error) {
+ print STDERR $netpbm->get_error, "\n";
+ $netpbm->clear_error;
+ next;
+ }
+ push @pnms, $pnm_file;
+ }
+
+ my @img_infos;
+ foreach (@pnms) {
+ my %info = $netpbm->get_img_info($_);
+ if ($netpbm->is_error) {
+ print STDERR $netpbm->get_error, "\n";
+ $netpbm->clear_error;
+ next;
+ }
+ push @img_infos, {
+ info => \%info,
+ path => $_
+ };
+ }
+
+ my @sorted_by_area = sort {
+ $b->{info}{width} * $b->{info}{height} <=> $a->{info}{width} * $a->{info}{height}
+ } @img_infos;
+
+ unlink $_->{path} foreach @sorted_by_area[1..$#sorted_by_area];
+
+ return $sorted_by_area[0]->{path};
+}
+
+sub _get_page {
+ my ($self, $url) = @_;
+ my $resp = $self->{ua}->get($url);
+
+ if ($resp->is_success) {
+ return $resp->content;
+ }
+ return undef;
+}
+
+#
+# _get_temp_file
+#
+# Creates and returns the path to a new temporary file
+#
+sub _get_temp_file {
+ return mktemp("/tmp/imageIFXXXXXXXX");
+}
+
+1;
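
A minimal usage sketch for this module (the netpbm path and the URL are
placeholders):

    use ImageFinder;

    my $finder = ImageFinder->new(netpbm => '/path/to/digpicz/bin');

    # returns the path to a temporary PNM file holding the biggest image,
    # or undef if the page had no usable images
    my $biggest = $finder->find_best_image('http://example.com/some-page');
    print "biggest image: $biggest\n" if defined $biggest;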
339 scripts/NetPbm.pm
@@ -0,0 +1,339 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+package NetPbm;
+
+#
+# This package was written as a part of "reddit media: intelligent fun online"
+# website generator.
+# This website can be viewed here: http://redditmedia.com
+#
+# See http://www.catonmat.net/designing-reddit-media-website for more info.
+#
+
+use warnings;
+use strict;
+
+#
+# Package for manipulating images.
+#
+
+use File::Temp 'mktemp';
+use File::MMagic;
+
+sub new {
+ my ($this, %args) = @_;
+ my $class = ref($this) || $this;
+
+ my $self = { conf => \%args };
+ bless $self, $class;
+}
+
+#
+# get_img_info
+#
+# Given a path to an image, returns a hash with its primitive info
+# (width and height as keys)
+# On error, returns an empty hash.
+#
+sub get_img_info {
+ my ($self, $img) = @_;
+
+ my $info_line = $self->_run_netpbm_and_slurp("pamfile", $img);
+ if ($self->is_error) {
+ $self->set_error("Failed getting image info for '$img': " . $self->get_error);
+ return;
+ }
+
+ if ($info_line =~ /(\d+) by (\d+)/) {
+ return (width => $1, height => $2);
+ }
+
+ return;
+}
+
+#
+# _get_file_mime
+#
+# Given a path to a file, returns its type in MIME format based on magic
+#
+sub _get_file_mime {
+ my ($self, $file_path) = @_;
+
+ my $mm = File::MMagic->new;
+ my $type = $mm->checktype_filename($file_path);
+
+ return $type;
+}
+
+#
+# get_img_type
+#
+# Given a path to an image, returns its type based on magic
+#
+sub get_img_type {
+ my ($self, $img_path) = @_;
+
+ my $type = $self->_get_file_mime($img_path);
+ if ($type =~ m{image/(.+)}) { # image, yumm, ok!
+ if ($1 =~ /x-portable/) { # portable pbm image
+ return "pnm"
+ }
+ return $1;
+ }
+
+ return "unknown";
+}
+
+#
+# border_img
+#
+# Given a path to an image, adds a border with $border_size width and $color
+#
+sub border_img {
+ my ($self, $img, $border_size, $color) = @_;
+
+ # taken from pnmmargin bash script
+ #
+ # ppmmake $color $size 1 > $tmp2
+ # pamflip -rotate90 $tmp2 > $tmp3
+ # pnmcat -lr $tmp2 $tmp1 $tmp2 > $tmp4
+ # pnmcat -tb $tmp3 $tmp4 $tmp3
+ #
+
+ my $tmp2 = $self->_get_temp_file;
+ my @ppmmake = ("'$color'", $border_size, 1);
+ $self->_run_netpbm_and_redirect("ppmmake", @ppmmake, $tmp2);
+ if ($self->is_error) {
+ $self->set_error("Failed adding a border to '$img': " . $self->get_error);
+ return undef;
+ }
+
+ my $tmp3 = $self->_get_temp_file;
+ my @pamflip = ("-rotate90", $tmp2);
+ $self->_run_netpbm_and_redirect("pamflip", @pamflip, $tmp3);
+ if ($self->is_error) {
+ $self->set_error("Failed adding a border to '$img': " . $self->get_error);
+ return undef;
+ }
+
+ my $tmp4 = $self->_get_temp_file;
+ my @pamcat = ("-lr", $tmp2, $img, $tmp2);
+ $self->_run_netpbm_and_redirect("pnmcat", @pamcat, $tmp4);
+ if ($self->is_error) {
+ $self->set_error("Failed adding a border to '$img': " . $self->get_error);
+ return undef;
+ }
+
+ my $bordered_img = $self->_get_temp_file;
+ @pamcat = ("-tb", $tmp3, $tmp4, $tmp3);
+ $self->_run_netpbm_and_redirect("pnmcat", @pamcat, $bordered_img);
+ if ($self->is_error) {
+ $self->set_error("Failed adding a border to '$img': " . $self->get_error);
+ return undef;
+ }
+
+ unlink $tmp2, $tmp3, $tmp4;
+
+ return $bordered_img;
+}
+
+#
+# resize_img
+#
+# Given a path to an image, its new width and height, resizes the image.
+# Returns a path to a temporary file where the resized image was stored.
+# On error, returns undef.
+#
+sub resize_img {
+ my ($self, $img, $w, $h) = @_;
+
+ my $resized_img = $self->_get_temp_file();
+ my @resize_args;
+ push @resize_args, "-xsize", $w if $w;
+ push @resize_args, "-ysize", $h if $h;
+ push @resize_args, $img;
+
+ $self->_run_netpbm_and_redirect("pamscale", @resize_args, $resized_img);
+ if ($self->is_error) {
+ $self->set_error("Failed resizing '$img': " . $self->get_error);
+ return undef;
+ }
+
+ return $resized_img;
+}
+
+#
+# img2pnm
+#
+# Given a path to an image of almost any type, converts it to a PNM format
+# and stores it in a temporary file.
+#
+# Returns path to the temporary file, or undef on error
+#
+sub img2pnm {
+ my ($self, $infile) = @_;
+
+ my $img_type = $self->get_img_type($infile);
+ if ($img_type eq "pnm") { # already pnm!
+ return $infile;
+ }
+
+ if ($img_type eq "unknown") {
+ my $mime = $self->_get_file_mime($infile);
+ $self->set_error("Error: can't convert '$mime' ($infile) to pnm!");
+ }
+
+ my $tmp_pnm_file = $self->_get_temp_file();
+ my $program = "${img_type}topnm";
+ $self->_run_netpbm_and_redirect($program, $infile, $tmp_pnm_file);
+
+ return $tmp_pnm_file;
+}
+
+#
+# pnm2jpg
+#
+# Given a path to a pnm image and a path to an output image, converts the
+# input image to jpeg format.
+# Returns the output path, or undef on failure.
+#
+sub pnm2jpg {
+ my $self = shift;
+ my ($pnmfile, $outfile) = @_;
+
+ $self->_run_netpbm_and_redirect("pnmtojpeg", $pnmfile, $outfile);
+ if ($self->is_error) {
+ $self->{error} = "Failed converting pnm to jpeg ('$pnmfile' to '$outfile'): " . $self->get_error;
+ return undef;
+ }
+
+ return $outfile;
+}
+
+
+#
+# cut_img
+#
+# Given a path to an image, cuts $w x $h rectangle out of it with top left corner
+# at ($x, $y)
+# Returns a path to a temporary file where the cut image was stored.
+# On error, returns undef.
+#
+sub cut_img {
+ my ($self, $img, $x, $y, $w, $h) = @_;
+
+ my $cut_img = $self->_get_temp_file();
+ my @cut_args = (
+ "-left", $x,
+ "-top", $y,
+ "-width", $w,
+ "-height", $h,
+ $img
+ );
+
+ $self->_run_netpbm_and_redirect("pamcut", @cut_args, $cut_img);
+ if ($self->is_error) {
+ $self->{error} = "Failed cutting '$img': " . $self->get_error;
+ return undef;
+ }
+
+ return $cut_img;
+}
+
+#
+# _run_netpbm_and_redirect
+#
+# Executes a netpbm program with the given arguments and redirects
+# the output to a file
+#
+# =========
+# WARNING: non portable, redirects STDERR to /dev/null
+# =========
+#
+sub _run_netpbm_and_redirect {
+ my ($self, $program, @args) = @_;
+ my $redir = pop @args;
+
+ my $path_to_program = $self->{conf}->{netpbm} . '/' . $program;
+ my $full_command = "$path_to_program @args 2>/dev/null > $redir";
+ my $ret = system($full_command);
+
+ unless ($ret == 0) { # system() failed
+ unlink $redir;
+ $self->{error} = "system($full_command) failed: $!";
+ return 0;
+ }
+ return 1;
+}
+
+#
+# _run_netpbm_and_slurp
+#
+# Executes a netpbm program with the given arguments and returns the first line
+# if run in scalar context or all the lines if run in array context
+#
+sub _run_netpbm_and_slurp {
+ my ($self, $program, @args) = @_;
+
+ my $path_to_program = $self->{conf}->{netpbm} . '/' . $program;
+ my $full_command = "$path_to_program @args";
+ my $ret = open my $in, '-|', $full_command;
+ unless (defined $ret) {
+ $self->{error} = "open('-|', $full_command) failed: $!";
+ return;
+ }
+
+ my $first_line = <$in>;
+ return wantarray ? ($first_line, <$in>) : $first_line;
+}
+
+
+#
+# _get_temp_file
+#
+# Creates and returns the path to a new temporary file
+#
+sub _get_temp_file {
+ return mktemp("/tmp/imageNPXXXXXXXX");
+}
+
+sub is_error {
+ my $self = shift;
+ return 1 if exists $self->{error};
+ return 0;
+}
+
+sub get_error {
+ my $self = shift;
+ return $self->{error};
+}
+
+sub clear_error {
+ my $self = shift;
+ delete $self->{error};
+}
+
+sub set_error {
+ my $self = shift;
+ my $error = shift;
+ $self->{error} = $error;
+}
+
+1;
349 scripts/ThumbExtractor.pm
@@ -0,0 +1,349 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+use warnings;
+use strict;
+
+package ThumbExtractor;
+
+#
+# This package was written as a part of "digpicz: digg's missing picture section"
+# website generator.
+# This website can be viewed here: http://digpicz.com
+#
+# See http://www.catonmat.net/designing-digg-picture-website for more info.
+#
+
+#
+# This package extracts thumbnail images for a given URL to a video or picture.
+#
+
+use LWP::UserAgent;
+use HTML::Entities;
+use HTML::TreeBuilder;
+use File::MMagic;
+use File::Temp 'mktemp';
+use URI;
+
+#
+# Here are handlers for various video and image sites.
+# There is no other way to extract a thumbnail from a video site than to
+# analyze how the site displays the thumbnail itself.
+#
+# For video sites I wrote find_best_image in the ImageFinder package, which
+# finds the best image on the site.
+#
+# It is a very expensive function (it requires fetching all the images,
+# converting them to pnm format, calculating their areas, etc).
+#
+# For the most popular sites (from top 10) I wrote handlers manually.
+#
+my @thumb_handlers = (
+ 'youtube.com' => \&_youtube_handler,
+ 'video.google.com' => \&_video_google_handler,
+ 'flickr.com' => \&_flickr_handler,
+ 'metacafe.com' => \&_metacafe_handler,
+ 'liveleak.com' => \&_liveleak_handler,
+ 'xkcd.com' => \&_xkcd_handler,
+ 'bestpicever.com' => \&_bestpicever_handler,
+ 'blogger.com' => \&_blogger_handler
+);
+
+sub new {
+ my $this = shift;
+ my $class = ref($this) || $this;
+
+ my $self = {};
+ $self->{ua} = LWP::UserAgent->new(
+ agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4',
+ timeout => 5
+ );
+
+ bless $self, $class;
+}
+
+sub get_thumbnail {
+ my ($self, $url) = @_;
+
+ my $host = $self->_get_host($url);
+ return undef if $host eq "unknown";
+
+ # find a handler for a host
+ for my $handler_idx (grep { $_ % 2 == 0 } 0 .. $#thumb_handlers) {
+ if ($host =~ /$thumb_handlers[$handler_idx]/) {
+ my $thumb = $thumb_handlers[$handler_idx + 1]->($url, $self->{ua});
+
+ return $thumb;
+ }
+ }
+
+ # there was no handler, try matching extensions
+ my @img_rxes = qw|jpg$ jpeg$ gif$ png$|;
+ my $rx = join '|', @img_rxes;
+
+ if ($url =~ /$rx/i) {
+ # some sites have URLs ending with an image extension that are really
+ # HTML pages. Let's check this.
+
+ # read just the first KB of the image and make sure we are not getting
+ # gzipped content
+ #
+
+ # File::MMagic is broken, it didn't work this way.
+
+# my $data;
+# my $cb_sub = sub {
+# $data .= shift;
+# my $length = do { use bytes; length($data) };
+#
+# if ($length >= 2024) {
+# die "got a KB of data";
+# }
+# };
+# my $response = $self->{ua}->get($url, 'Accept-Encoding' => undef,
+# ':content_cb' => $cb_sub);
+
+ my $tmp_file = $self->_get_temp_file();
+ my $response = $self->{ua}->get($url, ':content_file' => $tmp_file);
+ my $mm = new File::MMagic;
+ my $res = $mm->checktype_filename($tmp_file);
+ unlink $tmp_file;
+
+ if ($res =~ /image/) { # image, yumm, ok!
+ return ThumbExtractor::Thumb->new($url, 0);
+ }
+ }
+
+ return undef; # unknown url or not an image
+}
+
+sub _get_page {
+ my ($ua, $url) = @_;
+ my $resp = $ua->get($url);
+
+ if ($resp->is_success) {
+ return $resp->content;
+ }
+ return undef;
+}
+
+sub _get_temp_file {
+ return mktemp("/tmp/imageTEXXXXXXXX");
+}
+
+
+#
+# I use regexes for extracting because parsing each tree would be much slower and would
+# take me 5 times longer to write the code and I can't see any reason to do it.
+# I want the site to be running asap ;)
+#
+
+sub _youtube_handler {
+ my $url = shift;
+
+ # http://www.youtube.com/watch?v=qSNcVjpX-9Q&NR=1
+ if ($url =~ /v=([A-Za-z0-9-_]+)/) {
+ my $thumb_url = "http://img.youtube.com/vi/$1/1.jpg";
+ return ThumbExtractor::Thumb->new($thumb_url, 1);
+ }
+
+ return undef;
+}
+
+sub _video_google_handler {
+ my ($url, $ua) = @_;
+
+ # google video can either have their own thumbnail or youtube's thumbnail
+ #
+ # <img src="http://video.google.com/ThumbnailServer2?app=vss&amp;contentid=e9201247d01caa03&amp;offsetms=930000&amp;itag=w160&amp;lang=en&amp;sigh=mD_ipxj1B87xnGNNMkRgont7Nb4"
+
+ my $content = _get_page($ua, $url);
+ if (defined $content) {
+ my $thumb_url;
+ if ($content =~ m{(http://video.google.com/ThumbnailServer2.*?")}) {
+ $thumb_url = decode_entities $1;
+ }
+ elsif ($content =~ m{(http://img.youtube.com/vi/(?:[^/]+)/\d.jpg)}) {
+ $thumb_url = $1;
+ }
+ else {
+ return undef;
+ }
+
+ return ThumbExtractor::Thumb->new($thumb_url, 1);
+ }
+ return;
+}
+
+sub _flickr_handler {
+ my ($url, $ua) = @_;
+
+ my $flickr_extract = sub {
+ my $id = shift;
+
+ my $content = _get_page($ua, "http://flickr.com/photo_zoom.gne?id=$id&size=sq");
+ if (defined $content) {
+ if ($content =~ m{<a href="(http://farm\d+[^"]+)">Download}) {
+ return ThumbExtractor::Thumb->new($1, 1);
+ }
+ return;
+ }
+ return;
+ };
+
+ if ($url =~ /static.flickr.com/) {
+ return ThumbExtractor::Thumb->new($url, 0); # not a thumb yet
+ }
+ elsif ($url =~ /id=(\d+)/) {
+ # http://flickr.com/photo_zoom.gne?id=346049991&size=sq
+ return $flickr_extract->($1);
+ }
+ elsif ($url =~ m{/(\d+)/}) {
+ # http://www.flickr.com/photos/kielbryant/118020322/in/set-72057594137096110/
+ return $flickr_extract->($1);
+ }
+
+ return;
+}
+
+sub _metacafe_handler {
+ my $url = shift;
+ if ($url =~ m{metacafe.com/watch/(\d+)}) {
+ return ThumbExtractor::Thumb->new("http://www.metacafe.com/thumb/$1.jpg", 1);
+ }
+ elsif ($url =~ m{metacafe.com/w/(\d+)}) {
+ return ThumbExtractor::Thumb->new("http://www.metacafe.com/thumb/$1.jpg", 1);
+ }
+ return;
+}
+
+
+sub _liveleak_handler {
+ my ($url, $ua) = @_;
+
+ my $content = _get_page($ua, $url);
+ if (defined $content) {
+ if ($content =~ m{<link rel="videothumbnail" href="(.+?)" type="image/jpeg" />}) {
+ return ThumbExtractor::Thumb->new($1, 1);
+ }
+ }
+
+ return;
+}
+
+sub _xkcd_handler {
+ my ($url, $ua) = @_;
+
+ my $content = _get_page($ua, $url);
+ return undef unless defined $content;
+
+ if ($content =~ m{<img src="(http://imgs.xkcd.com/comics/(?:[^"]+))"}) {
+ return ThumbExtractor::Thumb->new($1, 0);
+ }
+
+ return;
+}
+
+sub _bestpicever_handler {
+ my ($url, $ua) = @_;
+
+ my $content = _get_page($ua, $url);
+ return undef unless defined $content;
+
+ my $tree = HTML::TreeBuilder->new;
+ $tree->parse($content);
+
+ my $div_holding_img = $tree->look_down(_tag => 'div', id => 'img-holder-reg');
+ unless (defined $div_holding_img) {
+ $tree->delete;
+ return undef;
+ };
+
+ my $img = $div_holding_img->look_down(_tag => 'img');
+ unless (defined $img) {
+ $tree->delete;
+ return undef;
+ }
+
+ my $img_url = $img->attr('src');
+ $tree->delete;
+ return ThumbExtractor::Thumb->new($img_url, 0);
+}
+
+sub _blogger_handler {
+ my ($url, $ua) = @_;
+
+ my $content = _get_page($ua, $url);
+ return undef unless defined $content;
+
+#<html>
+#<head>
+#<title>gangsta.jpg (image)</title>
+#<script type="text/javascript">
+#<!--
+#if (top.location != self.location) top.location = self.location;
+#// -->
+#</script>
+#</head>
+#<body bgcolor="#ffffff" text="#000000">
+#<img src="http://bp2.blogger.com/_XNXLcHFsW1U/RsTNeXsvs8I/AAAAAAAABMY/EL6FxTMv_F0/s1600/gangsta.jpg" alt="[gangsta.jpg]" border=0>
+#</body>
+#</html>
+
+ if ($content =~ /img src="(.+?)"/) {
+ return ThumbExtractor::Thumb->new($1, 0);
+ }
+
+ return;
+}
+
+sub _get_host {
+ my ($self, $url) = @_;
+ my $uri = URI->new($url);
+ if ($uri->can('host')) {
+ return $uri->host;
+ }
+ return "unknown";
+}
+
+package ThumbExtractor::Thumb;
+
+sub new {
+ my $class = shift;
+ my ($url, $is_thumb) = @_;
+ my $self = {
+ url => $url,
+ is_thumb => $is_thumb
+ };
+
+ bless $self, $class;
+}
+
+sub is_thumb {
+ my $self = shift;
+ return $self->{is_thumb};
+}
+
+sub url {
+ my $self = shift;
+ return $self->{url};
+}
+
+1;
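
A minimal usage sketch (the YouTube URL is the example from the comments
above):

    use ThumbExtractor;

    my $extractor = ThumbExtractor->new;
    my $thumb = $extractor->get_thumbnail('http://www.youtube.com/watch?v=qSNcVjpX-9Q');
    if (defined $thumb) {
        # is_thumb() tells whether the URL already points to a small thumbnail
        # or to a full-size image that ThumbMaker still needs to shrink
        print $thumb->url, $thumb->is_thumb ? " (thumbnail)\n" : " (full image)\n";
    }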
206 scripts/ThumbMaker.pm
@@ -0,0 +1,206 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+package ThumbMaker;
+
+#
+# This package was written as a part of "reddit media: intelligent fun online"
+# website generator.
+# This website can be viewed here: http://redditmedia.com
+#
+# See http://www.catonmat.net/designing-reddit-media-website for more info.
+#
+
+use warnings;
+use strict;
+
+use LWP::UserAgent;
+use File::Temp 'mktemp';
+use HTML::TreeBuilder;
+
+use NetPbm;
+
+sub new {
+ my $this = shift;
+ my $class = ref($this) || $this;
+
+ my %opts = @_;
+
+ my $self = {};
+ $self->{netpbm} = NetPbm->new(netpbm => $opts{netpbm});
+ $self->{ua} = LWP::UserAgent->new(
+ agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4',
+ timeout => 5
+ );
+
+ bless $self, $class;
+}
+
+#
+# Given a path to a local or external image, the function creates
+# a jpeg thumbnail with dimensions $params->{width} by $params->{height}
+# and stores it at $out_path
+#
+sub create_thumbnail {
+ my ($self, $img_path, $out_path, $params) = @_;
+
+ if (-e $img_path) { # if it is a local file
+ return $self->_mkthumb($img_path, $out_path, $params);
+ }
+
+ my $tmp_img = $self->cache($img_path);
+ if (defined $tmp_img) {
+ my $ret = $self->_mkthumb($tmp_img, $out_path, $params);
+ unlink $tmp_img;
+ return $ret;
+ }
+}
+
+sub cache {
+ my $self = shift;
+ my $url = shift; # cache image at $url
+
+ my $temp_file = mktemp("/tmp/imageTMXXXXXXXX");
+ my $resp = $self->{ua}->get($url, ":content_file" => $temp_file);
+ unless ($resp->is_success) {
+ unlink $temp_file;
+ $self->{error} = "Failed getting '$url': " . $resp->status_line;
+ return undef;
+ }
+
+ return $temp_file;
+}
+
+sub _mkthumb {
+ my ($self, $img_path, $out_path, $params) = @_;
+ my $netpbm = $self->{netpbm};
+
+ # convert image to pnm format
+ my $pnm_image = $netpbm->img2pnm($img_path);
+ if ($netpbm->is_error) {
+ $self->{error} = $netpbm->get_error;
+ return 0;
+ }
+
+ # get image info
+ my %img_info = $netpbm->get_img_info($pnm_image);
+ if ($netpbm->is_error) {
+ unlink $pnm_image;
+ $self->{error} = $netpbm->get_error;
+ return 0;
+ }
+
+ my %resize;
+ if ($img_info{width} < $params->{width}) {
+ $resize{w} = $params->{width};
+ }
+ if ($img_info{height} < $params->{height}) {
+ $resize{h} = $params->{height};
+ }
+
+ my $resized_image;
+ unless (keys %resize) {
+ # if the image is bigger than the required dimensions (didn't need resizing)
+
+ # cut out the middle of the image
+# my $middle_x = $img_info{width} / 2;
+# my $middle_y = $img_info{height} / 2;
+#
+# my $x = int ($middle_x - ($params->{width} / 2));
+# my $y = int ($middle_y - ($params->{height} / 2));
+#
+# my $cut_pnm_image = $self->_cut_img($pnm_image, $x, $y, $params->{width}, $params->{height});
+# unlink $pnm_image;
+# if ($self->is_error) {
+# return undef;
+# }
+#
+# $pnm_image = $cut_pnm_image;
+
+ # ^^^^^^^^^^^ that didn't look nice, here is another idea
+ #
+ # this one finds the smallest dimension of image and resizes it to fit the
+ # required dimensions, while keeping proportions of the other dimension
+ #
+ my ($new_width, $new_height);
+
+ if ($img_info{width} < $img_info{height}) {
+ ($new_width, $new_height) = ($params->{width}, 0);
+ }
+ elsif ($img_info{height} < $img_info{width}) {
+ ($new_width, $new_height) = (0, $params->{height});
+ }
+ else { # width == height
+ ($new_width, $new_height) = ($params->{width}, $params->{height});
+ }
+
+ $resized_image = $netpbm->resize_img($pnm_image, $new_width, $new_height);
+ }
+ else {
+ # image is smaller than the required size of the thumbnail, stretch it
+ $resized_image = $netpbm->resize_img($pnm_image, $resize{w} || 0, $resize{h} || 0);
+ }
+
+ if ($netpbm->is_error) {
+ $self->{error} = $netpbm->get_error;
+ return 0;
+ }
+ unlink $pnm_image;
+
+ $pnm_image = $netpbm->cut_img($resized_image, 0, 0, $params->{width}, $params->{height});
+ if ($netpbm->is_error) {
+ $self->{error} = $netpbm->get_error;
+ return 0;
+ }
+ unlink $resized_image;
+
+ # add border
+ if (exists $params->{border}) {
+ my $bordered_image = $netpbm->border_img($pnm_image, $params->{border},
+ $params->{border_color} || '#000');
+
+ unless ($netpbm->is_error) {
+ unlink $pnm_image;
+ $pnm_image = $bordered_image;
+ }
+ }
+
+ $netpbm->pnm2jpg($pnm_image, $out_path);
+ if ($netpbm->is_error) {
+ $self->{error} = $netpbm->get_error;
+ return 0;
+ }
+ unlink $pnm_image;
+
+ return 1;
+}
+
+sub is_error {
+ my $self = shift;
+
+ return 1 if exists $self->{error};
+ return 0;
+}
+
+sub get_error {
+ return shift->{error};
+}
+
+
+1;
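
A minimal usage sketch (paths and dimensions are placeholders):

    use ThumbMaker;

    my $maker = ThumbMaker->new(netpbm => '/path/to/digpicz/bin');
    my $ok = $maker->create_thumbnail(
        'http://example.com/big-picture.jpg',  # local path or URL
        '/tmp/thumb.jpg',                      # output jpeg
        { width => 100, height => 100, border => 1, border_color => '#000' }
    );
    print "thumbnail failed: ", $maker->get_error, "\n" unless $ok;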
146 scripts/db_inserter.pl
@@ -0,0 +1,146 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use warnings;
+use strict;
+
+#
+# this script takes input in format:
+# title: story title
+# type: story type
+# desc: story description
+# url: story url
+# digg_url: url to original story on digg
+# category: digg category of the story
+# short_category: short category name
+# user: username
+# user_pic: url to user pic
+# date: date story appeared on digg YYYY-MM-DD HH:MM:SS
+# and inserts all this info in an sqlite database
+#
+# it is made to work with digg_extractor.pl script but can be fed
+# any input which is in that format
+#
+
+use DBI;
+use POSIX;
+
+#
+# This program was written as a part of "digpicz: digg's missing picture section"
+# website generator.
+# This website can be viewed here: http://digpicz.com
+#
+# See http://www.catonmat.net/designing-digg-picture-website for more info.
+#
+
+binmode(STDIN, ':utf8');
+
+use constant DATABASE_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/db/media.db';
+
+my $dbh = DBI->connect("dbi:SQLite:" . DATABASE_PATH, '', '', { RaiseError => 1 });
+die $DBI::errstr unless $dbh;
+
+create_db_if_not_exists();
+
+# no normalization of database whatsoever, we just want the site to be running.
+# bad, bad, bad!
+my $insert_query =<<EOL;
+INSERT INTO digg (title, desc, type, url, digg_url, digg_category, digg_short_category, user, user_avatar, date_added)
+VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
+EOL
+my $sth = $dbh->prepare($insert_query);
+
+# turn on paragraph slurp mode
+$/ = '';
+while (<>) {
+ next if /^#/; # ignore comments
+ parse_and_insert_db($_);
+}
+
+#
+# if we do not set $sth to undef, we get the following warning:
+#
+# DBI::db=HASH(0x1d287e8)->disconnect invalidates 1 active statement handle
+# (either destroy statement handles or call finish on them before disconnecting)
+# at db_inserter.pl line 65, <> line 4.
+#
+# closing dbh with active statement handles at db_inserter.pl line 65, <> line 4.
+#
+$sth = undef;
+
+$dbh->disconnect;
+
+#
+# parse_and_insert_db
+#
+# Parses and inserts a paragraph into database.
+#
+sub parse_and_insert_db {
+ my $par = shift;
+ my @parts = split '\n', $par;
+ my %story;
+
+ foreach (@parts) {
+ my ($key, $val) = split ': ', $_, 2;
+ $story{$key} = $val;
+ }
+
+ $sth->execute($story{title}, $story{desc}, $story{type}, $story{url}, $story{digg_url},
+ $story{category}, $story{short_category}, $story{user}, $story{user_pic},
+ $story{date});
+}
+
+#
+# create_db_if_not_exists
+#
+# Creates the digg table if it does not exist
+#
+sub create_db_if_not_exists {
+ # Older versions of sqlite 3 do not support the IF NOT EXISTS clause,
+ # so we have to work around it
+ #
+ my $table_exists = 0;
+ my $tables_q = "SELECT name FROM sqlite_master WHERE type='table' AND name='digg'";
+ my $res = $dbh->selectall_arrayref($tables_q);
+
+ if (defined $res and @$res) {
+ $table_exists = 1;
+ }
+
+ unless ($table_exists) {
+
+ my $create_db =<<EOL;
+CREATE TABLE digg (
+ id INTEGER PRIMARY KEY AUTOINCREMENT,
+ title STRING NOT NULL UNIQUE,
+ desc STRING NOT NULL UNIQUE,
+ url STRING NOT NULL UNIQUE,
+ digg_url STRING NOT NULL UNIQUE,
+ digg_category STRING NOT NULL,
+ digg_short_category STRING NOT NULL,
+ user STRING NOT NULL,
+ user_avatar STRING NOT NULL,
+ type STRING NOT NULL,
+ date_added DATE NOT NULL
+)
+EOL
+
+ $dbh->do($create_db);
+ }
+}
+
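
For reference, one input record in the paragraph format this script expects
(all field values below are made-up placeholders); records are separated by
blank lines and lines starting with # are skipped:

    title: Amazing Sunset [PIC]
    type: picture
    desc: A photo of a sunset submitted to digg.
    url: http://example.com/sunset.jpg
    digg_url: http://digg.com/general_sciences/Amazing_Sunset
    category: General Sciences
    short_category: general_sciences
    user: someuser
    user_pic: http://digg.com/users/someuser/l.png
    date: 2007-08-30 20:00:00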
228 scripts/digg_extractor.pl
@@ -0,0 +1,228 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+use warnings;
+use strict;
+
+#
+# This program was written as a part of "digpicz: digg's missing picture section"
+# website generator.
+# This website can be viewed here: http://digpicz.com
+#
+# See http://www.catonmat.net/designing-digg-picture-website for more info.
+#
+
+use LWP;
+use POSIX;
+use XML::Simple;
+use URI; # used below to extract hosts from story links
+
+binmode(STDOUT, ":utf8");
+
+# Each request to the Digg API requires an appkey, which is a valid absolute URI
+# that identifies the application making the request.
+# This constant defines that key.
+# Read more: http://apidoc.digg.com/ApplicationKeys
+#
+use constant DIGG_APPKEY => 'http://digpicz.com';
+
+# This constant defines how many votes a digg post must have received
+# to be included in the results. Use 0 to include all posts.
+#
+#use constant VOTE_THRESHOLD => 60;
+
+use constant ITEMS_PER_REQUEST => 15;
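+
+# With these constants, get_posts() below builds request URLs like:
+# http://services.digg.com/stories/popular?appkey=http://digpicz.com&offset=0&count=15
+# (the offset advances by ITEMS_PER_REQUEST on each subsequent request)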
+
+# These regex patterns match common picture titles on Digg.
+# It's an array to maintain the order of plural regexes vs. singular regexes.
+#
+my @extract_patterns = (
+ # pattern type
+ "[[(].*pictures.*[])]" => 'pictures',
+ "[[(].*picture.*[])]" => 'picture',
+ "[[(].*pics.*[])]" => 'pictures',
+ "[[(].*pic.*[])]" => 'picture',
+ "[[(].*images.*[])]" => 'pictures',
+ "[[(].*image.*[])]" => 'picture',
+ "[[(].*photos.*[])]" => 'pictures',
+ "[[(].*photo.*[])]" => 'picture',
+ "[[(].*comics.*[])]" => 'pictures',
+ "[[(].*comic.*[])]" => 'picture',
+ "[[(].*charts.*[])]" => 'pictures',
+ "[[(].*chart.*[])]" => 'picture',
+);
+
+# These regex patterns match domains which usually contain only images
+# and videos.
+my @extract_domains = (
+ 'photobucket.com' => 'picture',
+ 'photo.livevideo.com' => 'picture',
+ 'flickr.com' => 'picture',
+ 'xkcd.com' => 'picture'
+);
+
+my $ua = LWP::UserAgent->new(
+ agent => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) Gecko/20070515 Firefox/2.0.0.4'
+);
+
+my $reqs_to_get = shift || 'all'; # number of requests to get (ITEMS_PER_REQUEST per request)
+
+my $xml_posts = get_posts($ua, 1);
+my $posts = parse_posts($xml_posts);
+
+extract_and_print($posts->{story});
+$reqs_to_get-- if $reqs_to_get =~ /\d+/;
+
+my $offset = 2;
+if ($reqs_to_get eq 'all') {
+ do {
+ $xml_posts = get_posts($ua, $offset++);
+ $posts = parse_posts($xml_posts);
+ extract_and_print($posts->{story});
+ } while (exists $posts->{story} and @{$posts->{story}});
+}
+else {
+ while ($reqs_to_get--) {
+ # note: it doesn't matter that we duplicate code, this program is so small
+ # that I typed it in a matter of minutes
+ $xml_posts = get_posts($ua, $offset++);
+ $posts = parse_posts($xml_posts);
+ extract_and_print($posts->{story});
+
+ exit 0 unless exists $posts->{story} and @{$posts->{story}};
+ }
+}
+
+#
+# extract_and_print
+#
+# Given a hashref data structure of posts, finds the posts matching
+# @extract_patterns and @extract_domains and prints them out
+#
+sub extract_and_print {
+ my $posts = shift;
+
+ my @to_print;
+ POST:
+ foreach my $post (@$posts) { # naive algorithm, we don't care about complexity
+ foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_patterns) {
+ # foreach extract pattern
+ if ($post->{title} =~ /$extract_patterns[$idx]/i ||
+ $post->{description} =~ /$extract_patterns[$idx]/i)
+ {
+ push @to_print, {
+ entry => $post,
+ type => $extract_patterns[$idx+1]
+ };
+ next POST;
+ }
+ }
+ foreach my $idx (grep { $_ % 2 == 0 } 0..$#extract_domains) {
+ my $uri = URI->new($post->{link});
+ my $host;
+ next unless $uri->can('host');
+ $host = $uri->host;
+ if ($host =~ /$extract_domains[$idx]/i) {
+ push @to_print, {
+ entry => $post,
+ type => $extract_domains[$idx+1]
+ };
+ next POST;
+ }
+ }
+ }
+
+ print_entries(\@to_print);
+}
+
+#
+# print_entries
+#
+# Given an arrayref of entries, prints them one by one in our desired format.
+# The format is:
+# title: story title
+# type: story type
+# desc: story description
+# url: story url
+# digg_url: url to original story on digg
+# category: digg category of the story
+# short_category: short category name
+# user: username
+# user_pic: url to user pic
+# date: date story appeared on digg YYYY-MM-DD HH:MM:SS
+# <new line>
+#
+sub print_entries {
+ my $entries = shift;
+ foreach (@$entries) {
+ print "title: $_->{entry}->{title}\n";
+ print "type: $_->{type}\n";
+ print "desc: $_->{entry}->{description}\n";
+ print "url: $_->{entry}->{link}\n";
+ print "digg_url: $_->{entry}->{href}\n";
+ print "category: $_->{entry}->{topic}->{name}\n";
+ print "short_category: $_->{entry}->{topic}->{short_name}\n";
+ print "user: $_->{entry}->{user}->{name}\n";
+ print "user_pic: $_->{entry}->{user}->{icon}\n";
+ print "date: " . strftime("%Y-%m-%d %H:%M:%S", localtime $_->{entry}->{promote_date}) . "\n";
+ print "\n";
+ }
+}
+
+#
+# parse_posts
+#
+# Given XML posts, returns a hashref data structure with them
+#
+sub parse_posts {
+ my $xml = shift;
+ return XMLin($xml, KeyAttr => [], ForceArray => ['story']);
+}
+
+#
+# get_posts
+#
+# Gets front page ITEMS_PER_REQUEST posts at (offset - 1) * ITEMS_PER_REQUEST
+#
+sub get_posts {
+ my ($ua, $offset) = @_;
+
+ my $service_url = "http://services.digg.com/stories/popular";
+ $service_url .= "?appkey=" . DIGG_APPKEY;
+ $service_url .= "&offset=" . ($offset - 1) * ITEMS_PER_REQUEST;
+ $service_url .= "&count=" . ITEMS_PER_REQUEST;
+
+ return get_page($ua, $service_url);
+}
+
+#
+# get_page
+#
+# Given a URL, the subroutine returns the content of the resource at that URL.
+# die()s if getting the URL fails
+#
+sub get_page {
+ my ($ua, $url) = @_;
+
+ my $response = $ua->get($url);
+ unless ($response->is_success) {
+ die "Failed getting $url: ", $response->status_line;
+ }
+
+ return $response->content;
+}
+
961 scripts/page_gen.pl
@@ -0,0 +1,961 @@
+#!/usr/bin/perl
+#
+# Copyright (C) 2007 Peteris Krumins (peter@catonmat.net)
+# http://www.catonmat.net - good coders code, great reuse
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+use warnings;
+use strict;
+
+#
+# This program was written as a part of "digpicz: digg's missing picture section"
+# website generator.
+# This website can be viewed here: http://digpicz.com
+#
+# See http://www.catonmat.net/designing-digg-picture-website for more info.
+#
+
+use Template; # for generating html pages from templates
+use DBI;
+use XML::RSS;
+use POSIX;
+use HTML::Entities;
+use URI;
+use URI::Escape;
+use File::Basename;
+use File::Find;
+use File::Copy;
+use File::Flock;
+use List::Util 'max';
+use open OUT => ':utf8';
+
+use ThumbExtractor;
+use ThumbMaker;
+use ImageFinder;
+
+# Print various debugging information to stderr
+#
+use constant DEBUG => 1;
+
+# Path to the lockfile used to make sure only one copy of this script runs at a time
+#
+use constant LOCK_FILE_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/locks/page_gen.lock';
+
+# Number of items to display per page. Note that the first page can stretch to
+# 2*ITEMS_PER_PAGE - 1 items (read about the page generating algorithm in the
+# generate_pages subroutine).
+#
+use constant ITEMS_PER_PAGE => 15;
+
+# Number of items which appear in the feed
+#
+use constant ITEMS_PER_FEED => 15;
+
+# Path to the webserver's root directory, from which it will serve static pages.
+#
+use constant WWW_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/www';
+
+# Path to html templates which will be used to make static pages.
+#
+use constant TEMPLATE_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/templates';
+
+# Temporary path for outputting compiled pages; after the pages have been
+# generated they will be atomically rename()d to WWW_PATH.
+#
+use constant OUTPUT_TMP_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/tmp.www';
+
+# Path to sqlite database which stores entries and some information about last script run.
+#
+use constant DATABASE_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/db/media.db';
+
+# Path to dir where the compiled (cached) entries will be stored.
+#
+use constant COMPILED_ENTRIES_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/compiled.entries';
+
+# Path to already generated pages which once generated never change (except navigation).
+# Read about algorithm in generate_pages subroutine's comments
+#
+use constant COMPILED_PAGES_PATH => '/mnt/evms/services/apache/wwwroot/digpicz/compiled.pages';
+
+# Path to single entry link directory. That is where static pages of
+# /link/<entry type>/<first char of entry title>/<entry title>.html are stored.
+#
+use constant LINK_DIR => '/mnt/evms/services/apache/wwwroot/digpicz/www/link'; # + /pictures or /videos
+
+# To prevent filesystem bottlenecks, entry cache will be stored in a number of subdirs
+# of COMPILED_ENTRIES_PATH. Based on entry ID, the compiled version will be stored in
+# COMPILED_ENTRIES_PATH/(integer part of(id/ITEMS_PER_CACHE_DIR) * ITEMS_PER_CACHE_DIR).
+# For example if ITEMS_PER_CACHE_DIR is 1000, then entry with id 25 will be stored
+# in COMPILED_ENTRIES_PATH/0 dir, entry with id 1832 in COMPILED_ENTRIES_PATH/1000, etc.
+#
+# The same happens for cached image thumbnails in IMAGE_CACHE_PATH
+#
+use constant ITEMS_PER_CACHE_DIR => 1000;
+
+# Most sites do not provide thumbnails; in this case we retrieve the picture and
+# cache it locally in IMAGE_CACHE_PATH
+#
+use constant IMAGE_CACHE_PATH => "/mnt/evms/services/apache/wwwroot/digpicz/www/image.cache";
+
+# The relative path to WWW when a cached icon is accessed from web server
+#
+use constant IMAGE_RELATIVE_WWW => "/image.cache";
+
+# To generate thumbnails, the NetPbm.pm module needs the netpbm executables.
+# This constant defines the path to them
+#
+use constant NETPBM_PATH => "/home/pkrumins/tmpinstall/netpbm-10.26.44/foobarbaz/bin";
+
+lock_script();
+
+my $regenerate = shift || 0; # if regenerate is set, all the pages will be regenerated!
+ # useful if html templates have changed
+
+clear_cache() if $regenerate;
+
+my $dbh = DBI->connect("dbi:SQLite:" . DATABASE_PATH, '', '', { RaiseError => 1 });
+die $DBI::errstr unless $dbh;
+
+try_create_status_db();
+
+my %new_entries = get_new_entries('main'); # get all new entries
+exit 0 unless keys %new_entries; # exit if no new entries
+
+my $template = Template->new({
+ INCLUDE_PATH => TEMPLATE_PATH,
+ OUTPUT_PATH => OUTPUT_TMP_PATH,
+ ABSOLUTE => 1
+});
+
+# Get top users and top hosts
+my @top_users = get_top_users();
+my @top_hosts = get_top_hosts();
+
+#
+# First let's generate the cache of the entries; they will be put in the
+# COMPILED_ENTRIES_PATH directory and named $id-$type.html, where $id is the
+# primary key of the entry in the database and
+# $type is type of link (picture, pictures, video, videos, etc.).
+# They will still contain [% entry.pos %] template variable which should be replaced
+# with the correct position in the page (1, 2, 3, ..., etc).
+#
+# Also after each cache entry has been generated, create the
+# /<media type>/<first title alnum char>/title.html page to have something indexed in google
+#
+foreach my $entry_id (keys %new_entries) {
+ generate_entry_cache($new_entries{$entry_id});
+ generate_link($new_entries{$entry_id});
+}
+
+generate_pages('main', \%new_entries);
+
+update_rss_feed();
+
+# Now, do the atomic rename() of index pages
+#
+my @new_indexes = glob(OUTPUT_TMP_PATH . "/*.html");
+rename $_ => WWW_PATH . '/' . basename($_) foreach @new_indexes;
+
+update_status_db();
+
+#
+# lock_script
+#
+# Exclusively locks a file, so that only one copy of the script is running at
+# any given moment
+#
+sub lock_script {
+ my $ret = lock(LOCK_FILE_PATH, undef, 'nonblocking');
+ unless ($ret) {
+ print "Script already running. Quitting.\n";
+ exit 1;
+ }
+}
+
+#
+# clear_cache
+#
+# Function deletes all *.html files in OUTPUT_TMP_PATH and
+# COMPILED_{ENTRIES,PAGES}_PATH directories
+#
+sub clear_cache {
+ unlink glob(OUTPUT_TMP_PATH . "/*.html");
+ unlink glob(COMPILED_PAGES_PATH . "/*.html");
+
+ my @entry_sub_dirs = grep -d, glob(COMPILED_ENTRIES_PATH . "/*");
+ foreach (@entry_sub_dirs) {
+ unlink glob($_ . "/*.html");
+ }
+
+}
+
+#
+# update_rss_feed
+#
+# Function takes last ITEMS_PER_FEED entries from the database and
+# generates the RSS feed for the media
+#
+# TODO: separate this out into a template
+#
+sub update_rss_feed {
+ my $rss = XML::RSS->new(version => '2.0');
+ $rss->channel(
+ title => "digpicz: digg's missing picture section",
+ link => "http://digpicz.com",
+ description => "the unofficial digg's picture section",
+ language => "en",
+ copyright => "digpicz.com (c) Peteris Krumins, 2007",
+ webMaster => 'peter@catonmat.net',
+ managingEditor => 'peter@catonmat.net',
+ pubDate => "2007-08-30 20:00",
+ lastBuildDate => strftime("%Y-%m-%d %H:%M:%S", localtime),
+ generator => "digpicz.com static page generator"
+ );
+
+ $rss->image(
+ title => "digpicz: digg's missing picture section",
+ url => "http://digpicz.com/logo.gif",
+ link => "http://digpicz.com",
+ width => 120,
+ height => 45,
+ description => "the unofficial digg's picture section"
+ );
+
+ my $last_entries_query = "SELECT * FROM digg ORDER BY id DESC LIMIT " . ITEMS_PER_FEED;
+ my $last_entries = $dbh->selectall_hashref($last_entries_query, ['id']);
+
+ foreach my $id (sort { $b <=> $a } keys %$last_entries) {
+ $rss->add_item(
+ title => $last_entries->{$id}->{title},
+ permaLink => $last_entries->{$id}->{url},
+ comments => $last_entries->{$id}->{digg_url},
+ pubDate => $last_entries->{$id}->{date_added},
+ category => $last_entries->{$id}->{type},
+ dc => {
+ creator => "digg.com"
+ }
+ );
+ }
+
+ $rss->save(WWW_PATH . "/feed.html");
+}
+
+#
+# try_create_status_db
+#
+# Creates a status db if it does not exist
+#
+sub try_create_status_db {
+ my $table_exists = 0;
+ my $tables_q = "SELECT name FROM sqlite_master WHERE type='table' AND name='digg_status'";
+ my $res = $dbh->selectall_arrayref($tables_q);
+
+ if (defined $res and @$res) {
+ $table_exists = 1;
+ }
+
+ unless ($table_exists) {
+ my $create_db =<<EOL;
+CREATE TABLE digg_status (
+ last_id INTEGER NOT NULL,
+ last_run DATE NOT NULL
+)
+EOL
+ $dbh->do($create_db);
+ }
+}
+
+#
+# update_status_db
+#
+# Updates status information about the last run and the last generated id.
+#
+sub update_status_db {
+ my $has_records = "SELECT * FROM digg_status";
+ my $records = $dbh->selectall_arrayref($has_records);
+
+ my $last_run = strftime("%Y-%m-%d %H:%M:%S", localtime);
+ my $max_id = max keys %new_entries;
+ if (defined $records and @$records) {
+ # Update the status table
+ #
+ $dbh->do("UPDATE digg_status SET last_id = '$max_id'");
+ $dbh->do("UPDATE digg_status SET last_run = '$last_run'");
+ }
+ else {
+ # Insert new status
+ #
+ $dbh->do("INSERT INTO digg_status (last_id, last_run) VALUES ('$max_id', '$last_run')");
+ }
+}
+
+#
+# generate_pages
+#
+# Given a hashref of new entries and the page type, the function generates
+# pages of given type and outputs them to OUTPUT_TMP_PATH directory.
+#
+# Function uses the generated cache entries.
+#
+sub generate_pages {
+ my ($page_type, $new_entries) = @_;
+
+ my @compiled_entries = get_compiled_entries($page_type);
+ my @index_pages = get_index_pages($page_type);
+
+ #
+ # I want to regenerate pages as little as possible, to keep things running quick.
+ # Here is the algorithm which splits the entries into pages and makes them
+ # never change. Only the first page changes at any time.
+ #
+ # Let T be the total number of entries and IPP the items per page to display.
+ # The first page will have at most 2 * IPP - 1 entries: given T entries, it
+ # is first filled with IPP entries and then with the T % IPP extra. IPP is
+ # now a divisor of the remaining number of entries, T - (IPP + T % IPP).
+ #
+ # Now we just have to update the main page and offset other pages
+ #
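+ #
+ # Worked example (assuming ITEMS_PER_PAGE were 15, purely for illustration):
+ # with T = 47 entries, 47 % 15 = 2 extra entries, so the first page gets
+ # 15 + 2 = 17 entries, and the remaining 30 entries split evenly into two
+ # full pages of 15 that never have to be regenerated.
+ #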
+
+ my ($total_entries, $total_pages) = (scalar @compiled_entries, scalar @index_pages);
+ my $extra_entries = $total_entries % ITEMS_PER_PAGE; # number of extra entries on first page
+ my $first_page_entries = ITEMS_PER_PAGE + $extra_entries;
+
+ if ($first_page_entries > $total_entries) {
+ $first_page_entries = $total_entries;
+ }
+
+ # Generate first page (index.html or index-pictures.html, etc).
+ my @gen_entries;
+ for my $entry_idx (0 .. $first_page_entries - 1) {
+ push @gen_entries, {
+ file => $compiled_entries[$entry_idx],
+ pos => $total_entries - $entry_idx
+ }
+ }
+ generate_page($page_type, 1, $total_entries, \@gen_entries);
+
+ if ($total_pages <= 1 || $regenerate) {
+ # no existing index pages for this page_type
+ # generate all pages!
+ my $current_page = 2;
+ my $current_item = 1;
+ @gen_entries = ();
+ for my $entry_idx ($first_page_entries .. $#compiled_entries) {
+ push @gen_entries, {
+ file => $compiled_entries[$entry_idx],
+ pos => $total_entries - $entry_idx
+ };
+
+ if ($current_item % ITEMS_PER_PAGE == 0) {
+ generate_page($page_type, $current_page, $total_entries, \@gen_entries);
+ @gen_entries = ();
+ $current_page++;
+ }
+ $current_item++;
+ }
+ }
+ else {
+ # Generate only the new pages.
+ # We determine how many new pages will be created and shift the existing
+ # page numbers by that offset, so the existing pages never get regenerated.
+ #
+ my $pages_required = ($total_entries - $first_page_entries)/ITEMS_PER_PAGE; # pages required to fit the remaining entries
+ my $page_offset = ($pages_required + 1) - $total_pages; # +1 because of the first page (plain index.html)
+
+ if ($page_offset) {
+ my $total_new_entries = keys %$new_entries;
+ my $entry_offset = ITEMS_PER_PAGE + $extra_entries;
+
+# print "tot ent: $total_entries\n";
+# print "1st p ent: $first_page_entries\n";
+# print "total pages: $total_pages\n";
+# print "pages req: $pages_required\n";
+# print "page offset: $page_offset\n";
+# print "tot new en: $total_new_entries\n",
+# print "ent offset: $entry_offset\n";
+
+ # copy the other pages to new page numbers (moved later back to WWW_PATH)
+ #
+ for my $page_number (2 .. $total_pages) {
+ my $new_page_number = $page_number + $page_offset;
+ my $src = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number);
+ my $dst = get_page_name($page_type, $new_page_number);
+ my $data = {
+ navigation => build_navigation($total_entries, $page_type, $new_page_number),
+ topusers => \@top_users,
+ tophosts => \@top_hosts
+ };
+ $template->process($src, $data, $dst, binmode => ':utf8');
+ }
+
+ # since generate_page will be creating new index pages and their compiled versions
+ # we need to change their indexes
+# for my $page_number (reverse 2 .. $total_pages) {
+# my $src = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number);
+# my $dst = COMPILED_PAGES_PATH . '/' . get_page_name($page_type, $page_number + $page_offset);
+# rename $src => $dst;
+# }
+
+ my $current_page = 2;
+ my $current_item = 1;
+ @gen_entries = ();
+ for my $entry_idx ($entry_offset .. $entry_offset + $pages_required * ITEMS_PER_PAGE - 1) {
+ push @gen_entries, {
+ file => $compiled_entries[$entry_idx],
+ pos => $total_entries - $entry_idx
+ };
+
+ if ($current_item % ITEMS_PER_PAGE == 0) {
+ generate_page($page_type, $current_page, $total_entries, \@gen_entries);
+ @gen_entries = ();
+ $current_page++;
+ }
+ $current_item++;
+ }
+ }
+ }
+}
+
+#
+# generate_page
+#
+# Given a page type, page number and entries, the function generates a static
+# HTML page and puts it in the OUTPUT_TMP_PATH directory
+#
+sub generate_page {
+ my ($page_type, $page_number, $total_entries, $entries) = @_;
+
+ my $outpage = get_page_name($page_type, $page_number);
+ my $data = {
+ last_update => strftime("%Y-%m-%d %H:%M:%S", localtime),
+ navigation => build_navigation($total_entries, $page_type, $page_number),
+ entries => $entries,
+ page_type => $page_type,
+ topusers => \@top_users,
+ tophosts => \@top_hosts,
+ };
+ $template->process('index.html', $data, $outpage, binmode => ':utf8');
+
+ # create a compiled version with a literal '[% navigation %]' placeholder left
+ # in, so the page can later be re-processed with fresh navigation when pages move
+ $data->{navigation} = '[% navigation %]';
+ $data->{topusers_tpl} = 1; # include top user template
+ $data->{tophosts_tpl} = 1; # include top host template
+ my $output;
+ $template->process('index.html', $data, \$output);
+
+ my $file_path = COMPILED_PAGES_PATH . "/$outpage";
+ open my $out, '>', $file_path or die "Error: could not open '$file_path': $!";
+ print $out $output;
+ close $out;
+}
+
+#
+# get_page_name
+#
+# Given page type and page number, generates an index page filename
+#
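+# For example, get_page_name('main', 2) returns 'index-2.html' and
+# get_page_name('pictures', 3) returns 'index-pictures-3.html'.
+#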
+sub get_page_name {
+ my ($page_type, $page_number) = @_;
+
+ my $outpage;
+ if ($page_type eq 'main') {
+ $outpage = $page_number == 1 ? "index.html" : "index-$page_number.html";
+ }
+ else {
+ $outpage = $page_number == 1 ? "index-$page_type.html" : "index-$page_type-$page_number.html";
+ }
+ return $outpage;
+}
+
+#
+# build_navigation
+#
+# Given the total number of entries, the function builds the navigation HTML
+# for a given page type (main, pictures or videos)
+#
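+# For example, with 47 entries and ITEMS_PER_PAGE assumed to be 15, it links
+# int(47 / 15) = 3 pages: index.html, index-2.html and index-3.html
+# (for the 'main' page type).
+#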
+sub build_navigation {
+ my ($total_entries, $type, $current) = @_;
+ $current ||= -1;
+ my $pages = int $total_entries / ITEMS_PER_PAGE;
+
+ my @navarr;
+ for my $page (1 .. $pages) {
+ # build page names
+ #
+ my $page_name;
+ if ($type eq "main") {
+ $page_name = "index";
+ }
+ else {
+ $page_name = "index-$type";
+ }
+
+ unless ($page == 1) {
+ $page_name .= "-$page";
+ }
+ $page_name .= ".html";
+
+ my $nav = {
+ href => $page_name,
+ page => $page,
+ current => $current
+ };
+ push @navarr, $nav;
+ }
+
+ my $output = '';
+ $template->process('navigation.html', { navs => \@navarr }, \$output);
+
+ return $output;
+}
+
+#
+# generate_link
+#
+# Given a digg story, the function generates a <LINK_DIR>/(pictures|videos)/<first title letter>/<sane-title>.html page
+#
+sub generate_link {
+ my $entry = shift;
+
+ my $entry_data = {
+ icon => get_icon($entry),
+ title => encode_entities($entry->{title}),
+ description => encode_entities($entry->{desc}),
+ title_uri_esc => uri_escape($entry->{title}),
+ sane_title => sanitize_title($entry->{title}),
+ host => get_host($entry->{url}),
+ link_dir => get_link_dir($entry->{type}),
+ user => encode_entities($entry->{user}),
+ user_avatar => $entry->{user_avatar},
+ url => $entry->{url},
+ digg_url => $entry->{digg_url},
+ digg_category => $entry->{digg_category},
+ digg_short_category => $entry->{digg_short_category},
+ url_uri_esc => uri_escape($entry->{url}),
+ date_added => $entry->{date_added},
+ };
+
+ my %link_data = (
+ last_update => strftime("%Y-%m-%d %H:%M:%S", localtime),
+ title => encode_entities($entry->{title}),
+ topusers => \@top_users,
+ tophosts => \@top_hosts
+ );
+
+ my $output;
+ $template->process('link.html', { entry => $entry_data, %link_data }, \$output);
+
+ # build path to link file
+ my $link_path = LINK_DIR;
+ $link_path .= "/pictures" if $entry->{type} =~ /picture/;
+ $link_path .= "/videos" if $entry->{type} =~ /video/;
+
+ unless (-d $link_path) {
+ mkdir $link_path or die "Error: could not create '$link_path': $!";
+ }
+
+ $link_path .= '/' . substr($entry_data->{sane_title}, 0, 1);
+ unless (-d $link_path) {
+ mkdir $link_path or die "Error: could not create '$link_path': $!";
+ }
+
+ $link_path .= "/$entry_data->{sane_title}.html";
+
+ open my $out, '>', $link_path or die "Error: could not open '$link_path': $!";
+ print $out $output;
+ close $out;
+}
+
+#
+# generate_entry_cache
+#
+# The function takes a digg story and generates an entry cache file.
+#
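+# Cache files are named '<id>-<type>.html' and live in a per-id-range
+# subdirectory (see get_cache_subdir).
+#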
+sub generate_entry_cache {
+ my $entry = shift;
+
+ return if !$regenerate and -e COMPILED_ENTRIES_PATH . '/' . get_cache_subdir($entry->{id}) . "/$entry->{id}-$entry->{type}.html"; # check the subdir the cache file is actually written to
+
+ my $entry_data = {
+ icon => get_icon($entry),
+ title => encode_entities($entry->{title}),
+ description => encode_entities($entry->{desc}),
+ title_uri_esc => uri_escape($entry->{title}),
+ sane_title => sanitize_title($entry->{title}),
+ host => get_host($entry->{url}),
+ link_dir => get_link_dir($entry->{type}),
+ user => encode_entities($entry->{user}),
+ user_avatar => $entry->{user_avatar},
+ url => $entry->{url},
+ digg_url => $entry->{digg_url},
+ digg_category => $entry->{digg_category},
+ digg_short_category => $entry->{digg_short_category},
+ url_uri_esc => uri_escape($entry->{url}),
+ date_added => $entry->{date_added},
+ };
+ $entry_data->{title_first_char} = substr($entry_data->{sane_title}, 0, 1);
+
+ my $output = '';
+ $template->process('index_entry.html', { entry => $entry_data } , \$output);
+
+ my $entry_dir = COMPILED_ENTRIES_PATH . '/' . get_cache_subdir($entry->{id});
+ unless (-d $entry_dir) {
+ mkdir $entry_dir or die "Error: could not create '$entry_dir': $!";
+ }
+
+ my $file_path = "$entry_dir/$entry->{id}-$entry->{type}.html";
+ open my $out, '>', $file_path or die "Error: could not open '$file_path': $!";
+ print $out $output;
+ close $out;
+}
+
+#
+# get_cache_subdir
+#
+# Calculates the cache subdir; see the comments at the ITEMS_PER_CACHE_DIR constant
+#
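+# For example, if ITEMS_PER_CACHE_DIR were 1000 (an illustrative value), ids
+# 0..999 would map to subdir '0' and ids 1000..1999 to subdir '1000'.
+#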
+sub get_cache_subdir {
+ my $id = shift;
+ return (int $id / ITEMS_PER_CACHE_DIR) * ITEMS_PER_CACHE_DIR;
+}
+
+#
+# get_compiled_entries
+#
+# Given page_type, the function returns a list of compiled (cached) entries for a given type
+#
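+# Entry cache files are named '<id>-<type>.html' (e.g. '123-picture.html'),
+# which is why 'int basename(...)' below numifies the leading id for sorting.
+#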
+sub get_compiled_entries {
+ my $page_type = shift;
+
+ my $entry_search_glob;
+ if ($page_type eq 'main') {
+ $entry_search_glob = "/*.html";
+ }
+ elsif ($page_type eq "pictures") {
+ $entry_search_glob = "/*{picture,pictures}.html";
+ }
+ elsif ($page_type eq "videos") {
+ $entry_search_glob = "/*{video,videos}.html";
+ }
+
+ my @entries;
+ my @entry_sub_dirs = grep -d, glob(COMPILED_ENTRIES_PATH . "/*");
+ foreach (@entry_sub_dirs) {
+ my @subentries = glob($_ . $entry_search_glob);
+ push @entries, @subentries;
+ }
+
+ # sort compiled entries numerically by id, then reverse so the list begins with the newest entries
+ my @compiled_entries = reverse sort {;
+ no warnings 'numeric';
+ int basename($a) <=> int basename($b)
+ } @entries;
+
+ return @compiled_entries;
+}
+
+#
+# get_index_pages
+#
+# Given page_type, the function returns a list of existing index*.html pages
+#
+sub get_index_pages {
+ my $page_type = shift;
+
+ my @index_pages;
+ if ($page_type eq 'main') {
+ # can't use a glob on main page because an 'index*.html' glob
+ # would match picture and video index pages as well
+ find(sub {
+ push @index_pages, $File::Find::name if $_ =~ /index(-\d+)?\.html$/
+ }, WWW_PATH
+ );
+ }
+ else {
+ my $page_search_glob = "/index-$page_type*.html";
+ @index_pages = glob(WWW_PATH . $page_search_glob);
+ }
+
+ my @sorted_index_pages = sort {
+ my $rx = qr/(\d+)\.html$/;
+ my ($an) = $a =~ /$rx/;
+ my ($bn) = $b =~ /$rx/;
+ return 1 unless defined $bn and defined $an; # take care of default pages like 'index.html'
+ $an <=> $bn;
+ } @index_pages;
+
+ return @sorted_index_pages;
+}
+
+#
+# get_link_dir
+#
+# Given an entry's type, returns the link directory.
+#
+sub get_link_dir {
+ my $type = shift;
+
+ return "videos" if ($type =~ /video/);
+ return "pictures" if ($type =~ /picture/);
+ die "unknown entry type: $type";
+}
+
+#
+# get_icon
+#
+# Given an entry, the function gets a thumbnail (icon) for the entry.
+# For example, for YouTube videos it gets the thumbnail from YouTube's servers;
+# for some blogspot page it tries to find the best image in the content,
+# download it, make a thumbnail, and cache it locally.
+#
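+# The fallback order is: a site-specific thumbnail handler, then the best
+# image found on the page itself, then the default icon for the entry type.
+#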
+sub get_icon {
+ my $entry = shift;
+
+ my $sane_title = sanitize_title($entry->{title});
+ my $cached_icon_path = IMAGE_CACHE_PATH . '/' . get_cache_subdir($entry->{id});
+ unless (-d $cached_icon_path) {
+ mkdir $cached_icon_path or die "Error: could not create '$cached_icon_path': $!";
+ }
+ $cached_icon_path .= "/$entry->{id}-$sane_title.jpg";
+
+ my $rel_www_icon_path = IMAGE_RELATIVE_WWW . '/' . get_cache_subdir($entry->{id});
+ unless (-d $rel_www_icon_path) {
+ mkdir $rel_www_icon_path or die "Error: could not create '$rel_www_icon_path': $!";
+ }
+ $rel_www_icon_path .= "/$entry->{id}-$sane_title.jpg";
+
+ return $rel_www_icon_path if -e $cached_icon_path; # return cached icon
+
+ my $thex = ThumbExtractor->new;
+ my $thumb = $thex->get_thumbnail($entry->{url});
+
+ unless (defined $thumb) { # no thumb was found
+ if ($entry->{type} =~ /video/) {
+ # each video site requires a custom written handler for extracting thumbnails
+ # if there was none, display default icon
+ print STDERR "Couldn't extract thumbnail for video site at '$entry->{url}'\n" if DEBUG;
+ return get_default_icon($entry->{type});
+ }
+
+ # let's find the best image on the page
+ my $image_finder = ImageFinder->new(netpbm => NETPBM_PATH);
+ my $best_img = $image_finder->find_best_image($entry->{url});
+
+ unless ($best_img) { # no best image, hmm.
+ print STDERR "No best image was found at '$entry->{url}'\n" if DEBUG;
+ return get_default_icon($entry->{type});
+ }
+
+ # create a thumbnail for this image
+ my $thumb_maker = ThumbMaker->new(netpbm => NETPBM_PATH);
+ my $success = $thumb_maker->create_thumbnail($best_img, $cached_icon_path,
+ { width => 76, height => 76 });
+
+ unlink $best_img;
+ unless ($success) {
+ print STDERR $thumb_maker->get_error, "\n" if DEBUG;
+ return get_default_icon($entry->{type});
+ }
+
+ return $rel_www_icon_path;
+ }
+
+ if ($thumb->is_thumb) { # a real thumbnail
+ return $thumb->url;
+ }
+ else { # just an image
+ my $thumb_maker = ThumbMaker->new(netpbm => NETPBM_PATH);
+ my $success = $thumb_maker->create_thumbnail($thumb->url, $cached_icon_path,
+ { width => 76, height => 76 });
+
+ unless ($success) {
+ print STDERR $thumb_maker->get_error, "\n" if DEBUG;
+ return get_default_icon($entry->{type});
+ }
+ }
+ return $rel_www_icon_path;
+}
+
+#
+# get_default_icon
+#
+sub get_default_icon {
+ my $type = shift;
+
+ return "/icons/$type-big.gif";
+}
+
+#
+# sanitize_title
+#
+# given the title of a digg story, the function sanitizes it:
+# removes [ ]'s, ( )'s, etc., then replaces all non-alphanumeric chars with '-'
+#
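+# For example, sanitize_title("Cool [PIC] (must see!)") returns 'cool-pic-must-see'.
+#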
+sub sanitize_title {
+ my $title = lc shift;
+
+ $title =~ s{\[|\]|\(|\)|'}{}g;
+ $title =~ s/[^[:alnum:]]/-/g;
+
+ # get rid of multiple -'s
+ $title =~ s/-{2,}/-/g;
+
+ # get rid of leading and trailing -'s
+ $title =~ s/^-+|-+$//g;
+
+ if (length $title > 100) {
+ $title = substr($title, 0, 100);
+ $title =~ s/-*$//g; # there might now be one - at the end again
+ $title =~ s/-[[:alnum:]]*$//g;
+ }
+
+ return $title;
+}
+
+#
+# get_host
+#
+# given a URL, the function returns the host portion of it
+#
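+# For example, get_host("http://example.com/page.html") returns 'example.com';
+# URIs with no host part (such as mailto: links) return 'unknown'.
+#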
+sub get_host {
+ my $url = shift;
+
+ my $uri = URI->new($url);
+ if ($uri->can('host')) {
+ return $uri->host;
+ }
+ return "unknown";
+}
+
+#
+# get_top_users
+#
+# Subroutine returns an array of hashrefs of the top 10 users.
+# Each hashref has the keys 'user', 'avatar' and 'total_posts'.
+#
+sub get_top_users {
+ my $top_users_query =<<EOL;
+SELECT user, user_avatar, COUNT(user) AS posts
+FROM digg
+GROUP BY user
+ORDER BY posts DESC
+LIMIT 10
+EOL
+ my $top_users = $dbh->selectall_arrayref($top_users_query);
+ my @ret;
+ foreach (@$top_users) {
+ push @ret, {
+ user => $_->[0],
+ avatar => $_->[1],
+ total_posts => $_->[2]
+ }
+ }
+ return @ret;
+}
+
+#
+# get_top_hosts
+#
+# Subroutine returns an array of hashrefs of the top 10 domains.
+# Each hashref has two keys, 'host' and 'posts'.
+#
+sub get_top_hosts {
+ my $urls_query = "SELECT url FROM digg";
+ my $urls = $dbh->selectall_arrayref($urls_query);
+
+ my %hosts;
+ foreach (@$urls) {