lib/Parrot/Docs/POD2HTML.pm

# Copyright (C) 2004-2012, Parrot Foundation.

=head1 NAME

Parrot::Docs::POD2HTML - POD to HTML formatter

=head1 SYNOPSIS

    use Parrot::Docs::POD2HTML;

=head1 DESCRIPTION

C<Parrot::Docs::POD2HTML> subclasses C<Pod::Simple::HTML> to provide
various Parrot-specific formatting features, such as linking
C<CE<lt>Parrot::I<Module>E<gt>> and C<FE<lt>I<file>E<gt>>, and using the
Parrot house-style provided by C<Parrot::HTMLPage>.

=head2 Instance Methods

=over

=cut

package Parrot::Docs::POD2HTML;

use strict;
use warnings;

use base qw( Pod::Simple::HTML );

# This is just here to keep Pod::Simple::HTML's version_tag_comment() happy.
our $VERSION = '1.0';

use Parrot::Docs::HTMLPage;
use Parrot::Distribution;
use Parrot::BuildUtil;
use Parrot::Config;

=item C<new()>

Extend C<Pod::Simple::HTML> method to accept PIR and PASM sections that
contain example code, which will be put into a <pre> HTML element.

=cut

sub new {
    my $new = shift->SUPER::new(@_);

    $new->accept_targets(qw(
        PIR PASM PIR_FRAGMENT
        PIR_INVALID PIR_TODO
        PASM_INVALID PASM_TODO
        PIR_FRAGMENT_INVALID
    ));
    delete(@{$new->{'Tagmap'}}{'Data','/Data'});

    return $new;
}

=item C<do_beginning()>

Reimplements the C<Pod::Simple::HTML> method to add a header to the start
of the document.

=cut

sub do_beginning {
    my $self = shift;

    # We have to do this because it has the side effect of setting
    # content_seen.
    $self->get_short_title();

    return unless $self->content_seen;

    # Suppress the warning in the tests.
    $self->{RESOURCES_URL} = '' unless $self->{RESOURCES_URL};
    $self->{NAV_BAR}       = '' unless $self->{NAV_BAR};

    my $title = $self->{'Title'};
    esc($title);

    print { $self->{'output_fh'} }
        Parrot::Docs::HTMLPage->header( $title, $self->{NAV_BAR}, $self->{RESOURCES_URL} );

    $self->version_tag_comment;

    return 1;
}

=item C<do_middle()>

Does the middle of the document. This splits up the long C<do_middle()>
method in C<Pod:Simple::HTML>, calling the various C<process_*> methods
below. This makes it easier to where the custom bits of Parrot-specific
formatting have to be inserted.

=cut

sub do_middle {
    my $self = shift;
    my $token;
    my $type;

    $self->{STACK}     = [];
    $self->{DONT_WRAP} = 0;

    while ( $token = $self->get_token ) {
        $type = $token->type;

        if ( $type eq 'start' ) {
            $self->process_start_token($token);
        }
        elsif ( $type eq 'end' ) {
            $self->process_end_token($token);
        }
        elsif ( $type eq 'text' ) {
            $self->process_text_token($token);
        }
    }

    return 1;
}

=item C<process_start_token($token)>

Process a start token.

=cut

sub process_start_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;

    if ( $tagname eq 'L' ) {
        $self->process_link_start_token($token);
    }
    elsif ( $tagname eq 'F' ) {
        $self->process_file_start_token($token);
    }
    elsif ( $tagname eq 'C' ) {
        $self->process_code_start_token($token);
    }
    elsif ( $tagname eq 'item-text' or $tagname =~ m/^head\d$/s ) {
        $self->process_item_text_or_head_start_token($token);
    }
    elsif ( $tagname eq 'Data' ) {
        $self->process_data_start_token($token);
    }
    elsif ( $tagname eq 'for' ) {
        $self->process_for_start_token($token);
    }
    else {
        $self->process_other_start_token($token);
    }
}

=item C<process_link_start_token($token)>

Processes the link start token.

=cut

sub process_link_start_token {
    my $self  = shift;
    my $token = shift;
    my $text  = $self->do_link($token);

    esc($text);

    if ( defined $text and length $text ) {
        print { $self->{'output_fh'} } "<a href='$text'>";
    }
    else {
        print { $self->{'output_fh'} } "<a>";
    }
}

=item C<process_code_start_token($token)>

Processes the code start token. If the code text is the name of a Parrot
Perl module, and the current documentation file is not the file for that
module, then the documentation file for the module is linked to.

=cut

sub process_code_start_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;

    # We make the code tags in items bold because they are almost
    # always part of function and arguments, or constants listings
    #Êand should stand out.

    print { $self->{'output_fh'} } '<b>' if $self->{IN_ITEM_TEXT};

    print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname};

    my $next = $self->get_token;

    unless ( $next->type eq 'text' ) {
        $self->unget_token($next);
        return;
    }

    my $text = $next->text;

    if ( $text =~ /^Parrot::/ ) {

        my $href = $self->href_for_perl_module($text);

        esc($text);

        if ($href) {
            $text = "<a href='$href'>$text</a>";
        }
    }
    else {

        esc($text);
    }

    print { $self->{'output_fh'} } $text;
}

=item C<process_file_start_token($token)>

Processes the file start token. If the text of the next token is a file
path and that file contains POD, then a link will be made to that file's
documentation file.

=cut

sub process_file_start_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;
    my $next    = $self->get_token;

    if ( $next->type eq 'text' ) {
        my $text = $next->text;
        my $dist = Parrot::Distribution->new;

        # Only link to files that will have HTML pages.

        if (
            $dist->relative_path_is_file($text)

            # A little bit of a hack to avoid config template files.
            and $text !~ /\.in$/ and $dist->file_with_relative_path($text)->contains_pod
            )
        {
            my $path = $self->append_html_suffix($text);
            my $file = $self->{TARGET}->file_with_relative_path($path);

            print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname}
                . $self->html_link( $self->{DOCS_FILE}->parent->relative_path($file), esc($text) );
        }
        else {
            print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname} . esc($text);
        }
    }
    else {
        $self->unget_token($next);
        print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname};
    }
}

=item C<process_item_text_or_head_start_token($token)>

Process the item text or head start token.

=cut

sub process_item_text_or_head_start_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;

    print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname} || return;

    if ( $tagname eq 'item-text' ) {
        $self->{IN_ITEM_TEXT} = 1;
    }

    my @to_unget;

    # Get all the contained tokens, including the end token.

    while (1) {
        push @to_unget, $self->get_token;

        last
            if $to_unget[-1]->is_end
                and $to_unget[-1]->tagname eq $tagname;
    }

    # Convert them into an anchor name;

    my $name = $self->linearize_tokens(@to_unget);

    if ( defined $name ) {
        $name =~ tr/ /_/;

        print { $self->{'output_fh'} } qq(<a name="), esc($name), qq("\n>);

    }
    else {
        print { $self->{'output_fh'} } "<a\n>";
    }

    $self->unget_token(@to_unget);
}

=item C<process_data_start_token($token)>

Processes a data start token.

=cut

sub process_data_start_token {
    my $self  = shift;
    my $token = shift;
    my $next  = $self->get_token;

    return unless defined $next;

    unless ( $next->type eq 'text' ) {
        $self->unget_token($next);
        return;
    }

    if  ($self->{IN_CODE_BLOCK}) {
        print { $self->{'output_fh'} } $next->text;
    }
    else {
        print { $self->{'output_fh'} } "\n" . $next->text . "\n";
    }
}

=item C<process_for_start_token($token)>

Processes a for start token.

=cut

sub process_for_start_token {
    my $self  = shift;
    my $token = shift;
    my $target = $token->attr("target");

    if ($target =~ m/^(PIR|PASM)/) {
        print { $self->{'output_fh'} } '<pre>';
        $self->{IN_CODE_BLOCK} = 1;
    }
}

=item C<process_other_start_token($token)>

Processes a start token not processable by the above methods.

=cut

sub process_other_start_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;

    if ( $tagname =~ m/^over-(.+)$/s ) {
        push @{ $self->{STACK} }, $1;
    }
    elsif ( $tagname eq 'Para' ) {
        $tagname = 'Para_item'
            if @{ $self->{STACK} } and $self->{STACK}->[-1] eq 'text';
    }

    print { $self->{'output_fh'} } $self->{'Tagmap'}{$tagname} || return;

    ++$self->{DONT_WRAP}
        if $tagname eq 'Verbatim'
            or $tagname eq "VerbatimFormatted"
            or $tagname eq 'X';
}

=item C<process_end_token($token)>

Processes an end token.

=cut

sub process_end_token {
    my $self    = shift;
    my $token   = shift;
    my $tagname = $token->tagname;

    if ( $tagname =~ m/^over-/s ) {
        pop @{ $self->{STACK} };
    }
    elsif ( $tagname eq 'Para' ) {
        $tagname = 'Para_item'
            if @{ $self->{STACK} } and $self->{STACK}->[-1] eq 'text';
    }
    elsif ( $tagname eq 'C' ) {

        # See the note in process_code_start_token() above.
        print { $self->{'output_fh'} } '</b>' if $self->{IN_ITEM_TEXT};
    }
    elsif ( $tagname eq 'item-text' ) {
        $self->{IN_ITEM_TEXT} = 0;
    }
    elsif ( $tagname eq 'for' ) {
        print { $self->{'output_fh'} } '</pre>' if $self->{IN_CODE_BLOCK};
        $self->{IN_CODE_BLOCK} = 0;
    }

    print { $self->{'output_fh'} } $self->{'Tagmap'}{"/$tagname"} || return;

    --$self->{DONT_WRAP} if $tagname eq 'Verbatim' or $tagname eq 'X';
}

=item C<process_text_token($token)>

Processes the specified text token. URLs which are not in link tags
are linked here for convenience.

=cut

sub process_text_token {
    my $self  = shift;
    my $token = shift;
    my $text  = $token->text;

    esc($text);
    $text =~ s/([\?\!\"\'\.\,]) /$1\n/g unless $self->{DONT_WRAP};

    # URLs not in L<>.
    $text =~ s|(http://[^\s)]+)|<a href="$1">$1</a>|gs;

    print { $self->{'output_fh'} } $text;
}

=item C<do_pod_link($link)>

This is reimplemented here to avoid a bug in C<Pod::Simple::HTML>.

=cut

sub do_pod_link {
    my $self    = shift;
    my $link    = shift;
    my $to      = $link->attr('to');
    my $section = $link->attr('section');

    return
        unless ( defined $to and length $to )
        or ( defined $section and length $section );

    if ( defined $to and length $to ) {
        $to = $self->resolve_pod_page_link( $to, $section );

        return unless defined $to and length $to;
    }

    if ( defined $section and length( $section .= '' ) ) {
        $section =~ tr/ /_/;
        $section =~ tr/\x00-\x1F\x80-\x9F//d if 'A' eq chr(65);
        $section = $self->unicode_escape_url($section);

        # Turn char 1234 into "(1234)"
        $section = '_' unless length $section;
    }

    foreach my $it ( $to, $section ) {
        next unless defined $it;

        $it =~ s/([^\x00-\xFF])/join '', map sprintf('%%%02X',$_), unpack 'C*', $1/eg;
        $it =~
s/([^\._abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789])/sprintf('%%%02X',ord($1))/eg;

        # Yes, stipulate the list without a range, so that this can work right on
        #  all charsets that this module happens to run under.
        # Altho, hmm, what about that ord?  Presumably that won't work right
        #  under non-ASCII charsets.  Something should be done about that.
    }

    # Pod::Simple::HTML bug was here.
    my $out;

    $out = $to if defined $to and length $to;
    $out .= "#" . $section if defined $section and length $section;

    return unless length $out;

    return $out;
}

=item C<resolve_pod_page_link($to, $section)>

Resolves the POD link. Perl modules are converted to paths.

=cut

sub resolve_pod_page_link {
    my $self    = shift;
    my $to      = shift;
    my $section = shift;

    if ( $to =~ /^Parrot::/ ) {
        my $href = $self->href_for_perl_module($to);

        # This gets corrupted somewhere down the line, with
        # Parrot/PackFile/ConstTable.pm.html being turned into
        # Parrot/PackFile%2FConstTable.pm.html and thus breaking
        # the CSS and images somehow.

        return $href if defined $href;
    }

    return 'TODO';
}

=item C<href_for_perl_module($module)>

Returns the path to the Perl module's HTML file relative to the current
documentation page. Currently only F<lib/Parrot> modules are linkable.
Returns C<undef> if the current documentation file is the file for the
module.

=cut

sub href_for_perl_module {
    my $self   = shift;
    my $module = shift;

    # This is not very obvious, so let me explain. We get the file
    # for the module, then we take its path relative to the
    # distribution, then we append the HTML suffix and get the
    # docs file. Once we have that all we need is the relative
    # path from the current directory to the file and return that
    # as the link.

    my $dist = Parrot::Distribution->new;
    my $file = $dist->file_for_perl_module($module);

    return if not $file or not $file->contains_pod;

    my $path = $self->append_html_suffix( $dist->relative_path($file) );

    # This is the docs file for the module.
    $file = $self->{TARGET}->file_with_relative_path($path);

    # There's no point in linking to the file you are already in.
    return if $file == $self->{DOCS_FILE};

    return $self->href_path( $self->{DOCS_FILE}->parent->relative_path($file) );
}

=item C<do_end()>

Reimplements the C<Pod::Simple::HTML> method to add a footer to the end
of the document.

=cut

sub do_end {
    my $self = shift;

    print { $self->{'output_fh'} } Parrot::Docs::HTMLPage->footer( '', $self->{RESOURCES_URL} );

    return 1;
}

=item C<html_for_file($file)>

Returns the HTML for the specified file.

=cut

sub html_for_file {
    my $self   = shift;
    my $file   = shift;
    my $string = "";

    $self->output_string( \$string );
    $self->parse_file( $file->path );

    return $string;
}

=item C<write_html($source, $target, $rel_path, $item)>

Writes an HTML version of the file specified by C<$rel_path> in
C<$source> to the equivalent location in C<$target>.

=cut

sub write_html {
    my $self     = shift;
    my $source   = shift;
    my $target   = shift;
    my $rel_path = shift;
    my $item     = shift;
    my $file     = $source->file_with_relative_path($rel_path);

    return unless $file->contains_pod;

    $self->{TARGET} = $target;

    # Use our own method for consistency.
    $self->{'Title'} = $file->title;

    $rel_path = $self->append_html_suffix($rel_path);

    my $docs_file = $target->file_with_relative_path($rel_path);

    $self->{DOCS_FILE} = $docs_file;

    $rel_path = $self->href_path( $docs_file->parent->relative_path( $target->parent_path ) );

    my $name = $target->name;

    # This is a cheat because we know that all top-level sections
    # have their indexes in the root directory - but it works.

    $self->{NAV_BAR}       = $item->html_navigation("$rel_path/$name");
    $self->{RESOURCES_URL} = "$rel_path/resources";

    $docs_file->write( $self->html_for_file($file) );

    unless ($self->{TESTING}) {
        my $path = File::Spec->abs2rel(
          File::Spec->catfile($docs_file->{PATH}), $PConfig{build_dir});
        chdir "..";
        add_to_generated($path, "[main]", "html");
        chdir "docs";
    }
}

=item C<append_html_suffix($path)>

Returns C<$path> with and HTML file suffix added.

=cut

sub append_html_suffix {
    my $self = shift;
    my $path = shift;

    return $path . '.html';
}

=item C<html_link($href, $text)>

Returns an HTML anchor with the specified "href".

=cut

sub html_link {
    my $self = shift;
    my $href = $self->href_path(shift);
    my $text = shift || $href;

    return "<a href=\"$href\">$text</a>";
}

=item href_path($path)

Converts the path for use in an "href".

Sequences of backslashes are converted to forward slash.

=cut

sub href_path {
    my $self = shift;
    my $path = shift;

    $path =~ s|\\+|/|go;

    return $path;
}

=back

=head2 Functions

=over 4

=item C<esc(@strings)>

=item C<esc($string)>

Reimplemented here to silence a C<Pod::Simple::HTML> warning.

=cut

sub esc {
    if ( defined wantarray ) {
        if (wantarray) {
            @_ = splice @_;    # break aliasing
        }
        else {
            my $x = shift;

            # Pod::Simple::HTML didn't check this.
            return $x unless $x;
            $x =~
s/([^\n\t !\#\$\%\(\)\*\+,\.\~\/\:\;=\?\@\[\\\]\^_\`\{\|\}abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789])/'&#'.(ord($1)).';'/eg;
            return $x;
        }
    }

    foreach my $x (@_) {

        # Pod::Simple::HTML didn't check this.
        next unless $x;

        # Escape things very cautiously:
        $x =~
s/([^\n\t !\#\$\%\(\)\*\+,\.\~\/\:\;=\?\@\[\\\]\^_\`\{\|\}abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789])/'&#'.(ord($1)).';'/eg;

        # Leave out "- so that "--" won't make it thru in X-generated comments
        #  with text in them.

        # Yes, stipulate the list without a range, so that this can work right on
        #  all charsets that this module happens to run under.
        # Altho, hmm, what about that ord?  Presumably that won't work right
        #  under non-ASCII charsets.  Something should be done about that.
    }

    return @_;
}

=back

=head1 HISTORY

In order to avoid modifying C<Pod:Simple::HTML>, large sections of its code
have been copied here, and then refactored and adjusted to enable various bits
of Parrot-specific behaviour.

=cut

1;

# Local Variables:
#   mode: cperl
#   cperl-indent-level: 4
#   fill-column: 100
# End:
# vim: expandtab shiftwidth=4: