Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP
Branch: master
Fetching contributors…

Cannot retrieve contributors at this time

executable file 1223 lines (1137 sloc) 47.829 kB
#!/usr/bin/python
# -*- coding: utf-8 -*-
# Automatically derive Recode table files from various sources.
# Copyright © 1993, 1994, 1997, 1998, 1999, 2000 Free Software Foundation, Inc.
# François Pinard <pinard@iro.umontreal.ca>, 1993.
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software Foundation,
# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
"""\
`tables.py' derives Recode table files from various sources.
Usage: python tables.py [OPTION]... DATA-FILE...
Output selection:
-e Produce C source file for explode data (explode.c)
-i Produce C source file for iconv charsets (iconvdecl.h)
-m Produce C inclusion file for short RFC 1345 mnemonics (rfc1345.h)
-n Produce C inclusion file for character names (charname.h)
-p Produce C source files for strip data (strip-pool.c and strip-data.c)
-t Produce Texinfo inclusion file for RFC 1345 (rfc1345.texi)
Modality options:
-C DIRECTORY Change to DIRECTORY prior to processing
-F Produce French versions for -n, -s or -t
-v Increase verbosity
DATA-FILEs may be rfc1345.txt, mnemonic[.,]ds, Unicode maps, or .def files
from Keld's chset* packages. The digesting order is usually important.
When `-F' and `-n' are used, process Alain's tables.
"""
import re, sys
# Character constants.
# Stored in a charset table for the `__' token and for unknown mnemonics.
REPLACEMENT_CHARACTER = 0xFFFD
# Initial value of every charset table slot; also stored for the `??' token.
NOT_A_CHARACTER = 0xFFFF
# Main driver.
class Main:
    """Command-line driver: parse options, digest data files, write outputs.

    Each of `charnames', `explodes', `iconv', `mnemonics' and `strips' holds
    the matching generator object once it is needed, and None otherwise.
    """

    directory = None
    charnames = None
    explodes = None
    iconv = None
    mnemonics = None
    strips = None
    verbose = False

    def main(self, *arguments):
        """Run the program on ARGUMENTS (the command line without argv[0]).

        With no argument, print the module usage text and return.  Otherwise
        parse the options, digest every DATA-FILE in the order given, then
        call `complete' on every generator that was created.

        getopt.GetoptError propagates on an unknown option.  `Input.die' is
        called for a data file whose contents are not recognised, or which
        ends before an expected section is found.
        """
        if not arguments:
            # The docstring is stripped under `python -OO'.
            sys.stdout.write(__doc__ or '')
            return
        import getopt
        French_option = False
        options, arguments = getopt.getopt(arguments, 'C:Feimnptv')
        for option, value in options:
            if option == '-C':
                self.directory = value
            elif option == '-F':
                French_option = True
            elif option == '-e':
                if not self.explodes:
                    self.explodes = Explodes()
                self.explodes.do_sources = True
            elif option == '-i':
                if not self.iconv:
                    self.iconv = Iconv()
                self.iconv.do_sources = True
            elif option == '-m':
                if not self.mnemonics:
                    self.mnemonics = Mnemonics()
                self.mnemonics.do_sources = True
            elif option == '-n':
                if not self.charnames:
                    self.charnames = Charnames()
                self.charnames.do_sources = True
            elif option == '-p':
                if not self.strips:
                    self.strips = Strips()
                self.strips.do_sources = True
            elif option == '-t':
                if not self.strips:
                    self.strips = Strips()
                self.strips.do_texinfo = True
            elif option == '-v':
                self.verbose = True
        # Read all data tables.
        if self.directory:
            import os
            os.chdir(self.directory)
        if self.iconv:
            self.iconv.digest()
        for name in arguments:
            input = Input(name)
            # Sniff the leading lines until the kind of file is recognised,
            # then hand the rest of the file to the matching digester.
            while True:
                line = input.readline()
                if not line:
                    break
                if line[0] == '\n':
                    continue
                if line[:2] == '/*':
                    # Skip a C-style comment, up to the line closing it.
                    self._skip_until(input, lambda text: '*/' in text)
                    continue
                if input.begins('# Name:'):
                    if not self.strips:
                        self.strips = Strips()
                    self.strips.digest_unimap(input)
                    break
                if line[0] == '#':
                    continue
                if input.begins('escape_char'):
                    if not self.mnemonics:
                        self.mnemonics = Mnemonics()
                    self.mnemonics.digest_mnemonics_ds(input)
                    break
                if input.match(r'Network Working Group +K\. Simonsen$'):
                    if (self.charnames
                            and self.charnames.do_sources
                            and not French_option):
                        self._skip_until(
                            input,
                            lambda text: text.startswith(
                                ' 3rd field is the long descriptive'))
                        if not self.mnemonics:
                            self.mnemonics = Mnemonics()
                        self.mnemonics.digest_rfc1345(input)
                    if self.explodes or self.strips:
                        self._skip_until(
                            input,
                            lambda text: text == '5. CHARSET TABLES\n')
                        if not self.strips:
                            self.strips = Strips()
                        self.strips.digest_rfc1345(input)
                    break
                if input.begins('@@\t'):
                    # `self.charnames' is None unless `-n' was given.
                    if (self.charnames
                            and self.charnames.do_sources
                            and French_option):
                        self.charnames.digest_french(input)
                    break
                if line == '&referenceset\n':
                    self._skip_until(input, lambda text: text == '\n')
                    if not self.strips:
                        self.strips = Strips()
                    if not self.mnemonics:
                        self.mnemonics = Mnemonics()
                    self.strips.digest_rfc1345(input)
                    break
                if line in (' Repertoire according to ISO/IEC 10646-1:1993\n',
                            ' Control characters\n',
                            ' Private use\n'):
                    self._skip_until(
                        input,
                        lambda text: text in (' Plane 000\n',
                                              ' plane 000\n'))
                    if not self.mnemonics:
                        self.mnemonics = Mnemonics()
                    self.mnemonics.digest_iso10646_def(input)
                    break
                input.die("Data file with unknown contents")
        for instance in (self.explodes,
                         self.strips,
                         self.charnames,
                         self.iconv,
                         self.mnemonics):
            if instance:
                instance.complete(French_option)

    def _skip_until(self, input, found):
        """Advance INPUT until FOUND(input.line) is true.

        The current line is tested first.  Reaching end of file is fatal:
        readline() keeps returning '' there, so looping on would never end.
        """
        while not found(input.line):
            if not input.readline():
                input.die("Unexpected end of file")
                return
# Single shared driver instance.  The other classes read `run.verbose',
# `run.charnames', `run.mnemonics' and `run.explodes' through this name.
run = Main()
# Module-level entry point, bound to the shared instance.
main = run.main
class Options:
    """Output-selection flags inherited by every table generator."""

    def __init__(self):
        # Both kinds of output stay off until a command-line option asks.
        self.do_sources = self.do_texinfo = False
# Charnames.
class Charnames(Options):
    """Collect character names and write them, word-compressed, as C data."""

    SOURCES = 'charname.h'

    # NOTE: the two maps below are class attributes, hence shared by all
    # instances; the driver only ever creates one.
    # Name of character, given its numerical value.
    charname_map = {}
    # Maximum printable length of a character name.
    max_length = 0
    # Frequency of each word, then its crypt code.
    code_map = {}

    def digest_french(self, input):
        """Read French character names from INPUT and declare them.

        Lines starting with `@@<TAB>' are skipped.  Other lines must look
        like four hexadecimal digits, a tab and the name, optionally
        followed by a parenthesised part and a ` *' marker; anything else
        is reported through `input.warn'.  Names are folded to lower case
        and the placeholder names in `ignorables' are dropped.

        This works on Python 2 byte strings in ISO-8859-1.
        """
        self.preset_french()
        # Build a 256-entry translation table folding upper to lower case.
        fold_table = list(range(256))
        for before, after in zip(
                u'ABCDEFGHIJKLMNOPQRSTUVWXYZÀÂÇÈÉÊÎÏÑÔÖÛ'.encode('ISO-8859-1'),
                u'abcdefghijklmnopqrstuvwxyzàâçèéêîïñôöû'.encode('ISO-8859-1')):
            fold_table[ord(before)] = ord(after)
        folding = ''.join(map(chr, fold_table))
        ignorables = (
            u'<commande>'.encode('ISO-8859-1'),
            u'<réservé>'.encode('ISO-8859-1'),
            u'<pas un caractère>'.encode('ISO-8859-1'))
        while True:
            line = input.readline()
            if not line:
                break
            if input.begins('@@\t'):
                continue
            # Get rid of the end of line.
            line = line.rstrip()
            input.line = line
            match = input.match('([0-9A-F]{4})\t([^(]+)( \\(.*\\))?( \\*)?$')
            if match:
                ucs = int(match.group(1), 16)
                text = match.group(2).translate(folding)
                if text in ignorables:
                    continue
                self.declare(ucs, re.sub(r' +\*$', '', text, 1))
            else:
                input.warn("Unrecognised line")

    def preset_french(self):
        """Declare French names for the codes 0000-001F and 007F-009F."""
        self.max_length = 0
        ucs = 0x0000
        for text in (
            u"nul (nul)",                                        # 0000
            u"début d'en-tête (soh)",                            # 0001
            u"début de texte (stx)",                             # 0002
            u"fin de texte (etx)",                               # 0003
            u"fin de transmission (eot)",                        # 0004
            u"demande (enq)",                                    # 0005
            u"accusé de réception positif (ack)",                # 0006
            u"sonnerie (bel)",                                   # 0007
            u"espace arrière (bs)",                              # 0008
            u"tabulation horizontale (ht)",                      # 0009
            u"interligne (lf)",                                  # 000A
            u"tabulation verticale (vt)",                        # 000B
            u"page suivante (ff)",                               # 000C
            u"retour de chariot (cr)",                           # 000D
            u"hors code (so)",                                   # 000E
            u"en code (si)",                                     # 000F
            u"échappement transmission (dle)",                   # 0010
            u"commande d'appareil un (dc1)",                     # 0011
            u"commande d'appareil deux (dc2)",                   # 0012
            u"commande d'appareil trois (dc3)",                  # 0013
            u"commande d'appareil quatre (dc4)",                 # 0014
            u"accusé de réception négatif (nak)",                # 0015
            u"synchronisation (syn)",                            # 0016
            u"fin de transmission de bloc (etb)",                # 0017
            u"annulation (can)",                                 # 0018
            u"fin de support (em)",                              # 0019
            u"caractère de substitution (sub)",                  # 001A
            u"échappement (esc)",                                # 001B
            u"séparateur de fichier (fs)",                       # 001C
            u"séparateur de groupe (gs)",                        # 001D
            u"séparateur d'article (rs)",                        # 001E
            u"séparateur de sous-article (us)",                  # 001F
            ):
            self.declare(ucs, text.encode('ISO-8859-1'))
            ucs += 1
        ucs = 0x007F
        for text in (
            u"suppression (del)",                                # 007F
            u"caractère de bourre (pad)",                        # 0080
            u"octet supérieur prédéfini (hop)",                  # 0081
            u"arrêt permis ici (bph)",                           # 0082
            u"aucun arrêt ici (nbh)",                            # 0083
            u"index (ind)",                                      # 0084
            u"à la ligne (nel)",                                 # 0085
            u"début de zone sélectionnée (ssa)",                 # 0086
            u"fin de zone sélectionnée (esa)",                   # 0087
            u"arrêt de tabulateur horizontal (hts)",             # 0088
            u"tabulateur horizontal avec justification (htj)",   # 0089
            u"arrêt de tabulateur vertical (vts)",               # 008A
            u"interligne partiel vers <= bas (pld)",             # 008B
            u"interligne partiel vers <= haut (plu)",            # 008C
            u"index inversé (ri)",                               # 008D
            u"remplacement unique deux (ss2)",                   # 008E
            u"remplacement unique trois (ss3)",                  # 008F
            u"chaîne de commande d'appareil (dcs)",              # 0090
            u"usage privé un (pu1)",                             # 0091
            u"usage privé deux (pu2)",                           # 0092
            u"mise en mode transmission (sts)",                  # 0093
            u"annulation du caractère précédent (cch)",          # 0094
            u"message en attente (mw)",                          # 0095
            u"début de zone protégée (sga)",                     # 0096
            u"fin de zone protégée (ega)",                       # 0097
            u"début de chaîne (sos)",                            # 0098
            u"introducteur de caractère graphique unique (sgci)",# 0099
            u"introducteur de caractère unique (sci)",           # 009A
            u"introducteur de séquence de commande (csi)",       # 009B
            u"fin de chaîne (st)",                               # 009C
            u"commande de système d'exploitation (osc)",         # 009D
            u"message privé (pm)",                               # 009E
            u"commande de progiciel (apc)",                      # 009F
            ):
            self.declare(ucs, text.encode('ISO-8859-1'))
            ucs += 1

    def declare(self, ucs, text):
        """Record TEXT as the name of code UCS and count each of its words."""
        self.charname_map[ucs] = text
        if len(text) > self.max_length:
            self.max_length = len(text)
        for word in text.split():
            self.code_map[word] = self.code_map.get(word, 0) + 1

    def presort_word(self, word):
        """Return the sort key for WORD: most frequent first, then by text."""
        return -self.code_map[word], word

    # Write a compressed list of character names.
    def complete(self, french):
        """Write the word table and the encoded names to the C header.

        Nothing happens unless sources were requested.  When FRENCH is
        true the output file name gets an `fr-' prefix.  `code_map' is
        overwritten in the process: frequencies are replaced by codes.
        """
        if not self.do_sources:
            return
        if french:
            write = Output('fr-%s' % self.SOURCES).write
        else:
            write = Output(self.SOURCES).write
        # Establish a mild compression scheme.  Words word[:singles]
        # will be represented by a single byte running from 1 to
        # singles.  All remaining words will be represented by two
        # bytes, the first one running slowly from singles+1 to 255,
        # the second cycling faster from 1 to 255.
        if run.verbose:
            sys.stdout.write(' sorting words...')
        words = sorted(self.code_map, key=self.presort_word)
        if run.verbose:
            sys.stdout.write(' %d of them\n' % len(words))
        count = len(words)
        # Largest number of one-byte codes that still leaves enough two-byte
        # codes for the rest; never more than there are words, or the loop
        # below would index past the end of `words'.
        singles = min(count, (255 * 255 - count) // 254)
        # Transmit a few values for further usage by the C code.
        if run.verbose:
            sys.stdout.write(' sorting names...')
        ucs2_table = sorted(self.charname_map)
        if run.verbose:
            sys.stdout.write(' %d of them\n' % len(ucs2_table))
        write('\n'
              '#define NUMBER_OF_SINGLES %d\n'
              '#define MAX_CHARNAME_LENGTH %d\n'
              '#define NUMBER_OF_CHARNAMES %d\n'
              % (singles, self.max_length, len(ucs2_table)))
        # Establish a mild compression scheme (one or two bytes per word).
        sys.stdout.write(" writing words\n")
        write('\n'
              'static const char *const word[%d] =\n'
              ' {\n'
              % count)
        char1 = 1
        char2 = 1
        for counter in range(singles):
            word = words[counter]
            write(' %-28s/* \\%0.3o */\n'
                  % ('"%s",' % word.replace('"', '\\"'), char1))
            self.code_map[word] = char1
            char1 += 1
        for counter in range(singles, count):
            word = words[counter]
            # Every quote is escaped here too, not only the first one.
            write(' %-28s/* \\%0.3o\\%0.3o */\n'
                  % ('"%s",' % word.replace('"', '\\"'), char1, char2))
            self.code_map[word] = 256 * char1 + char2
            if char2 == 255:
                char1 += 1
                char2 = 1
            else:
                char2 += 1
        write(' };\n')
        sys.stdout.write(" writing names\n")
        write('\n'
              'struct charname\n'
              ' {\n'
              ' recode_ucs2 code;\n'
              ' const char *crypted;\n'
              ' };\n'
              '\n'
              'static const struct charname charname[NUMBER_OF_CHARNAMES] =\n'
              ' {\n')
        for ucs2 in ucs2_table:
            write(' {0x%04X, "' % ucs2)
            for word in self.charname_map[ucs2].split():
                if word in self.code_map:
                    code = self.code_map[word]
                    if code < 256:
                        write('\\%0.3o' % code)
                    else:
                        # Floor division keeps the high byte an integer.
                        write('\\%0.3o\\%0.3o' % (code // 256, code % 256))
                else:
                    sys.stdout.write('??? %s\n' % word)
            write('"},\n')
        write(' };\n')
# Explodes.
class Explodes(Options):
    """Writer for the explode data C source file."""

    SOURCES = 'explode.c'

    def __init__(self):
        """Open the output file and write its `#include' line."""
        Options.__init__(self)
        # Table fragments will be produced while reading data tables.
        self.write = Output(self.SOURCES).write
        self.write('\n'
                   '#include "common.h"\n')
        # Charset names and (alias, charset) pairs to register in
        # `module_explodes'.  `complete' used to read these without anything
        # ever defining them, and so could not run.
        self.declare_charset = []
        self.declare_alias = []

    def complete(self, french):
        """Write the `module_explodes' and `delmodule_explodes' functions.

        Nothing happens unless sources were requested.  FRENCH is unused.
        Both registration lists are emptied once written.
        """
        if not self.do_sources:
            return
        # Print the collectable initialization function.
        sys.stdout.write("Completing %s\n" % self.SOURCES)
        write = self.write
        write('\n'
              'bool\n'
              'module_explodes (struct recode_outer *outer)\n'
              '{\n')
        for count, charset in enumerate(self.declare_charset):
            write(' if (!declare_explode_data (outer, &data_%d, "%s"))\n'
                  ' return false;\n'
                  % (count, charset))
        del self.declare_charset[:]
        write('\n')
        for alias, charset in self.declare_alias:
            write(' if (!declare_alias (outer, "%s", "%s"))\n'
                  ' return false;\n'
                  % (alias, charset))
        del self.declare_alias[:]
        write('\n'
              ' return true;\n'
              '}\n'
              '\n'
              'void\n'
              'delmodule_explodes (struct recode_outer *outer)\n'
              '{\n'
              '}\n')
# Iconv.
class Iconv(Options):
    """Collect the charsets listed by `iconv -l' and write them as C data."""

    SOURCES = 'iconvdecl.h'

    # List of (charset, aliases) pairs.  NOTE: class attribute, hence shared
    # by all instances; the driver only ever creates one.
    data = []

    def digest(self):
        """Run `iconv -l' and append one (charset, aliases) pair per line.

        The first non-blank line decides the format for the whole output:
        either `FIRST/SECOND/' lines, or whitespace-separated names whose
        first one is the charset and the others its aliases.
        """
        canonical = {}
        for charset in ('Georgian-Academy', 'Georgian-PS', 'MuleLao-1',
                        'Macintosh', 'MacArabic', 'MacCentralEurope',
                        'MacCroatian', 'MacCyrillic', 'MacGreek', 'MacHebrew',
                        'MacIceland', 'MacRoman', 'MacRomania', 'MacThai',
                        'MacTurkish', 'MacUkraine'):
            canonical[charset.upper()] = charset
        # Read in the encodings.def file.
        sys.stdout.write("Reading from `iconv -l'\n")
        libc = None
        import os
        pipe = os.popen('iconv -l')
        try:
            for line in pipe:
                if not line.strip():
                    # A blank line names no charset, and would otherwise
                    # fail on `aliases[0]' below.
                    continue
                if libc is None:
                    libc = len(line.split('/')) == 3
                if libc:
                    first, second, empty = line.split('/')
                    assert empty == '\n', repr(line)
                    self.data.append((second or first, ()))
                else:
                    aliases = []
                    for alias in line.split():
                        if alias in canonical:
                            alias = canonical[alias]
                        aliases.append(alias)
                    self.data.append((aliases[0], aliases[1:]))
        finally:
            pipe.close()

    def complete(self, french):
        """Write the NULL-separated list of names to the C header.

        Nothing happens unless sources were requested.  FRENCH is unused.
        """
        if not self.do_sources:
            return
        write = Output(self.SOURCES).write
        # Array size: name, aliases and a NULL for each charset, plus the
        # final NULL.
        count = 1
        for charset, aliases in self.data:
            count = count + 2 + len(aliases)
        write('\n'
              "/* This is derived from Bruno Haible's `libiconv' package. */"
              '\n'
              'static const char *iconv_name_list[%d] =\n'
              ' {\n'
              % count)
        for charset, aliases in self.data:
            if aliases:
                write(' "%s",\n' % charset)
                for alias in aliases[:-1]:
                    write('\t"%s",\n' % alias)
                write('\t"%s", NULL,\n' % aliases[-1])
            else:
                write(' "%s", NULL,\n' % charset)
        write(' NULL\n'
              ' };\n')
# Mnemonics.
class Mnemonics(Options):
    """Map RFC 1345 mnemonics to UCS-2 values and write them as C data."""

    SOURCES = 'rfc1345.h'

    # Ignore any mnemonic whose length is greater than MAX_MNEMONIC_LENGTH.
    MAX_MNEMONIC_LENGTH = 3

    # NOTE: the two maps below are class attributes, hence shared by all
    # instances; the driver only ever creates one.
    # Numeric value of a character, given its mnemonic.
    ucs2_map = {}
    # Number of entries in `mnemonic_map'.
    table_length = 0
    # Mnemonic of a character, given its numeric value.
    mnemonic_map = {}

    # Read in a mnemonics file.
    def digest_mnemonics_ds(self, input):
        """Declare every `<MNEMONIC><TAB><Uxxxx>' line found in INPUT.

        A `/' in the mnemonic quotes the character following it.
        """
        while input.readline():
            match = input.match('<([^ \t\n]+)>\t<U(....)>')
            if match:
                mnemonic = re.sub('/(.)', r'\1', match.group(1))
                ucs2 = int(match.group(2), 16)
                self.declare(mnemonic, ucs2, input.warn)

    # Read in Keld's list of 10646 characters.
    def digest_iso10646_def(self, input):
        """Declare the mnemonics listed row by row, cell by cell, in INPUT.

        `row' and `cell' follow the position announced by the `row' and
        `cell' header lines; each character line advances `cell' by one.
        """
        # Start at row 0, cell 0, so that a character line met before any
        # `row' header does not fail on an unbound name.
        row = 0
        cell = 0
        while True:
            line = input.readline()
            if not line:
                break
            if line == '\n':
                continue
            if len(line) == 3:
                continue
            # `begins' compares literally, so the dots must not be escaped.
            if input.begins(' ...'):
                continue
            if line == ' Presentation forms\n':
                continue
            if input.begins(' naming: first vertical '):
                continue
            match = input.match(' row ([0-9][0-9][0-9])$')
            if match and int(match.group(1)) < 256:
                row = int(match.group(1))
                cell = 0
                continue
            if line == ' cell 00\n':
                cell = 0
                continue
            match = input.match(' cell ([0-9][0-9][0-9])$')
            if match and int(match.group(1)) < 256:
                cell = int(match.group(1))
                continue
            if input.match(' [^ ]+'):
                if not input.match(' [A-Z][A-Z][A-Z]'):
                    continue
            if input.match(' [^ ].*'):
                # A line without mnemonic still uses up one cell.
                if cell == 256:
                    input.warn("Over 256 cells in row %d", row)
                cell += 1
                continue
            match = (input.match('([^ ]) [^ ].*')
                     or input.match('([^ ][^ ]+) [^ ].*'))
            if match:
                if cell == 256:
                    input.warn("Over 256 cells in row %d", row)
                self.declare(match.group(1), 256*row + cell, input.warn)
                cell += 1
                continue
            input.warn("Unrecognised line")

    # Read the text of RFC 1345, saving all character names it declares.
    def digest_rfc1345(self, input):
        """Declare, through `run.charnames', the names listed in INPUT.

        Reading stops at end of file or at the `4. CHARSETS' line.  Only
        mnemonics already present in `ucs2_map' get their name declared.
        """

        def read_line(input=input):
            """Return the next indented line, stripped, or None when done."""
            skip = False
            while True:
                line = input.readline()
                if not line:
                    break
                # Skip from a `Simonsen' line through the next `RFC 1345'
                # line, both included.
                if input.begins('Simonsen'):
                    skip = True
                    continue
                if skip:
                    if input.begins('RFC 1345'):
                        skip = False
                    continue
                if input.begins('4. CHARSETS'):
                    break
                if line == '\n':
                    continue
                if line[0] == ' ':
                    return line[:-1].lstrip()
            return None

        self.max_length = 0
        # Read the character descriptions.  Count words in charnames.
        line = read_line()
        while line:
            # Look ahead one line and merge it if it should.
            # NOTE(review): read_line() strips leading blanks, so the
            # pattern below, which starts with a blank, looks like it can
            # never match -- confirm against the original file.
            next = read_line()
            while next:
                match = re.match(' *( .*)', next)
                if not match:
                    break
                line += match.group(1)
                next = read_line()
            # Separate fields and save needed information.
            match = re.search('([^ ]+) +[0-9a-f]+ +(.*)', line)
            if match:
                mnemo = match.group(1)
                text = match.group(2).lower()
                if mnemo in self.ucs2_map:
                    run.charnames.declare(self.ucs2_map[mnemo], text)
                elif len(mnemo) <= self.MAX_MNEMONIC_LENGTH:
                    input.warn("No known UCS-2 code for `%s'", mnemo)
            elif not re.search(' +e000', line):
                input.warn("Unrecognised line")
            line = next

    # Declare a correspondence between a mnemonic and an UCS-2 value.
    def declare(self, mnemonic, ucs2, warn):
        """Record that MNEMONIC stands for UCS2, reporting clashes via WARN.

        Mnemonics longer than MAX_MNEMONIC_LENGTH are ignored.  The first
        value declared for a mnemonic wins.  The reverse map is only kept
        when sources were requested, and there the shortest mnemonic wins.
        """
        if len(mnemonic) > self.MAX_MNEMONIC_LENGTH:
            return
        if self.do_sources:
            if ucs2 in self.mnemonic_map:
                if self.mnemonic_map[ucs2] != mnemonic:
                    warn("U+%04X `%s' known as `%s'",
                         ucs2, mnemonic, self.mnemonic_map[ucs2])
                    if len(mnemonic) < len(self.mnemonic_map[ucs2]):
                        self.mnemonic_map[ucs2] = mnemonic
            else:
                self.mnemonic_map[ucs2] = mnemonic
                self.table_length += 1
        if mnemonic in self.ucs2_map:
            if self.ucs2_map[mnemonic] != ucs2:
                warn("`%s' U+%04X known as U+%04X",
                     mnemonic, ucs2, self.ucs2_map[mnemonic])
                #FIXME: ??? cell = self.ucs2_map[mnemonic] - 256*row
        else:
            self.ucs2_map[mnemonic] = ucs2

    def complete(self, french):
        """Write the C header when sources were requested.  FRENCH is unused."""
        if self.do_sources:
            self.complete_sources()

    # Write an UCS-2 to RFC 1345 mnemonic table.
    def complete_sources(self):
        """Write `table', sorted by code, and `inverse', sorted by mnemonic.

        Each `inverse' entry is the index in `table' of the matching entry.
        """
        inverse_map = {}
        write = Output(self.SOURCES).write
        write('\n'
              '#define TABLE_LENGTH %d\n'
              '#define MAX_MNEMONIC_LENGTH %d\n'
              % (self.table_length, self.MAX_MNEMONIC_LENGTH))
        write('\n'
              'struct entry\n'
              ' {\n'
              ' recode_ucs2 code;\n'
              ' const char *rfc1345;\n'
              ' };\n'
              '\n'
              'static const struct entry table[TABLE_LENGTH] =\n'
              ' {\n')
        count = 0
        for ucs2 in sorted(self.mnemonic_map):
            text = self.mnemonic_map[ucs2]
            inverse_map[text] = count
            write(' /* %4d */ {0x%04X, "%s"},\n'
                  % (count, ucs2, re.sub(r'([\"])', r'\\\1', text)))
            count += 1
        write(' };\n')
        write('\n'
              'static const unsigned short inverse[TABLE_LENGTH] =\n'
              ' {')
        count = 0
        for text in sorted(inverse_map):
            # Ten entries per output line.
            if count % 10 == 0:
                if count != 0:
                    write(',')
                write('\n /* %4d */ ' % count)
            else:
                write(', ')
            write('%4d' % inverse_map[text])
            count += 1
        write('\n'
              ' };\n')
# Global table of strips.
class Strips(Options):
    """Digest charset tables and write them as pooled strips of UCS-2 codes.

    Every 256-entry charset table is cut into strips of STRIP_SIZE codes;
    identical strips are stored only once, in a pool shared by all charsets.
    """

    POOL = 'strip-pool.c'
    DATA = 'strip-data.c'
    TEXINFO = 'rfc1345.texi'

    # Change STRIP_SIZE in `src/recode.h' if you change the value here.
    # See the accompanying documentation there, as needed.
    STRIP_SIZE = 8

    # NOTE: the containers below are class attributes, hence shared by all
    # instances; the driver only ever creates one.
    # Prepare the production of tables.
    pool_size = 0
    pool_refs = 0
    strip_map = {}
    strips = []

    # While digesting files.
    used_map = {}
    table = []
    declare_alias = []
    implied_surface = {}

    def __init__(self):
        Options.__init__(self)
        self.write_data = None
        self.aliases_map = {}
        self.remark_map = {}
        self.declare_charset = []
        # Prepare to read various tables.
        self.charset_ordinal = 0
        self.discard_charset = False
        self.alias_count = 0
        self.comment = ''

    def init_write_data(self):
        """Open the strip data file, once, when sources were requested."""
        if self.do_sources and not self.write_data:
            # Table fragments will be produced while reading data tables.
            self.write_data = Output(self.DATA).write
            write = self.write_data
            write('\n'
                  '#include "common.h"\n')

    # Read the text of RFC 1345, saving all charsets it declares.
    # UCS-2 mnemonics files should have been read in already.
    def digest_rfc1345(self, input):
        """Digest every `&charset' section of INPUT.

        Reading stops at end of file or at the `ACKNOWLEDGEMENTS' line.
        Each charset is handed to `charset_done' once the next one starts,
        and the last one when reading stops.  Mnemonics are looked up in
        `run.mnemonics.ucs2_map'.
        """
        self.init_write_data()
        # Informal canonical order of presentation.
        CHARSET, REM, ALIAS, ESC, BITS, CODE = range(6)
        charset = None
        skip = False
        # Per-charset state, defined up front so that directives met before
        # any `&charset', or an input without charset, cannot hit an
        # unbound name.
        status = CHARSET
        code = 0
        aliases = []
        remark = []
        while True:
            line = input.readline()
            if not line:
                break
            # Skip from a `Simonsen' line through the next `RFC 1345' line,
            # both included.
            if input.begins('Simonsen'):
                skip = True
                continue
            if skip:
                if input.begins('RFC 1345'):
                    skip = False
                continue
            if line == '\n':
                continue
            if line == 'ACKNOWLEDGEMENTS\n':
                break
            line, count = re.subn('^ ?', '', line)
            if not count:
                continue
            input.line = line
            # Recognize `&charset'.
            match = input.match('&charset (.*)')
            if match:
                # Before beginning a new charset, process the previous one.
                if charset:
                    self.charset_done(charset, remark, aliases)
                charset = match.group(1)
                # Prepare for processing a new charset: save the charset
                # name for further declaration; announce this charset in
                # the array initialization section; and initialize its
                # processing.
                if run.verbose:
                    sys.stdout.write(" %d) %s\n"
                                     % (self.charset_ordinal + 1, charset))
                status = CHARSET
                self.comment = '\n/* %s\n' % charset
                # Reset the per-charset state before the duplicate check.
                # Otherwise the aliases of a discarded duplicate would be
                # added to the lists and the alias count of the previous
                # charset, and `charset_done' would then delete aliases
                # belonging to that previous charset.
                self.alias_count = 0
                self.table = [NOT_A_CHARACTER] * 256
                code = 0
                aliases = []
                remark = []
                hashname = re.sub('[^a-z0-9]', '', charset.lower())
                if hashname in self.used_map:
                    input.warn("Duplicate of %s (discarded)",
                               self.used_map[hashname])
                    self.discard_charset = True
                    continue
                self.used_map[hashname] = charset
                #FIXME:match = re.match('(CP|IBM|windows-)([0-9]+)$', charset)
                match = re.match('(CP|IBM)([0-9]+)$', charset)
                if match:
                    self.implied_surface[match.group(2)] = 'crlf'
                    self.implied_surface['CP' + match.group(2)] = 'crlf'
                    self.implied_surface['IBM' + match.group(2)] = 'crlf'
                    self.declare_alias.append((charset, charset))
                    self.alias_count += 1
                    continue
                if charset in ('macintosh', 'macintosh_ce'):
                    self.implied_surface[charset] = 'cr'
                    self.declare_alias.append((charset, charset))
                    self.alias_count += 1
                    continue
                continue
            # Recognize other `&' directives.
            match = input.match('&rem (.*)')
            if match and not input.begins('&rem &alias'):
                # Keld now prefers `&rem' to be allowed everywhere.
                #if status > REM:
                #    input.warn("`&rem' out of sequence")
                #status = REM;
                if self.do_texinfo:
                    # Save remarks for Texinfo.
                    text = match.group(1)
                    remark.append(text)
                continue
            match = input.match('(&rem )?&alias (.*)')
            if match:
                if status > ALIAS:
                    input.warn("`&alias' out of sequence")
                status = ALIAS
                # Save synonymous charset names for later declarations.
                alias = match.group(2)
                # endswith(), unlike alias[-1], accepts an empty alias.
                if alias.endswith(' '):
                    input.warn("Spurious trailing whitespace")
                    alias = alias.rstrip()
                self.comment = self.comment + ' %s\n' % alias
                hashname = re.sub('[^a-z0-9]', '', alias.lower())
                if hashname in self.used_map:
                    if self.used_map[hashname] != charset:
                        input.warn("Duplicate of %s", self.used_map[hashname])
                        continue
                else:
                    self.used_map[hashname] = charset
                aliases.append(alias)
                match = re.match('(CP|IBM)([0-9]+)$', alias)
                if match:
                    self.implied_surface[match.group(2)] = 'crlf'
                    self.implied_surface['CP' + match.group(2)] = 'crlf'
                    self.implied_surface['IBM' + match.group(2)] = 'crlf'
                elif alias in ('mac', 'macce'):
                    self.implied_surface[alias] = 'cr'
                self.declare_alias.append((alias, charset))
                self.alias_count += 1
                continue
            if input.match('&g[0-4]esc'):
                if status > ESC:
                    input.warn("`&esc' out of sequence")
                status = ESC
                continue
            match = input.match('&bits ([0-9]+)$')
            if match:
                if status > BITS:
                    input.warn("`&bits' out of sequence")
                status = BITS
                if int(match.group(1)) > 8:
                    input.warn("`&bits %s' not accepted (charset discarded)",
                               match.group(1))
                    self.discard_charset = True
                continue
            match = input.match('&code (.*)')
            if match:
                if status > CODE:
                    input.warn("`&code' out of sequence")
                status = CODE
                # Save the code position.
                code = int(match.group(1))
                continue
            # Other lines cause the charset to be discarded.
            match = input.match('&([^ ]+)')
            if match:
                if not self.discard_charset:
                    input.warn("`&%s' not accepted (charset discarded)",
                               match.group(1))
                    self.discard_charset = True
            if self.discard_charset:
                continue
            if charset is None:
                # No table to fill before the first `&charset'.
                continue
            # Save all other tokens into the double table.
            for token in line.split():
                if token == '??':
                    self.table[code] = NOT_A_CHARACTER
                elif token == '__':
                    self.table[code] = REPLACEMENT_CHARACTER
                elif token in run.mnemonics.ucs2_map:
                    self.table[code] = run.mnemonics.ucs2_map[token]
                else:
                    input.warn("Unknown mnemonic for code: %s", token)
                    self.table[code] = REPLACEMENT_CHARACTER
                code += 1
        # Push the last charset out.
        self.charset_done(charset, remark, aliases)

    # Read a Unicode map, as found in ftp://ftp.unicode.com/MAPPINGS.
    def digest_unimap(self, input):
        """Digest one Unicode map; INPUT is positioned on its `Name:' line.

        The charset name comes from that line.  A name already in use is
        reported and the whole file is ignored.  A `Name:' line of another
        shape is fatal.
        """
        self.init_write_data()
        match = input.match('# +Name: +([^ ]+) to Unicode table$')
        if not match:
            input.die("Unrecognised Unicode map header")
            return
        name = match.group(1).split()
        charset = name[0]
        del name[0]
        # Set charset.  Membership is tested with `in': indexing `used_map'
        # with a name not seen yet would raise KeyError.
        hashname = re.sub('[^a-z0-9]', '', charset.lower())
        if hashname in self.used_map:
            # Return without touching `discard_charset': nothing here calls
            # `charset_done' to reset it, so it would stay set and discard
            # the next charset as well.
            input.warn("`%s' duplicates `%s' (charset discarded)",
                       hashname, self.used_map[hashname])
            return
        self.used_map[hashname] = charset
        # Set comment.
        self.comment = '\n/* %s\n' % charset
        # Prepare for read.
        self.alias_count = 0
        self.table = [NOT_A_CHARACTER] * 256
        aliases = []
        remark = []
        # Process aliases.
        for alias in name:
            self.comment = self.comment + ' %s\n' % alias
            hashname = re.sub('[^a-z0-9]', '', alias.lower())
            if (hashname in self.used_map
                    and self.used_map[hashname] != charset):
                input.warn("`%s' duplicates `%s'", hashname,
                           self.used_map[hashname])
                continue
            self.used_map[hashname] = charset
            aliases.append(alias)
            self.declare_alias.append((alias, charset))
            self.alias_count += 1
        # Read table contents.
        while True:
            line = input.readline()
            if not line:
                break
            if line == '\n':
                continue
            if line[0] == '#':
                continue
            if input.match('0x([0-9A-F]+)\t\t#UNDEFINED$'):
                continue
            if input.search('\032'):
                # Old MS-DOS C-z !!
                break
            match = input.match(r'0x([0-9A-F]+)\t0x([0-9A-F]+)\t#')
            if match:
                code = int(match.group(1), 16)
                if code < len(self.table):
                    self.table[code] = int(match.group(2), 16)
                else:
                    # The table only has room for one-byte codes.
                    input.warn("Code 0x%X out of range", code)
            else:
                input.warn("Unrecognised input line")
        # Complete processing.
        self.charset_done(charset, remark, aliases)

    # Print all accumulated information for the charset.  If the
    # charset should be discarded, adjust tables.
    def charset_done(self, charset, remark, aliases):
        """Write out and register CHARSET, or drop it if it was discarded.

        REMARK and ALIASES are only kept when Texinfo output was requested;
        ALIASES is sorted in place.  Nothing is done when no charset comment
        is pending.
        """
        if self.discard_charset:
            # Take back the aliases declared for this charset.
            while self.alias_count > 0:
                del self.declare_alias[-1]
                self.alias_count -= 1
            self.discard_charset = False
            self.comment = ''
        if not self.comment:
            return
        if self.do_texinfo:
            # Save the documentation.
            aliases.sort()
            self.aliases_map[charset] = aliases
            self.remark_map[charset] = remark
        if run.explodes:
            write = run.explodes.write
            # Make introductory C comments.
            write(self.comment)
            write('*/\n')
            # Make the table for this charset.
            write('\n'
                  'static const unsigned short data_%d[] =\n'
                  ' {\n'
                  % self.charset_ordinal)
            for code in range(256):
                if code != self.table[code]:
                    write(' %3d, 0x%.4X, DONE,\n'
                          % (code, self.table[code]))
            write(' DONE\n'
                  ' };\n')
            # Register the table with the explode writer itself.  Adding it
            # to `self.declare_charset' as well would list the charset twice
            # when strip sources are also produced, and `module_strips'
            # numbers its tables by position in that list.
            if not hasattr(run.explodes, 'declare_charset'):
                run.explodes.declare_charset = []
            run.explodes.declare_charset.append(charset)
        if self.do_sources:
            write = self.write_data
            # Make introductory C comments.
            write(self.comment)
            write('*/\n')
            # Make the table for this charset.
            write('\n'
                  'static struct strip_data data_%d =\n'
                  ' {\n'
                  ' ucs2_data_pool,\n'
                  ' {\n'
                  % self.charset_ordinal)
            count = 0
            for code in range(0, 256, self.STRIP_SIZE):
                # Twelve pool indices per output line.
                if count % 12 == 0:
                    if count != 0:
                        write(',\n')
                    write(' ')
                else:
                    write(', ')
                strip = self.table[code:code+self.STRIP_SIZE]
                write('%4d' % self.pool_index(strip))
                count += 1
            write('\n'
                  ' }\n'
                  ' };\n')
            # Register the table.
            self.declare_charset.append(charset)
        self.charset_ordinal += 1
        self.comment = ''

    # Return the pool index for strip.  Add to the pool as required.
    def pool_index(self, strip):
        """Return the pool offset of STRIP, a sequence of integer codes."""
        self.pool_refs += 1
        text = ''.join(['%04X' % item for item in strip])
        if text not in self.strip_map:
            self.strip_map[text] = self.pool_size
            self.pool_size = self.pool_size + self.STRIP_SIZE
            self.strips.append(text)
        return self.strip_map[text]

    def complete(self, french):
        """Write whichever of the C sources and Texinfo were requested."""
        if self.do_sources:
            self.complete_sources()
        if self.do_texinfo:
            self.complete_texinfo(french)

    def complete_sources(self):
        """Finish the strip data file, then write the strip pool file.

        Both registration lists are emptied once written.
        """
        # Make sure the data file exists even if no table was digested.
        self.init_write_data()
        # Give memory statistics.
        sys.stdout.write('Table memory = %d bytes (pool %d, refs %d)\n'
                         % (self.pool_size * 2 + self.pool_refs * 2,
                            self.pool_size * 2,
                            self.pool_refs * 2))
        # Print the collectable initialization function.
        sys.stdout.write("Completing %s\n" % self.DATA)
        write = self.write_data
        write('\n'
              'bool\n'
              'module_strips (struct recode_outer *outer)\n'
              '{\n'
              ' RECODE_ALIAS alias;\n'
              '\n')
        for count, charset in enumerate(self.declare_charset):
            write(' if (!declare_strip_data (outer, &data_%d, "%s"))\n'
                  ' return false;\n'
                  % (count, charset))
        del self.declare_charset[:]
        write('\n')
        for alias, charset in self.declare_alias:
            if alias in self.implied_surface:
                write(' if (alias = declare_alias (outer, "%s", "%s"),'
                      ' !alias)\n'
                      ' return false;\n'
                      % (alias, charset))
                write(' if (!declare_implied_surface (outer, alias,'
                      ' outer->%s_surface))\n'
                      ' return false;\n'
                      % self.implied_surface[alias])
            else:
                write(' if (!declare_alias (outer, "%s", "%s"))\n'
                      ' return false;\n'
                      % (alias, charset))
        del self.declare_alias[:]
        write('\n'
              ' return true;\n'
              '}\n'
              '\n'
              'void\n'
              'delmodule_strips (struct recode_outer *outer)\n'
              '{\n'
              '}\n')
        # Write the pool file.
        write = Output(self.POOL).write
        write('\n'
              '#include "common.h"\n'
              '\n'
              'const recode_ucs2 ucs2_data_pool[%d] =\n'
              ' {'
              % self.pool_size)
        count = 0
        for strip in self.strips:
            # Each strip is a string of four hexadecimal digits per code.
            for pos in range(0, self.STRIP_SIZE * 4, 4):
                if count % 8 == 0:
                    if count != 0:
                        write(',')
                    write('\n /* %4d */ ' % count)
                else:
                    write(', ')
                write('0x' + strip[pos:pos+4])
                count += 1
        write('\n'
              ' };\n')

    def complete_texinfo(self, french):
        """Write one Texinfo `@item' per charset, with aliases and remarks.

        When FRENCH is true the output file name gets an `fr-' prefix.
        """
        if french:
            write = Output('fr-%s' % self.TEXINFO, noheader=True).write
        else:
            write = Output(self.TEXINFO, noheader=True).write
        for charset in sorted(self.remark_map):
            write('\n'
                  '@item %s\n'
                  '@tindex %s@r{, aliases and source}\n'
                  % (charset, re.sub(':([0-9]+)', r'(\1)', charset)))
            aliases = self.aliases_map[charset]
            if aliases:
                if len(aliases) == 1:
                    if aliases[0]: # FIXME: why empty sometimes?
                        write('@tindex %s\n'
                              '@code{%s} is an alias for this charset.\n'
                              % (re.sub(':([0-9]+)', r'(\1)', aliases[0]),
                                 aliases[0]))
                else:
                    for alias in aliases:
                        write('@tindex %s\n'
                              % re.sub(':([0-9]+)', r'(\1)', alias))
                    write('@code{%s} and @code{%s} are aliases'
                          ' for this charset.\n'
                          % ('}, @code{'.join(aliases[:-1]), aliases[-1]))
            for line in self.remark_map[charset]:
                if not line:
                    # An empty remark has no first or last character.
                    continue
                if line[0].islower():
                    line = line[0].upper() + line[1:]
                write(line.replace('@', '@@'))
                if line[-1] != '.':
                    write('.')
                write('\n')
# Handling basic input and output.
class Input:
    """Line reader keeping the current line around for prefix/regex tests."""

    def __init__(self, name):
        """Open the file NAME for reading and announce it on stdout."""
        self.name = name
        # open() rather than file(), which only exists in Python 2.
        self.input = open(name)
        # Current line; defined from the start so that `begins', `match' and
        # `search' are usable before the first readline().
        self.line = ''
        self.line_count = 0
        sys.stdout.write("Reading %s\n" % name)

    def readline(self):
        """Read, remember and return the next line; '' at end of file."""
        self.line = self.input.readline()
        self.line_count += 1
        return self.line

    def warn(self, format, *args):
        """Print `NAME:LINE: message' on stdout, but only in verbose mode."""
        if run.verbose:
            sys.stdout.write('%s:%s: %s\n'
                             % (self.name, self.line_count, format % args))

    def die(self, format, *args):
        """Print `NAME:LINE: message' on stdout, then raise RuntimeError."""
        sys.stdout.write('%s:%s: %s\n'
                         % (self.name, self.line_count, format % args))
        # String exceptions are no longer valid; raising one ends in a
        # TypeError instead of the intended error.
        raise RuntimeError('Fatal')

    def begins(self, text):
        """Tell whether the current line starts with the literal TEXT."""
        return self.line.startswith(text)

    def match(self, pattern):
        """Return re.match of PATTERN against the current line."""
        return re.match(pattern, self.line)

    def search(self, pattern):
        """Return re.search of PATTERN in the current line."""
        return re.search(pattern, self.line)
class Output:
    """Output file whose `write' attribute writes text to it."""

    def __init__(self, name, noheader=False):
        """Create the file NAME and announce it on stdout.

        Unless NOHEADER is true, a C comment holding the generated-file
        notice and the license text is written first.
        """
        self.name = name
        # open() rather than file(), which only exists in Python 2.  The
        # file object is kept so that callers can flush or close it.
        self.file = open(name, 'w')
        self.write = self.file.write
        sys.stdout.write("Writing %s\n" % name)
        if not noheader:
            self.write("""\
/* DO NOT MODIFY THIS FILE! It was generated by `recode/tables.py'. */
/* Conversion of files between different charsets and surfaces.
Copyright © 1999 Free Software Foundation, Inc.
Contributed by François Pinard <pinard@iro.umontreal.ca>, 1993, 1997.
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public License
as published by the Free Software Foundation; either version 2 of the
License, or (at your option) any later version.
This library is distributed in the hope that it will be
useful, but WITHOUT ANY WARRANTY; without even the implied warranty
of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the Recode Library; see the file `COPYING.LIB'.
If not, write to the Free Software Foundation, Inc., 59 Temple Place -
Suite 330, Boston, MA 02111-1307, USA. */
""")
# When run as a script, hand the command-line arguments (without the
# program name) to the shared driver.
if __name__ == '__main__':
    main(*sys.argv[1:])
Jump to Line
Something went wrong with that request. Please try again.