-
Notifications
You must be signed in to change notification settings - Fork 2
/
RegExpr.pas
4215 lines (3907 loc) · 149 KB
/
RegExpr.pas
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
{*
TRegExpr class library
Delphi Regular Expressions
-
Copyright (c) 1999-2004 Andrey V. Sorokin, St.Petersburg, Russia
-
You may use this software in any kind of development,
including comercial, redistribute, and modify it freely,
under the following restrictions :
1. This software is provided as it is, without any kind of
warranty given. Use it at Your own risk.The author is not
responsible for any consequences of use of this software.
2. The origin of this software may not be mispresented, You
must not claim that You wrote the original software. If
You use this software in any kind of product, it would be
appreciated that there in a information box, or in the
documentation would be an acknowledgement like
-
Partial Copyright (c) 2004 Andrey V. Sorokin
http://RegExpStudio.com
mailto:anso@mail.ru
-
3. You may not have any income from distributing this source
(or altered version of it) to other developers. When You
use this product in a comercial package, the source may
not be charged seperatly.
4. Altered versions must be plainly marked as such, and must
not be misrepresented as being the original software.
5. RegExp Studio application and all the visual components as
well as documentation is not part of the TRegExpr library
and is not free for usage.
-
mailto:anso@mail.ru
http://RegExpStudio.com
http://anso.da.ru/
-
Documentation for NAADSM only covers the public methods to implement the
TRegExpr class of this library. Some of the documentation herein was added
from comments in a help file for the TRegExpr interface, downloaded from:
http://regexpstudio.com/TRegExpr/TRegExpr.html
-
Examples of use in other pas units: (from RegExpDefs.pas)
_regExpr.Expression := self[0];
_regExpr.Exec( test );
}
(*
Documentation generation tags begin with {* or ///
Replacing these with (* or // foils the documentation generator
*)
unit RegExpr;
interface
// ======== Determine compiler
{$IFDEF VER80} Sorry, TRegExpr is for 32-bits Delphi only. Delphi 1 is not supported (and whos really care today?!). {$ENDIF}
{$IFDEF VER90} {$DEFINE D2} {$ENDIF} // D2
{$IFDEF VER93} {$DEFINE D2} {$ENDIF} // CPPB 1
{$IFDEF VER100} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D3
{$IFDEF VER110} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // CPPB 3
{$IFDEF VER120} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D4
{$IFDEF VER130} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D5
{$IFDEF VER140} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D6
{$IFDEF VER150} {$DEFINE D7} {$DEFINE D6} {$DEFINE D5} {$DEFINE D4} {$DEFINE D3} {$DEFINE D2} {$ENDIF} // D7
// ======== Define base compiler options
{$BOOLEVAL OFF}
{$EXTENDEDSYNTAX ON}
{$LONGSTRINGS ON}
{$OPTIMIZATION ON}
{$IFDEF D6}
{$WARN SYMBOL_PLATFORM OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF D7}
{$WARN UNSAFE_CAST OFF} // Suppress .Net warnings
{$WARN UNSAFE_TYPE OFF} // Suppress .Net warnings
{$WARN UNSAFE_CODE OFF} // Suppress .Net warnings
{$ENDIF}
{$IFDEF FPC}
{$MODE DELPHI} // Delphi-compatible mode in FreePascal
{$ENDIF}
// ======== Define options for TRegExpr engine
{.$DEFINE UniCode} // Unicode support
{$DEFINE RegExpPCodeDump} // p-code dumping (see Dump method)
{$IFNDEF FPC} // the option is not supported in FreePascal
{$DEFINE reRealExceptionAddr} // exceptions will point to appropriate source line, not to Error procedure
{$ENDIF}
{$DEFINE ComplexBraces} // support braces in complex cases
{$IFNDEF UniCode} // the option applicable only for non-UniCode mode
{$DEFINE UseSetOfChar} // Significant optimization by using set of char
{$ENDIF}
{$IFDEF UseSetOfChar}
{$DEFINE UseFirstCharSet} // Fast skip between matches for r.e. that starts with determined set of chars
{$ENDIF}
// ======== Define Pascal-language options
// Define 'UseAsserts' option (do not edit this definitions).
// Asserts used to catch 'strange bugs' in TRegExpr implementation (when something goes
// completely wrong). You can swith asserts on/off with help of {$C+}/{$C-} compiler options.
{$IFDEF D3} {$DEFINE UseAsserts} {$ENDIF}
{$IFDEF FPC} {$DEFINE UseAsserts} {$ENDIF}
// Define 'use subroutine parameters default values' option (do not edit this definition).
{$IFDEF D4} {$DEFINE DefParam} {$ENDIF}
// Define 'OverMeth' options, to use method overloading (do not edit this definitions).
{$IFDEF D5} {$DEFINE OverMeth} {$ENDIF}
{$IFDEF FPC} {$DEFINE OverMeth} {$ENDIF}
uses
Classes, // TStrings in Split method
SysUtils; // Exception
type
{$IFDEF UniCode}
PRegExprChar = PWideChar;
RegExprString = WideString;
REChar = WideChar;
{$ELSE}
PRegExprChar = PChar;
RegExprString = AnsiString; //###0.952 was string
REChar = Char;
{$ENDIF}
TREOp = REChar; // internal p-code type //###0.933
PREOp = ^TREOp;
TRENextOff = integer; // internal Next "pointer" (offset to current p-code) //###0.933
PRENextOff = ^TRENextOff; // used for extracting Next "pointers" from compiled r.e. //###0.933
TREBracesArg = integer; // type of {m,n} arguments
PREBracesArg = ^TREBracesArg;
const
REOpSz = SizeOf (TREOp) div SizeOf (REChar); // size of p-code in RegExprString units
RENextOffSz = SizeOf (TRENextOff) div SizeOf (REChar); // size of Next 'pointer' -"-
REBracesArgSz = SizeOf (TREBracesArg) div SizeOf (REChar); // size of BRACES arguments -"-
type
TRegExprInvertCaseFunction = function (const Ch : REChar) : REChar
of object;
const
EscChar = '\'; /// 'Escape'-char ('\' in common r.e.) used for escaping metachars (\w, \d etc).
RegExprModifierI : boolean = False; /// default value for ModifierI
RegExprModifierR : boolean = True; /// default value for ModifierR
RegExprModifierS : boolean = True; /// default value for ModifierS
RegExprModifierG : boolean = True; /// default value for ModifierG
RegExprModifierM : boolean = False; /// default value for ModifierM
RegExprModifierX : boolean = False; /// default value for ModifierX
RegExprSpaceChars : RegExprString = /// default value for SpaceChars
' '#$9#$A#$D#$C;
RegExprWordChars : RegExprString = /// default value for WordChars
'0123456789' //###0.940
+ 'abcdefghijklmnopqrstuvwxyz'
+ 'ABCDEFGHIJKLMNOPQRSTUVWXYZ_';
RegExprLineSeparators : RegExprString =/// default value for LineSeparators
#$d#$a{$IFDEF UniCode}+#$b#$c#$2028#$2029#$85{$ENDIF}; //###0.947
RegExprLinePairedSeparator : RegExprString =/// default value for LinePairedSeparator
#$d#$a;
{ if You need Unix-styled line separators (only \n), then use:
RegExprLineSeparators = #$a;
RegExprLinePairedSeparator = '';
}
const
NSUBEXP = 15; // max number of subexpression //###0.929
// Cannot be more than NSUBEXPMAX
// Be carefull - don't use values which overflow CLOSE opcode
// (in this case you'll get compiler erorr).
// Big NSUBEXP will cause more slow work and more stack required
NSUBEXPMAX = 255; // Max possible value for NSUBEXP. //###0.945
// Don't change it! It's defined by internal TRegExpr design.
MaxBracesArg = $7FFFFFFF - 1; // max value for {n,m} arguments //###0.933
{$IFDEF ComplexBraces}
LoopStackMax = 10; // max depth of loops stack //###0.925
{$ENDIF}
TinySetLen = 3;
// if range includes more then TinySetLen chars, //###0.934
// then use full (32 bytes) ANYOFFULL instead of ANYOF[BUT]TINYSET
// !!! Attension ! If you change TinySetLen, you must
// change code marked as "//!!!TinySet"
type
{$IFDEF UseSetOfChar}
PSetOfREChar = ^TSetOfREChar;
TSetOfREChar = set of REChar;
{$ENDIF}
TRegExpr = class;
TRegExprReplaceFunction = function (ARegExpr : TRegExpr): string
of object;
/// Regular Expressions engine operating on an input string
TRegExpr = class
private
startp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr starting points
endp : array [0 .. NSUBEXP - 1] of PRegExprChar; // founded expr end points
{$IFDEF ComplexBraces}
LoopStack : array [1 .. LoopStackMax] of integer; // state before entering loop
LoopStackIdx : integer; // 0 - out of all loops
{$ENDIF}
// The "internal use only" fields to pass info from compile
// to execute that permits the execute phase to run lots faster on
// simple cases.
regstart : REChar; // char that must begin a match; '\0' if none obvious
reganch : REChar; // is the match anchored (at beginning-of-line only)?
regmust : PRegExprChar; // string (pointer into program) that match must include, or nil
regmlen : integer; // length of regmust string
// Regstart and reganch permit very fast decisions on suitable starting points
// for a match, cutting down the work a lot. Regmust permits fast rejection
// of lines that cannot possibly match. The regmust tests are costly enough
// that regcomp() supplies a regmust only if the r.e. contains something
// potentially expensive (at present, the only such thing detected is * or +
// at the start of the r.e., which can involve a lot of backup). Regmlen is
// supplied because the test in regexec() needs it and regcomp() is computing
// it anyway.
{$IFDEF UseFirstCharSet} //###0.929
FirstCharSet : TSetOfREChar;
{$ENDIF}
// work variables for Exec's routins - save stack in recursion}
reginput : PRegExprChar; // String-input pointer.
fInputStart : PRegExprChar; // Pointer to first char of input string.
fInputEnd : PRegExprChar; // Pointer to char AFTER last char of input string
// work variables for compiler's routines
regparse : PRegExprChar; // Input-scan pointer.
regnpar : integer; // count.
regdummy : char;
regcode : PRegExprChar; // Code-emit pointer; @regdummy = don't.
regsize : integer; // Code size.
regexpbeg : PRegExprChar; // only for error handling. Contains
// pointer to beginning of r.e. while compiling
fExprIsCompiled : boolean; // true if r.e. successfully compiled
// programm is essentially a linear encoding
// of a nondeterministic finite-state machine (aka syntax charts or
// "railroad normal form" in parsing technology). Each node is an opcode
// plus a "next" pointer, possibly plus an operand. "Next" pointers of
// all nodes except BRANCH implement concatenation; a "next" pointer with
// a BRANCH on both ends of it is connecting two alternatives. (Here we
// have one of the subtle syntax dependencies: an individual BRANCH (as
// opposed to a collection of them) is never concatenated with anything
// because of operator precedence.) The operand of some types of node is
// a literal string; for others, it is a node leading into a sub-FSM. In
// particular, the operand of a BRANCH node is the first node of the branch.
// (NB this is *not* a tree structure: the tail of the branch connects
// to the thing following the set of BRANCHes.) The opcodes are:
programm : PRegExprChar; // Unwarranted chumminess with compiler.
fExpression : PRegExprChar; // source of compiled r.e.
fInputString : PRegExprChar; // input string
fLastError : integer; // see Error, LastError
fModifiers : integer; // modifiers
fCompModifiers : integer; // compiler's copy of modifiers
fProgModifiers : integer; // modifiers values from last programm compilation
fSpaceChars : RegExprString; //###0.927
fWordChars : RegExprString; //###0.929
fInvertCase : TRegExprInvertCaseFunction; //###0.927
fLineSeparators : RegExprString; //###0.941
fLinePairedSeparatorAssigned : boolean;
fLinePairedSeparatorHead,
fLinePairedSeparatorTail : REChar;
{$IFNDEF UniCode}
fLineSeparatorsSet : set of REChar;
{$ENDIF}
procedure InvalidateProgramm;
// Mark programm as have to be [re]compiled
function IsProgrammOk : boolean; //###0.941
// Check if we can use precompiled r.e. or
// [re]compile it if something changed
function GetExpression : RegExprString;
procedure SetExpression (const s : RegExprString);
function GetModifierStr : RegExprString;
class function ParseModifiersStr (const AModifiers : RegExprString;
var AModifiersInt : integer) : boolean; //###0.941 class function now
// Parse AModifiers string and return true and set AModifiersInt
// if it's in format 'ismxrg-ismxrg'.
procedure SetModifierStr (const AModifiers : RegExprString);
function GetModifier (AIndex : integer) : boolean;
procedure SetModifier (AIndex : integer; ASet : boolean);
procedure Error (AErrorID : integer); virtual; // error handler.
// Default handler raise exception ERegExpr with
// Message = ErrorMsg (AErrorID), ErrorCode = AErrorID
// and CompilerErrorPos = value of property CompilerErrorPos.
{==================== Compiler section ===================}
function CompileRegExpr (exp : PRegExprChar) : boolean;
// compile a regular expression into internal code
procedure Tail (p : PRegExprChar; val : PRegExprChar);
// set the next-pointer at the end of a node chain
procedure OpTail (p : PRegExprChar; val : PRegExprChar);
// regoptail - regtail on operand of first argument; nop if operandless
function EmitNode (op : TREOp) : PRegExprChar;
// regnode - emit a node, return location
procedure EmitC (b : REChar);
// emit (if appropriate) a byte of code
procedure InsertOperator (op : TREOp; opnd : PRegExprChar; sz : integer); //###0.90
// insert an operator in front of already-emitted operand
// Means relocating the operand.
function ParseReg (paren : integer; var flagp : integer) : PRegExprChar;
// regular expression, i.e. main body or parenthesized thing
function ParseBranch (var flagp : integer) : PRegExprChar;
// one alternative of an | operator
function ParsePiece (var flagp : integer) : PRegExprChar;
// something followed by possible [*+?]
function ParseAtom (var flagp : integer) : PRegExprChar;
// the lowest level
function GetCompilerErrorPos : integer;
// current pos in r.e. - for error hanling
{$IFDEF UseFirstCharSet} //###0.929
procedure FillFirstCharSet (prog : PRegExprChar);
{$ENDIF}
{===================== Mathing section ===================}
function regrepeat (p : PRegExprChar; AMax : integer) : integer;
// repeatedly match something simple, report how many
function regnext (p : PRegExprChar) : PRegExprChar;
// dig the "next" pointer out of a node
function MatchPrim (prog : PRegExprChar) : boolean;
// recursively matching routine
function ExecPrim (AOffset: integer) : boolean;
// Exec for stored InputString
{$IFDEF RegExpPCodeDump}
function DumpOp (op : REChar) : RegExprString;
{$ENDIF}
function GetSubExprMatchCount : integer;
function GetMatchPos (Idx : integer) : integer;
function GetMatchLen (Idx : integer) : integer;
function GetMatch (Idx : integer) : RegExprString;
function GetInputString : RegExprString;
procedure SetInputString (const AInputString : RegExprString);
{$IFNDEF UseSetOfChar}
function StrScanCI (s : PRegExprChar; ch : REChar) : PRegExprChar; //###0.928
{$ENDIF}
procedure SetLineSeparators (const AStr : RegExprString);
procedure SetLinePairedSeparator (const AStr : RegExprString);
function GetLinePairedSeparator : RegExprString;
public
constructor Create;
destructor Destroy; override;
class function VersionMajor : integer; //###0.944
class function VersionMinor : integer; //###0.944
/// Read/Writes the regular expression
property Expression : RegExprString read GetExpression write SetExpression;
// Regular expression.
// For optimization, TRegExpr will automatically compiles it into 'P-code'
// (You can see it with help of Dump method) and stores in internal
// structures. Real [re]compilation occures only when it really needed -
// while calling Exec[Next], Substitute, Dump, etc
// and only if Expression or other P-code affected properties was changed
// after last [re]compilation.
// If any errors while [re]compilation occures, Error method is called
// (by default Error raises exception - see below)
{*
Set/get default values of more than one r.e.syntax modifiers at a time
For example ModifierStr := 'i-x' will switch on modifier /i, switch off /x and leave unchanged others.
@throws Setting an unsupported modifier raises exception ERegExpr
}
property ModifierStr : RegExprString read GetModifierStr write SetModifierStr;
// Set/get default values of r.e.syntax modifiers. Modifiers in
// r.e. (?ismx-ismx) will replace this default values.
// If you try to set unsupported modifier, Error will be called
// (by defaul Error raises exception ERegExpr).
/// True = case insensitive matching, False = case sensitive matching
property ModifierI : boolean index 1 read GetModifier write SetModifier;
// Modifier /i - caseinsensitive, initialized from RegExprModifierI
/// Russian syntax (character set) extensions
property ModifierR : boolean index 2 read GetModifier write SetModifier;
// Modifier /r - use r.e.syntax extended for russian,
// (was property ExtSyntaxEnabled in previous versions)
// If true, then à-ÿ additional include russian letter '¸',
// À-ß additional include '¨', and à-ß include all russian symbols.
// You have to turn it off if it may interfere with you national alphabet.
// , initialized from RegExprModifierR
{*
Treat string as single line. That is, change ".'' to match any character whatsoever,
even a line separators, which it normally would not match.
}
property ModifierS : boolean index 3 read GetModifier write SetModifier;
// Modifier /s - '.' works as any char (else as [^\n]),
// , initialized from RegExprModifierS
{*
Greedy style True, Non-Greedy False. So if so if ModifierG = False, then
all '*' works as '*?', all '+' as '+?' and so on to match as little text as possible
}
property ModifierG : boolean index 4 read GetModifier write SetModifier;
// Switching off modifier /g switchs all operators in
// non-greedy style, so if ModifierG = False, then
// all '*' works as '*?', all '+' as '+?' and so on.
// , initialized from RegExprModifierG
{*
If true matching is restricted to the current line, else the entire string
is searched (ignoring line break characters)
}
property ModifierM : boolean index 5 read GetModifier write SetModifier;
// Treat string as multiple lines. That is, change `^' and `$' from
// matching at only the very start or end of the string to the start
// or end of any line anywhere within the string.
// , initialized from RegExprModifierM
{*
Extend your pattern's legibility by permitting whitespace and comments.
The modifier /x tells the TRegExpr to ignore whitespace that is neither
backslashed nor within a character class. You can use this to break up your
regular expression into (slightly) more readable parts.
The # character is also treated as a metacharacter introducing a comment,
for example:
(abc) # comment 1
| # You can use spaces to format r.e. - TRegExpr ignores it
(efg) # comment 2
If you want real whitespace or # characters in the pattern
(outside a character class, where they are unaffected by /x),
that you'll either have to escape them or encode them using octal or
hex escapes. Taken together, these features go a long way towards making
regular expressions text more readable.
}
property ModifierX : boolean index 6 read GetModifier write SetModifier;
// Modifier /x - eXtended syntax, allow r.e. text formatting,
// see description in the help. Initialized from RegExprModifierX
function Exec (const AInputString : RegExprString) : boolean; {$IFDEF OverMeth} overload;
{$IFNDEF FPC} // I do not know why FreePascal cannot overload methods with empty param list
function Exec : boolean; overload; //###0.949
{$ENDIF}
function Exec (AOffset: integer) : boolean; overload; //###0.949
{$ENDIF}
// match a programm against a string AInputString
// !!! Exec store AInputString into InputString property
// For Delphi 5 and higher available overloaded versions - first without
// parameter (uses already assigned to InputString property value)
// and second that has integer parameter and is same as ExecPos
function ExecNext : boolean;
// find next match:
// ExecNext;
// works same as
// if MatchLen [0] = 0 then ExecPos (MatchPos [0] + 1)
// else ExecPos (MatchPos [0] + MatchLen [0]);
// but it's more simpler !
// Raises exception if used without preceeding SUCCESSFUL call to
// Exec* (Exec, ExecPos, ExecNext). So You always must use something like
// if Exec (InputString) then repeat { proceed results} until not ExecNext;
function ExecPos (AOffset: integer {$IFDEF DefParam}= 1{$ENDIF}) : boolean;
// find match for InputString starting from AOffset position
// (AOffset=1 - first char of InputString)
/// Returns/Sets current input string (from last Exec call or last assign to this property).
property InputString : RegExprString read GetInputString write SetInputString;
// Any assignment to this property clear Match* properties !
function Substitute (const ATemplate : RegExprString) : RegExprString;
// Returns ATemplate with '$&' or '$0' replaced by whole r.e.
// occurence and '$n' replaced by occurence of subexpression #n.
// Since v.0.929 '$' used instead of '\' (for future extensions
// and for more Perl-compatibility) and accept more then one digit.
// If you want place into template raw '$' or '\', use prefix '\'
// Example: '1\$ is $2\\rub\\' -> '1$ is <Match[2]>\rub\'
// If you want to place raw digit after '$n' you must delimit
// n with curly braces '{}'.
// Example: 'a$12bc' -> 'a<Match[12]>bc'
// 'a${1}2bc' -> 'a<Match[1]>2bc'.
procedure Split (AInputStr : RegExprString; APieces : TStrings);
// Split AInputStr into APieces by r.e. occurencies
// Internally calls Exec[Next]
function Replace (AInputStr : RegExprString;
const AReplaceStr : RegExprString;
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) //###0.946
: RegExprString; {$IFDEF OverMeth} overload;
function Replace (AInputStr : RegExprString;
AReplaceFunc : TRegExprReplaceFunction)
: RegExprString; overload;
{$ENDIF}
function ReplaceEx (AInputStr : RegExprString;
AReplaceFunc : TRegExprReplaceFunction)
: RegExprString;
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
// If AUseSubstitution is true, then AReplaceStr will be used
// as template for Substitution methods.
// For example:
// Expression := '({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*';
// Replace ('BLOCK( test1)', 'def "$1" value "$2"', True);
// will return: def 'BLOCK' value 'test1'
// Replace ('BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
// Internally calls Exec[Next]
// Overloaded version and ReplaceEx operate with call-back function,
// so You can implement really complex functionality.
{*
Number of subexpressions found in last Exec* call.
If there are no subexpr. but whole expr was found (Exec* returned True),
then SubExprMatchCount=0, if no subexpressions nor whole
r.e. found (Exec* returned false) then SubExprMatchCount=-1.
For example: Expression := '(1)?2(3)?';
Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
Exec ('2'): SubExprMatchCount=0, Match[0]='2'
Exec ('7') - return False: SubExprMatchCount=-1
}
property SubExprMatchCount : integer read GetSubExprMatchCount;
// Number of subexpressions has been found in last Exec* call.
// If there are no subexpr. but whole expr was found (Exec* returned True),
// then SubExprMatchCount=0, if no subexpressions nor whole
// r.e. found (Exec* returned false) then SubExprMatchCount=-1.
// Note, that some subexpr. may be not found and for such
// subexpr. MathPos=MatchLen=-1 and Match=''.
// For example: Expression := '(1)?2(3)?';
// Exec ('123'): SubExprMatchCount=2, Match[0]='123', [1]='1', [2]='3'
// Exec ('12'): SubExprMatchCount=1, Match[0]='12', [1]='1'
// Exec ('23'): SubExprMatchCount=2, Match[0]='23', [1]='', [2]='3'
// Exec ('2'): SubExprMatchCount=0, Match[0]='2'
// Exec ('7') - return False: SubExprMatchCount=-1
{*
Returns the starting character position of the current match in InputString
Whole R.E. Idx = 0, first subexpr Idx = 1 etc.
@param Idx Index for Regular Expression sub pattern
@return Input string position number, -1 if in r.e. no such subexpr.
or this subexpr. not found in input string.
}
property MatchPos [Idx : integer] : integer read GetMatchPos;
// pos of entrance subexpr. #Idx into tested in last Exec*
// string. First subexpr. have Idx=1, last - MatchCount,
// whole r.e. have Idx=0.
// Returns -1 if in r.e. no such subexpr. or this subexpr.
// not found in input string.
{*
Returns the number of characters in the match of current input to R.E. Idx
Whole R.E. Idx = 0, first subexpr Idx = 1 etc.
@param Idx Index for Regular Expression sub pattern
@return Returns Number of characters matching, -1 if in r.e. no such subexpr.
or this subexpr. not found in input string.
@comment Remember - MatchLen may be 0 (if r.e. match empty string)
}
property MatchLen [Idx : integer] : integer read GetMatchLen;
// len of entrance subexpr. #Idx r.e. into tested in last Exec*
// string. First subexpr. have Idx=1, last - MatchCount,
// whole r.e. have Idx=0.
{*
Uses the subexpression at Idx to attempt matching the current inputstring
@return '' if r.e. no such subexpr. or this subexpr. not found in input string.
if a match is found then the text is returned.
}
property Match [Idx : integer] : RegExprString read GetMatch;
// == copy (InputString, MatchPos [Idx], MatchLen [Idx])
function LastError : integer;
// Returns ID of last error, 0 if no errors (unusable if
// Error method raises exception) and clear internal status
// into 0 (no errors).
function ErrorMsg (AErrorID : integer) : RegExprString; virtual;
// Returns Error message for error with ID = AErrorID.
/// Returns pos in r.e. where compiler stopped. Usefull for error diagnostics
property CompilerErrorPos : integer read GetCompilerErrorPos;
/// Contains chars, treated as /s (initially filled with RegExprSpaceChars global constant)
property SpaceChars : RegExprString read fSpaceChars write fSpaceChars; //###0.927
/// Contains chars, treated as /w (initially filled with RegExprWordChars global constant)
property WordChars : RegExprString read fWordChars write fWordChars; //###0.929
/// line separators (like \n in Unix)
property LineSeparators : RegExprString read fLineSeparators write SetLineSeparators; //###0.941
/// paired line separator (like \r\n in DOS and Windows). Must contain exactly two chars or no chars at all
property LinePairedSeparator : RegExprString read GetLinePairedSeparator write SetLinePairedSeparator; //###0.941
class function InvertCaseFunction (const Ch : REChar) : REChar;
// Converts Ch into upper case if it in lower case or in lower
// if it in upper (uses current system local setings)
/// Set this property if you want to override case-insensitive functionality.
property InvertCase : TRegExprInvertCaseFunction read fInvertCase write fInvertCase; //##0.935
// Create set it to RegExprInvertCaseFunction (InvertCaseFunction by default)
procedure Compile; //###0.941
// [Re]compile r.e. Usefull for example for GUI r.e. editors (to check
// all properties validity).
{$IFDEF RegExpPCodeDump}
function Dump : RegExprString;
// dump a compiled regexp in vaguely comprehensible form
{$ENDIF}
end;
///Class for holding reg expr parsing exceptions
ERegExpr = class (Exception)
public
ErrorCode : integer; /// Error code enumeration, see TRegExpr.ErrorMsg
CompilerErrorPos : integer; /// Position in reg expr of syntax error
end;
const
/// defaul for InvertCase property
RegExprInvertCaseFunction : TRegExprInvertCaseFunction = {$IFDEF FPC} nil {$ELSE} TRegExpr.InvertCaseFunction{$ENDIF};
function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
// true if string AInputString match regular expression ARegExpr
// ! will raise exeption if syntax errors in ARegExpr
procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
// Split AInputStr into APieces by r.e. ARegExpr occurencies
function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString; //###0.947
// Returns AInputStr with r.e. occurencies replaced by AReplaceStr
// If AUseSubstitution is true, then AReplaceStr will be used
// as template for Substitution methods.
// For example:
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"', True)
// will return: def 'BLOCK' value 'test1'
// ReplaceRegExpr ('({-i}block|var)\s*\(\s*([^ ]*)\s*\)\s*',
// 'BLOCK( test1)', 'def "$1" value "$2"')
// will return: def "$1" value "$2"
function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
// Replace all metachars with its safe representation,
// for example 'abc$cd.(' converts into 'abc\$cd\.\('
// This function usefull for r.e. autogeneration from
// user input
function RegExprSubExpressions (const ARegExpr : string;
ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : integer;
// Makes list of subexpressions found in ARegExpr r.e.
// In ASubExps every item represent subexpression,
// from first to last, in format:
// String - subexpression text (without '()')
// low word of Object - starting position in ARegExpr, including '('
// if exists! (first position is 1)
// high word of Object - length, including starting '(' and ending ')'
// if exist!
// AExtendedSyntax - must be True if modifier /m will be On while
// using the r.e.
// Usefull for GUI editors of r.e. etc (You can find example of using
// in TestRExp.dpr project)
// Returns
// 0 Success. No unbalanced brackets was found;
// -1 There are not enough closing brackets ')';
// -(n+1) At position n was found opening '[' without //###0.942
// corresponding closing ']';
// n At position n was found closing bracket ')' without
// corresponding opening '('.
// If Result <> 0, then ASubExpr can contain empty items or illegal ones
implementation
uses
Windows; // CharUpper/Lower
const
TRegExprVersionMajor : integer = 0;
TRegExprVersionMinor : integer = 952;
// TRegExpr.VersionMajor/Minor return values of this constants
MaskModI = 1; // modifier /i bit in fModifiers
MaskModR = 2; // -"- /r
MaskModS = 4; // -"- /s
MaskModG = 8; // -"- /g
MaskModM = 16; // -"- /m
MaskModX = 32; // -"- /x
{$IFDEF UniCode}
XIgnoredChars = ' '#9#$d#$a;
{$ELSE}
XIgnoredChars = [' ', #9, #$d, #$a];
{$ENDIF}
{=============================================================}
{=================== WideString functions ====================}
{=============================================================}
{$IFDEF UniCode}
function StrPCopy (Dest: PRegExprChar; const Source: RegExprString): PRegExprChar;
var
i, Len : Integer;
begin
Len := length (Source); //###0.932
for i := 1 to Len do
Dest [i - 1] := Source [i];
Dest [Len] := #0;
Result := Dest;
end; { of function StrPCopy
--------------------------------------------------------------}
function StrLCopy (Dest, Source: PRegExprChar; MaxLen: Cardinal): PRegExprChar;
var i: Integer;
begin
for i := 0 to MaxLen - 1 do
Dest [i] := Source [i];
Result := Dest;
end; { of function StrLCopy
--------------------------------------------------------------}
function StrLen (Str: PRegExprChar): Cardinal;
begin
Result:=0;
while Str [result] <> #0
do Inc (Result);
end; { of function StrLen
--------------------------------------------------------------}
function StrPos (Str1, Str2: PRegExprChar): PRegExprChar;
var n: Integer;
begin
Result := nil;
n := Pos (RegExprString (Str2), RegExprString (Str1));
if n = 0
then EXIT;
Result := Str1 + n - 1;
end; { of function StrPos
--------------------------------------------------------------}
function StrLComp (Str1, Str2: PRegExprChar; MaxLen: Cardinal): Integer;
var S1, S2: RegExprString;
begin
S1 := Str1;
S2 := Str2;
if Copy (S1, 1, MaxLen) > Copy (S2, 1, MaxLen)
then Result := 1
else
if Copy (S1, 1, MaxLen) < Copy (S2, 1, MaxLen)
then Result := -1
else Result := 0;
end; { function StrLComp
--------------------------------------------------------------}
function StrScan (Str: PRegExprChar; Chr: WideChar): PRegExprChar;
begin
Result := nil;
while (Str^ <> #0) and (Str^ <> Chr)
do Inc (Str);
if (Str^ <> #0)
then Result := Str;
end; { of function StrScan
--------------------------------------------------------------}
{$ENDIF}
{=============================================================}
{===================== Global functions ======================}
{=============================================================}
/// Global function - true if AInputString matches reg expr ARegExpr, raises exeption if syntax errors in ARegExpr
function ExecRegExpr (const ARegExpr, AInputStr : RegExprString) : boolean;
var r : TRegExpr;
begin
r := TRegExpr.Create;
try
r.Expression := ARegExpr;
Result := r.Exec (AInputStr);
finally r.Free;
end;
end; { of function ExecRegExpr
--------------------------------------------------------------}
/// Global function - Splits AInputStr into APieces by r.e. ARegExpr occurances
procedure SplitRegExpr (const ARegExpr, AInputStr : RegExprString; APieces : TStrings);
var r : TRegExpr;
begin
APieces.Clear;
r := TRegExpr.Create;
try
r.Expression := ARegExpr;
r.Split (AInputStr, APieces);
finally r.Free;
end;
end; { of procedure SplitRegExpr
--------------------------------------------------------------}
{*
Global function - Returns AInputStr with r.e. occurencies replaced by AReplaceStr.
If AUseSubstitution is true, then AReplaceStr will be used as template for Substitution methods.
}
function ReplaceRegExpr (const ARegExpr, AInputStr, AReplaceStr : RegExprString;
AUseSubstitution : boolean{$IFDEF DefParam}= False{$ENDIF}) : RegExprString;
begin
with TRegExpr.Create do try
Expression := ARegExpr;
Result := Replace (AInputStr, AReplaceStr, AUseSubstitution);
finally Free;
end;
end; { of function ReplaceRegExpr
--------------------------------------------------------------}
{*
Global function - Replaces all metachars with its safe representation,
for example 'abc$cd.(' converts into 'abc\$cd\.\('
Function usefull for r.e. autogeneration from user input
}
function QuoteRegExprMetaChars (const AStr : RegExprString) : RegExprString;
const
RegExprMetaSet : RegExprString = '^$.[()|?+*'+EscChar+'{'
+ ']}'; // - this last are additional to META.
// Very similar to META array, but slighly changed.
// !Any changes in META array must be synchronized with this set.
var
i, i0, Len : integer;
begin
Result := '';
Len := length (AStr);
i := 1;
i0 := i;
while i <= Len do begin
if Pos (AStr [i], RegExprMetaSet) > 0 then begin
Result := Result + System.Copy (AStr, i0, i - i0)
+ EscChar + AStr [i];
i0 := i + 1;
end;
inc (i);
end;
Result := Result + System.Copy (AStr, i0, MaxInt); // Tail
end; { of function QuoteRegExprMetaChars
--------------------------------------------------------------}
{*
Global function - Makes list of subexpressions found in ARegExpr r.e.
In ASubExps every item represent subexpression, from first to last, in format:
String - subexpression text (without '()') low word of Object -
starting position in ARegExpr, including '(' if exists! (first position is 1)
high word of Object - length, including starting '(' and ending ')' if exist!
AExtendedSyntax - must be True if modifier /x will be On while using the r.e.
Usefull for GUI editors of r.e. etc. See class help file for more details.
}
function RegExprSubExpressions (const ARegExpr : string;
ASubExprs : TStrings; AExtendedSyntax : boolean{$IFDEF DefParam}= False{$ENDIF}) : integer;
type
TStackItemRec = record //###0.945
SubExprIdx : integer;
StartPos : integer;
end;
TStackArray = packed array [0 .. NSUBEXPMAX - 1] of TStackItemRec;
var
Len, SubExprLen : integer;
i, i0 : integer;
Modif : integer;
Stack : ^TStackArray; //###0.945
StackIdx, StackSz : integer;
begin
Result := 0; // no unbalanced brackets found at this very moment
ASubExprs.Clear; // I don't think that adding to non empty list
// can be usefull, so I simplified algorithm to work only with empty list
Len := length (ARegExpr); // some optimization tricks
// first we have to calculate number of subexpression to reserve
// space in Stack array (may be we'll reserve more then need, but
// it's faster then memory reallocation during parsing)
StackSz := 1; // add 1 for entire r.e.
for i := 1 to Len do
if ARegExpr [i] = '('
then inc (StackSz);
// SetLength (Stack, StackSz); //###0.945
GetMem (Stack, SizeOf (TStackItemRec) * StackSz);
try
StackIdx := 0;
i := 1;
while (i <= Len) do begin
case ARegExpr [i] of
'(': begin
if (i < Len) and (ARegExpr [i + 1] = '?') then begin
// this is not subexpression, but comment or other
// Perl extension. We must check is it (?ismxrg-ismxrg)
// and change AExtendedSyntax if /x is changed.
inc (i, 2); // skip '(?'
i0 := i;
while (i <= Len) and (ARegExpr [i] <> ')')
do inc (i);
if i > Len
then Result := -1 // unbalansed '('
else
if TRegExpr.ParseModifiersStr (System.Copy (ARegExpr, i, i - i0), Modif)
then AExtendedSyntax := (Modif and MaskModX) <> 0;
end
else begin // subexpression starts
ASubExprs.Add (''); // just reserve space
with Stack [StackIdx] do begin
SubExprIdx := ASubExprs.Count - 1;
StartPos := i;
end;
inc (StackIdx);
end;
end;
')': begin
if StackIdx = 0
then Result := i // unbalanced ')'
else begin
dec (StackIdx);
with Stack [StackIdx] do begin
SubExprLen := i - StartPos + 1;
ASubExprs.Objects [SubExprIdx] :=
TObject (StartPos or (SubExprLen ShL 16));
ASubExprs [SubExprIdx] := System.Copy (
ARegExpr, StartPos + 1, SubExprLen - 2); // add without brackets
end;
end;
end;
EscChar: inc (i); // skip quoted symbol
'[': begin
// we have to skip character ranges at once, because they can
// contain '#', and '#' in it must NOT be recognized as eXtended
// comment beginning!
i0 := i;
inc (i);
if ARegExpr [i] = ']' // cannot be 'emty' ranges - this interpretes
then inc (i); // as ']' by itself
while (i <= Len) and (ARegExpr [i] <> ']') do
if ARegExpr [i] = EscChar //###0.942
then inc (i, 2) // skip 'escaped' char to prevent stopping at '\]'
else inc (i);
if (i > Len) or (ARegExpr [i] <> ']') //###0.942
then Result := - (i0 + 1); // unbalansed '[' //###0.942
end;
'#': if AExtendedSyntax then begin
// skip eXtended comments
while (i <= Len) and (ARegExpr [i] <> #$d) and (ARegExpr [i] <> #$a)
// do not use [#$d, #$a] due to UniCode compatibility
do inc (i);
while (i + 1 <= Len) and ((ARegExpr [i + 1] = #$d) or (ARegExpr [i + 1] = #$a))
do inc (i); // attempt to work with different kinds of line separators
// now we are at the line separator that must be skipped.
end;
// here is no 'else' clause - we simply skip ordinary chars
end; // of case
inc (i); // skip scanned char
// ! can move after Len due to skipping quoted symbol
end;
// check brackets balance
if StackIdx <> 0
then Result := -1; // unbalansed '('