Permalink
Switch branches/tags
Nothing to show
Find file Copy path
819b378 Mar 25, 2014
0 contributors

Users who have contributed to this file

890 lines (726 sloc) 16.7 KB
// Lexical analyzer for the C++ compiler - LexicalAnalyzer.cpp
#pragma warning(disable: 4786)
#include <nrc.h>
using namespace NRC;
#include "Limits.h"
#include "Application.h"
#include "LexicalAnalyzer.h"
// Keyword tables: every entry pairs a keyword's spelling with its lexeme
// code, so that a code can be looked up by name.
struct keywords
{
// spelling of the keyword
const char *name;
// lexeme code of the keyword
int code;
} kpp_words[] = {
// Preprocessor directive names (KPP_* codes).
// NOTE(review): unlike c_words and cpp_words below, these entries are NOT in
// ascending strcmp() order; a search that assumes sorted input will miss
// most of them.
{ "define", KPP_DEFINE },
{ "error", KPP_ERROR },
{ "undef", KPP_UNDEF },
{ "elif", KPP_ELIF },
{ "if", KPP_IF },
{ "include", KPP_INCLUDE },
{ "else", KPP_ELSE },
{ "ifdef", KPP_IFDEF },
{ "line", KPP_LINE },
{ "endif", KPP_ENDIF },
{ "ifndef", KPP_IFNDEF },
{ "pragma", KPP_PRAGMA }
},
// C keywords, listed in ascending strcmp() order.
c_words[] = {
{ "auto", KWAUTO }, { "break", KWBREAK },
{ "case", KWCASE }, { "char", KWCHAR },
{ "const", KWCONST }, { "continue", KWCONTINUE },
{ "default", KWDEFAULT }, { "do", KWDO },
{ "double", KWDOUBLE }, { "else", KWELSE },
{ "enum", KWENUM }, { "extern", KWEXTERN },
{ "float", KWFLOAT }, { "for", KWFOR },
{ "goto", KWGOTO }, { "if", KWIF },
{ "int", KWINT }, { "long", KWLONG },
{ "register", KWREGISTER }, { "return", KWRETURN },
{ "short", KWSHORT }, { "signed", KWSIGNED },
{ "sizeof", KWSIZEOF }, { "static", KWSTATIC },
{ "struct", KWSTRUCT }, { "switch", KWSWITCH },
{ "typedef", KWTYPEDEF }, { "union", KWUNION },
{ "unsigned", KWUNSIGNED }, { "void", KWVOID },
{ "volatile", KWVOLATILE }, { "while", KWWHILE }
},
// C++ keywords, listed in ascending strcmp() order.
// GetKeywordName() indexes this table with (code - KWASM), so the position
// of every entry matters, not just its content.
cpp_words[] = {
{ "asm", KWASM }, { "auto", KWAUTO },
{ "bool", KWBOOL }, { "break", KWBREAK },
{ "case", KWCASE }, { "catch", KWCATCH },
{ "char", KWCHAR }, { "class", KWCLASS },
{ "const", KWCONST }, { "const_cast", KWCONST_CAST },
{ "continue", KWCONTINUE }, { "default", KWDEFAULT },
{ "delete", KWDELETE }, { "do", KWDO },
{ "double", KWDOUBLE }, { "dynamic_cast", KWDYNAMIC_CAST },
{ "else", KWELSE }, { "enum", KWENUM },
{ "explicit", KWEXPLICIT }, { "export", KWEXPORT },
{ "extern", KWEXTERN }, { "false", KWFALSE },
{ "float", KWFLOAT }, { "for", KWFOR },
{ "friend", KWFRIEND }, { "goto", KWGOTO },
{ "if", KWIF }, { "inline", KWINLINE },
{ "int", KWINT }, { "long", KWLONG },
{ "mutable", KWMUTABLE }, { "namespace", KWNAMESPACE },
{ "new", KWNEW }, { "operator", KWOPERATOR },
{ "private", KWPRIVATE }, { "protected", KWPROTECTED },
{ "public", KWPUBLIC }, { "register", KWREGISTER },
{ "reinterpret_cast", KWREINTERPRET_CAST },
{ "return", KWRETURN }, { "short", KWSHORT },
{ "signed", KWSIGNED }, { "sizeof", KWSIZEOF },
{ "static", KWSTATIC }, { "static_cast", KWSTATIC_CAST },
{ "struct", KWSTRUCT }, { "switch", KWSWITCH },
{ "template", KWTEMPLATE }, { "this", KWTHIS },
{ "throw", KWTHROW }, { "true", KWTRUE },
{ "try", KWTRY }, { "typedef", KWTYPEDEF },
{ "typeid", KWTYPEID }, { "typename", KWTYPENAME },
{ "union", KWUNION }, { "unsigned", KWUNSIGNED },
{ "using", KWUSING }, { "virtual", KWVIRTUAL },
{ "void", KWVOID }, { "volatile", KWVOLATILE },
{ "wchar_t", KWWCHAR_T }, { "while", KWWHILE }
};
// Text of the lexeme currently being scanned. The Lexem* helpers in this
// file append to it or overwrite it, and NextLexem() copies it into the
// lexeme it returns.
static string lexbuf;
// Searches a keyword table for 'keyname'.
//   keyname - NUL-terminated spelling to look for
//   kmas    - first element of the table
//   szmas   - size of the table in BYTES (callers pass sizeof(table))
// Returns the code of the first entry whose name equals 'keyname', or -1
// when there is no such entry.
//
// The whole table is scanned on purpose. Stopping at the first entry that
// compares greater than 'keyname' is only valid for a sorted table, and
// kpp_words is not stored in strcmp() order: with such a cut-off only its
// first three directives could ever be found.
inline int LookupKeywordCode( const char *keyname, keywords *kmas, int szmas )
{
	// Entry count as a signed value, so the loop bound is not compared
	// across signed/unsigned types.
	const int count = szmas / (int)sizeof(keywords);

	for( int i = 0; i < count; i++ )
		if( strcmp( kmas[i].name, keyname ) == 0 )
			return kmas[i].code;

	return -1;
}
// Looks 'keyname' up among the preprocessor directive names (kpp_words)
// and returns whatever LookupKeywordCode() yields for it.
static int LookupKppKeywords( const char *keyname )
{
	const int tableBytes = sizeof( kpp_words );
	return LookupKeywordCode( keyname, kpp_words, tableBytes );
}
// Looks 'keyname' up among the C keywords (c_words) and returns whatever
// LookupKeywordCode() yields for it.
static int LookupCKeywords( const char *keyname )
{
	const int tableBytes = sizeof( c_words );
	return LookupKeywordCode( keyname, c_words, tableBytes );
}
// Looks 'keyname' up among the C++ keywords (cpp_words) and returns
// whatever LookupKeywordCode() yields for it.
static int LookupCPPKeywords( const char *keyname )
{
	const int tableBytes = sizeof( cpp_words );
	return LookupKeywordCode( keyname, cpp_words, tableBytes );
}
// Maps a keyword code back to its spelling.
// The code is turned into an index into cpp_words by subtracting KWASM;
// no range check is performed on the result.
// NOTE(review): presumably the KW* codes are consecutive and declared in
// the same order as the cpp_words entries - confirm against their
// declaration.
const char *GetKeywordName( int code )
{
	const int index = code - KWASM;
	return cpp_words[index].name;
}
// Skips spaces, tabs and newlines in 'ob'.
// Every newline that is skipped is reported through CppFileRead::NewLine().
// The character that ended the skipping is pushed back into the stream and
// also returned, so the caller gets a one-character look-ahead.
static int IgnoreNewlinesAndSpaces( BaseRead &ob )
{
	int ch;
	while( (ob >> ch) != EOF )
	{
		if( ch == '\n' )
		{
			((CppFileRead &)(ob)).NewLine();
			continue;
		}
		// note: a tab is treated as one character (not as 4 spaces)
		if( ch != ' ' && ch != '\t' )
			break;
	}

	ob << ch;	// give the last character read back to the stream
	return ch;
}
// Checks whether 'n' is one of the alternative operator spellings
// (and, or, not, ...). Returns the code of the operator it stands for,
// or 0 when 'n' is not such a spelling.
inline static int IsAlternativeName( const char *n )
{
	struct AltToken
	{
		const char *name;
		int tok;
	};

	static const AltToken table[] = {
		{ "and",    LOGIC_AND  },
		{ "and_eq", AND_ASSIGN },
		{ "bitand", '&'        },
		{ "bitor",  '|'        },
		{ "compl",  '~'        },
		{ "not",    '!'        },
		{ "not_eq", NOT_EQUAL  },
		{ "or",     LOGIC_OR   },
		{ "or_eq",  OR_ASSIGN  },
		{ "xor",    '^'        },
		{ "xor_eq", XOR_ASSIGN }
	};

	const AltToken *const end = table + sizeof(table) / sizeof(table[0]);
	for( const AltToken *p = table; p != end; ++p )
		if( strcmp( p->name, n ) == 0 )
			return p->tok;

	return 0;
}
// Scans the rest of an identifier.
// Characters accepted by IS_NAME are appended to lexbuf; the first
// character that is not accepted is pushed back into the stream.
// Always returns NAME.
inline static int LexemName( BaseRead &ob )
{
	int ch;
	while( (ob >> ch) != EOF && IS_NAME(ch) )
		lexbuf += (char)ch;

	ob << ch;
	return NAME;
}
// Scans an operator / punctuator lexeme.
// The longest operator that matches is taken; a look-ahead character that
// does not extend the operator is pushed back into the stream. lexbuf is
// set to the spelling that was read and the lexeme code is returned.
// The alternative spellings <% %> <: :> %: %:%: are recognised and yield
// the codes of { } [ ] # ## respectively. Any character that starts no
// known operator is returned as its own code; EOF is returned as EOF.
inline static int LexemOperator( BaseRead &ob )
{
	int c;
	ob >> c;

	switch( c )
	{
	case '-':
		ob >> c;
		switch( c )
		{
		case '-': lexbuf = "--"; return DECREMENT;
		case '=': lexbuf = "-="; return MINUS_ASSIGN;
		case '>':
			ob >> c;
			if( c == '*' )
			{
				lexbuf = "->*";
				return ARROW_POINT;
			}
			ob << c;
			lexbuf = "->";
			return ARROW;
		}
		ob << c;
		lexbuf = '-';
		return '-';

	case '+':
		ob >> c;
		switch( c )
		{
		case '+': lexbuf = "++"; return INCREMENT;
		case '=': lexbuf = "+="; return PLUS_ASSIGN;
		}
		ob << c;
		lexbuf = '+';
		return '+';

	case '*':
		ob >> c;
		if( c == '=' )
		{
			lexbuf = "*=";
			return MUL_ASSIGN;
		}
		ob << c;
		lexbuf = '*';
		return '*';

	case '/':
		ob >> c;
		if( c == '=' )
		{
			lexbuf = "/=";
			return DIV_ASSIGN;
		}
		ob << c;
		lexbuf = '/';
		return '/';

	case '%':
		ob >> c;
		switch( c )
		{
		case '=': lexbuf = "%="; return PERCENT_ASSIGN;
		case '>': lexbuf = "%>"; return '}';	// '%>' stands for '}'
		case ':':
			// '%:' stands for '#', '%:%:' stands for '##'
			ob >> c;
			if( c != '%' )
			{
				ob << c;
				lexbuf = "%:";
				return '#';
			}
			ob >> c;
			if( c != ':' )
			{
				// "%:%" without the final ':' is reported, then treated as "%:%:"
				theApp.Error("ïðîïóùåí ñèìâîë ':' â ëåêñåìå '%%:%%:'");
				ob << c;
			}
			lexbuf = "%:%:";
			return DOUBLE_SHARP;
		}
		ob << c;
		lexbuf = '%';
		return '%';

	case '<':
		ob >> c;
		switch( c )
		{
		case '=': lexbuf = "<="; return LESS_EQU;
		case '<':
			ob >> c;
			if( c == '=' )
			{
				lexbuf = "<<=";
				return LEFT_SHIFT_ASSIGN;
			}
			ob << c;
			lexbuf = "<<";
			return LEFT_SHIFT;
		case '%': lexbuf = "<%"; return '{';	// '<%' stands for '{'
		case ':': lexbuf = "<:"; return '[';	// '<:' stands for '['
		}
		ob << c;
		lexbuf = '<';
		return '<';

	case '>':
		ob >> c;
		switch( c )
		{
		case '=': lexbuf = ">="; return GREATER_EQU;
		case '>':
			ob >> c;
			if( c == '=' )
			{
				lexbuf = ">>=";
				return RIGHT_SHIFT_ASSIGN;
			}
			ob << c;
			lexbuf = ">>";
			return RIGHT_SHIFT;
		}
		ob << c;
		lexbuf = '>';
		return '>';

	case '=':
		ob >> c;
		if( c == '=' )
		{
			lexbuf = "==";
			return EQUAL;
		}
		ob << c;
		lexbuf = '=';
		return '=';

	case '!':
		ob >> c;
		if( c == '=' )
		{
			lexbuf = "!=";
			return NOT_EQUAL;
		}
		ob << c;
		lexbuf = '!';
		return '!';

	case '^':
		ob >> c;
		if( c == '=' )
		{
			lexbuf = "^=";
			return XOR_ASSIGN;
		}
		ob << c;
		lexbuf = '^';
		return '^';

	case '&':
		ob >> c;
		switch( c )
		{
		case '=': lexbuf = "&="; return AND_ASSIGN;
		case '&': lexbuf = "&&"; return LOGIC_AND;
		}
		ob << c;
		lexbuf = '&';
		return '&';

	case '|':
		ob >> c;
		switch( c )
		{
		case '=': lexbuf = "|="; return OR_ASSIGN;
		case '|': lexbuf = "||"; return LOGIC_OR;
		}
		ob << c;
		lexbuf = '|';
		return '|';

	case ':':
		ob >> c;
		switch( c )
		{
		case ':': lexbuf = "::"; return COLON_COLON;
		case '>': lexbuf = ":>"; return ']';	// ':>' stands for ']'
		}
		ob << c;
		lexbuf = ':';
		return ':';

	case '.':
		ob >> c;
		if( c == '*' )
		{
			lexbuf = ".*";
			return DOT_POINT;
		}
		if( c == '.' )
		{
			ob >> c;
			if( c != '.' )
			{
				// ".." is reported, then treated as "..."
				ob << c;
				theApp.Error( "ïðîïóùåíà '.' â îïåðàòîðå '...'");
			}
			lexbuf = "...";
			return ELLIPSES;
		}
		ob << c;
		lexbuf = '.';
		return '.';

	case '#':
		ob >> c;
		if( c == '#' )
		{
			lexbuf = "##";
			return DOUBLE_SHARP;
		}
		ob << c;
		lexbuf = "#";
		return '#';

	case EOF:
		lexbuf = "<êîíåö ôàéëà>";
		return c;

	default:
		// any other character is a lexeme of its own
		lexbuf = (char)c;
		return c;
	}
}
// Scans a string literal into lexbuf.
// On entry lexbuf already holds the opening quote (preceded by 'L' for a
// wide literal) and that quote has already been consumed from 'ob'.
// Literals separated only by spaces, tabs and newlines are merged into a
// single lexeme; a wide/narrow mismatch between neighbouring pieces is
// reported through theApp.Error(). The closing quote is appended once, at
// the very end.
// Always returns STRING (Lex() ignores the result and returns WSTRING when
// it calls this for a wide literal).
inline static int LexemString( BaseRead &ob )
{
register int c;
// true when the piece currently being read was introduced by 'L'
bool wstr = lexbuf.at(0) == 'L';
// outer loop: one iteration per literal that can be concatenated
for( ;; )
{
// inner loop: copy characters up to the closing quote
for(;;)
{
ob >> c;
if( c == '\"' )
break; // closing quote reached, this piece is complete
else if( c == '\\' )
{
int pc;
ob >> pc;
// \" and \\ are copied as a pair, so an escaped quote does not end the literal
if(pc == '\"' || pc == '\\')
{
lexbuf += '\\'; lexbuf += (char)pc;
continue;
}
else
ob << pc; // any other escape: only the backslash is stored on this pass
}
else if( c == '\n' || c == EOF )
{
// unterminated literal: report it and close the literal ourselves
ob << c;
theApp.Error( "íå õâàòàåò `\"' â êîíöå ñòðîêè" );
lexbuf += '\"';
return STRING;
}
lexbuf += c;
}
// look at the next lexeme; if it is another string literal it is
// concatenated with what has been read so far
c = IgnoreNewlinesAndSpaces( ob );
if( c == '\"' ) // narrow literal follows: go round the outer loop again
{
if( wstr )
theApp.Error("êîíêàòåíàöèÿ ñòðîê ðàçíûõ òèïîâ");
wstr = false;
ob >> c; // consume the opening quote that was only peeked at
}
// possibly a wchar_t literal
else if( c == 'L' )
{
// consume the 'L' and fetch the character that follows it
ob >> c, ob >> c;
if( c == '\"' )
{
if( !wstr )
theApp.Error("êîíêàòåíàöèÿ ñòðîê ðàçíûõ òèïîâ");
wstr = true;
}
else
{
// not a literal after all: push back both characters so that 'L' is
// read first again, then finish this lexeme
ob << c, ob << (c = 'L');
lexbuf += '\"';
return STRING;
}
}
else
{
// nothing left to concatenate: close the literal
lexbuf += '\"';
return STRING;
}
}
return STRING; // kill warning
}
// Returns 1 when 'c' is an octal digit ('0'..'7'), otherwise 0.
int isdigit8( int c )
{
	if( c < '0' )
		return 0;
	return c <= '7';
}
// Appends characters from 'ob' to lexbuf for as long as 'isfunc' accepts
// them. The first character that is rejected is pushed back into the
// stream.
static void ReadDigit( BaseRead &ob, int (*isfunc)(int) )
{
	int ch;
	while( (ob >> ch) != EOF && isfunc(ch) )
		lexbuf += (char)ch;

	ob << ch;
}
// Reads the suffix letters that follow a numeric literal.
// Two suffixes are recognised, in any order and in either case: 'L' and
// the one passed in 'suf' (the callers pass 'U' or 'F'). The first
// occurrence of each is appended to lexbuf; a repeated one only produces
// a warning. The first character that is neither is pushed back.
// Returns true when the 'suf' suffix was present.
static inline bool ReadDigitSuffix( BaseRead &ob, char suf )
{
	bool haveLong = false;	// an 'L' has been accepted
	bool haveSuf = false;	// the 'suf' letter has been accepted
	const int wanted = toupper(suf);

	int ch;
	for( ;; )
	{
		ob >> ch;
		const int upper = toupper(ch);

		if( upper == 'L' )
		{
			if( haveLong )
				theApp.Warning("ñóôôèêñ 'L' ó ÷èñëà óæå çàäàí");
			else
			{
				haveLong = true;
				lexbuf += (char)ch;
			}
			continue;
		}

		if( upper == wanted )
		{
			if( haveSuf )
				theApp.Warning("ñóôôèêñ '%c' ó ÷èñëà óæå çàäàí", suf);
			else
			{
				haveSuf = true;
				lexbuf += (char)ch;
			}
			continue;
		}

		break;	// not a suffix letter
	}

	ob << ch;
	return haveSuf;
}
// Scans a numeric literal into lexbuf with a small state machine.
//   state 0 - looking at the first character
//   state 1 - the literal starts with '0' (c holds that '0')
//   state 2 - a '.' has been read (c holds the '.')
//   state 3 - an exponent marker has been read (c holds 'e' or 'E')
// In states 1-3 the character held in c is appended to lexbuf on entry.
// Returns one of INTEGER8/10/16, UINTEGER8/10/16, LFLOAT, LDOUBLE, or -1
// when the input is a '.' that is not followed by a digit; in that case
// the '.' is pushed back so that it can be read as an operator.
inline static int LexemDigit( BaseRead &ob )
{
register int c;
int state = 0;
ob >> c;
for(;;)
switch(state)
{
case 0:
if(c == '0') state = 1;
else if(c == '.')
{
int p;
// peek at the character after the dot without consuming it
ob >> p; ob << p;
if( !isdigit(p) ) // just the '.' operator, not a number
{ ob << c; return -1; }
else
state = 2;
}
// decimal number starting with 1-9
else
{
lexbuf += c;
ReadDigit( ob, isdigit );
ob >> c;
if( c == '.' )
state = 2;
else if( c == 'e' || c == 'E' )
state = 3;
else
{
ob << c;
return ReadDigitSuffix(ob, 'U') ? UINTEGER10 : INTEGER10;
}
}
break;
case 1:
lexbuf += c;
ob >> c;
if( c == '.' ) state = 2;
else if( c == 'e' || c == 'E' ) state = 3;
else if( c == 'x' || c == 'X' )
{
// hexadecimal: "0x" followed by hex digits
lexbuf += c;
ReadDigit( ob, isxdigit );
// lexbuf still ending in 'x'/'X' means no hex digit followed the prefix
if( toupper( *(lexbuf.end() - 1) ) == 'X' )
theApp.Error("îòñóòñòâóåò 16-ðè÷íàÿ ïîñëåäîâàòåëüíîñòü ïîñëå '%c'",c);
return ReadDigitSuffix(ob, 'U') ? UINTEGER16 : INTEGER16;
}
else if( isdigit8(c) )
{
// octal: '0' followed by octal digits
lexbuf += c;
ReadDigit( ob, isdigit8 );
return ReadDigitSuffix(ob, 'U') ? UINTEGER8 : INTEGER8;
}
else
{
// a lone "0" is reported as a decimal integer
ob << c;
return ReadDigitSuffix(ob, 'U') ? UINTEGER10 : INTEGER10;
}
break;
case 2:
// this state is entered only after a dot
lexbuf += c;
ob >> c;
if( c == 'e' || c == 'E' )
state = 3;
else if( isdigit(c) )
{
// fractional digits
lexbuf += c;
ReadDigit(ob, isdigit);
ob >> c;
if( c == 'e' || c == 'E' )
state = 3;
else
{
// also the target of the goto below: push back the look-ahead
// character and read an optional suffix
read_suffix:
ob << c;
return ReadDigitSuffix(ob, 'F') ? LFLOAT : LDOUBLE;
}
}
// otherwise the number has been read and only a bare dot followed it
else
goto read_suffix;
break;
case 3:
// this state is entered after 'E'
lexbuf += c;
ob >> c;
// optional sign of the exponent
if( c == '+' || c == '-' )
lexbuf += c, (ob >> c);
if( !isdigit(c) )
{
// no exponent digits: report it and return what has been read
ob << c;
theApp.Error( "ïðîïóùåíî çíà÷åíèå ýêñïîíåíòû" );
return LDOUBLE;
}
else
{
lexbuf += c;
ReadDigit(ob, isdigit);
return ReadDigitSuffix(ob, 'F') ? LFLOAT : LDOUBLE;
}
}
}
// Scans a character constant into lexbuf.
// The opening quote has already been read by the caller. Characters are
// copied up to the closing quote or to the end of the line; their number
// does not matter here - the value is validated later.
// An empty constant ('') is reported and stored as '\0'. A constant cut
// short by a newline or EOF is reported and closed with a quote.
// Always returns CHARACTER.
inline static int LexemCharacter( BaseRead &ob )
{
	int c;

	ob >> c;
	if( c == '\'' )
	{
		// empty constant: substitute \0 automatically
		lexbuf += "\\0'";
		theApp.Error( "ïóñòîé ñèìâîë" );
		return CHARACTER;
	}
	ob << c;

	for( ;; )
	{
		ob >> c;

		if( c == '\'' )
		{
			lexbuf += (char)c;
			return CHARACTER;
		}

		if( c == '\n' || c == EOF )
		{
			ob << c;
			theApp.Error( "íå õâàòàåò `\'' â êîíöå ñòðîêè" );
			lexbuf += '\'';
			return CHARACTER;
		}

		if( c == '\\' )
		{
			int next;
			ob >> next;
			// \' and \\ are copied as a pair, so an escaped quote does not
			// end the constant
			if( next == '\'' || next == '\\' )
			{
				lexbuf += '\\';
				lexbuf += (char)next;
				continue;
			}
			ob << next;	// other escapes: only the backslash is stored here
		}

		lexbuf += (char)c;
	}

	return CHARACTER; // kill warning
}
// Extracts the next lexeme from 'ob'.
// Leading spaces, tabs and newlines are skipped, the position of the
// lexeme is stored in 'lxmPos', its text is left in lexbuf and its code is
// returned. Keywords are not recognised here: they come back as NAME.
static int Lex( BaseRead &ob, Position &lxmPos )
{
	lexbuf = "";

	// c is only a look-ahead: the character is still in the stream
	int c = IgnoreNewlinesAndSpaces(ob);
	lxmPos = ((CppFileRead&)ob).GetPosition();	// remember where the lexeme starts

	if( IS_NAME_START(c) )
	{
		ob >> c;	// consume the character that was peeked at
		lexbuf += (char)c;

		// an 'L' may introduce a wide character or wide string literal
		if( c == 'L' )
		{
			int next;
			ob >> next;
			switch( next )
			{
			case '\'':
				lexbuf += (char)next;
				LexemCharacter(ob);
				return WCHARACTER;
			case '\"':
				lexbuf += (char)next;
				LexemString(ob);
				return WSTRING;
			default:
				ob << next;
			}
		}

		LexemName(ob);

		// alternative operator spellings such as and, or, ...
		const int alt = IsAlternativeName( lexbuf.c_str() );
		if( alt != 0 )
			return alt;

		// a plain name; keywords are identified later
		return NAME;
	}

	if( isdigit(c) || c == '.' )
	{
		const int code = LexemDigit(ob);
		if( code != -1 )
			return code;
		// a '.' that starts no number: read it as an operator (., .*, ...)
		return LexemOperator(ob);
	}

	if( c == '\"' )
	{
		lexbuf += (char)c;
		ob >> c;
		return LexemString(ob);
	}

	if( c == '\'' )
	{
		lexbuf += (char)c;
		ob >> c;
		return LexemCharacter(ob);
	}

	return LexemOperator(ob);
}
// Debugging aid: writes the text of every lexeme in 'lc' to 'out',
// separated by spaces, after a header line, and ends with two endl.
ostream &operator<<( ostream &out, const LexemContainer &lc )
{
	out << "CALL \"operator<<( ostream &out, const LexemContainer &lc )\"\n ";

	list<Lexem>::const_iterator it = lc.begin();
	const list<Lexem>::const_iterator last = lc.end();
	while( it != last )
	{
		out << it->GetBuf() << ' ';
		++it;
	}

	out << endl;
	out << endl;
	return out;
}
// Returns the next lexeme and remembers it in lastLxm; the lexeme that was
// current before the call is kept in prevLxm.
// Sources are tried in this order:
//   1. backLxm, when its code is not 0 (it is cleared once taken);
//   2. lexemContainer, when one is attached - lexemes are taken from its
//      front and the container is detached as soon as it is empty;
//   3. the input stream, through Lex(). A NAME read from the stream is
//      turned into a keyword code when it is found among the C++ keywords.
// The returned reference refers to lastLxm.
const Lexem &LexicalAnalyzer::NextLexem()
{
	// the current lexeme becomes the previous one
	prevLxm = lastLxm;

	// a buffered lexeme takes priority
	if( backLxm.GetCode() != 0 )
	{
		lastLxm = backLxm;
		backLxm = Lexem();	// clear the buffered lexeme
		return lastLxm;
	}

	// then the attached container, if any
	if( lexemContainer != NULL )
	{
		if( !lexemContainer->empty() )
		{
			lastLxm = lexemContainer->front();
			lexemContainer->pop_front();
			if( lexemContainer->empty() )
				lexemContainer = NULL;
			return lastLxm;
		}

		// An attached but empty container has nothing to give; calling
		// front()/pop_front() on it would be undefined behaviour, so detach
		// it and read from the stream instead.
		lexemContainer = NULL;
	}

	// otherwise read from the file
	lastLxm.code = Lex(*inStream, lastLxm.pos);
	lastLxm.buf = lexbuf.c_str();

	// a name may actually be a keyword
	if( lastLxm.code == NAME )
	{
		const int keywordCode = LookupCPPKeywords( lastLxm.buf.c_str() );
		if( keywordCode != -1 )
			lastLxm.code = keywordCode;
	}
	return lastLxm;
}