-
Notifications
You must be signed in to change notification settings - Fork 0
/
rules.py
141 lines (134 loc) · 3.95 KB
/
rules.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# Lexeme -> token-name table for the fixed (non-alphanumeric) terminals of the
# URL grammar. Insertion order is kept exactly as before: scheme prefixes
# first, then the single-character punctuation marks.
terminals_symbols = {
    # Scheme prefixes.
    "http://": "http",
    "ftp://": "ftp",
    "telnet://": "telnet",
    # NOTE(review): this key has a doubled colon ("mailto::"); the usual
    # mailto prefix is "mailto:". Confirm against the tokenizer and the
    # grammar specification before changing it.
    "mailto::": "mailto",
    # Single-character punctuation.
    "/": "slash",
    "?": "qmark",
    "@": "at",
    ".": "dot",
    ":": "colon",
    "+": "plus",
}
# All ASCII letters, one single-character string per element:
# lowercase a-z first, then uppercase A-Z.
terminals_alpha = tuple(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
)
# The ten decimal digits, one single-character string per element.
terminals_digits = tuple("0123456789")
# Names of the grammar's non-terminals.
#
# The tuple used to list only the first seven names even though parsing_table
# has a row for 28 non-terminals. The original seven keep their positions
# (so index-based access is unchanged); the remaining ones are appended in the
# order their rows appear in parsing_table.
non_terminals = (
    # Original entries, order preserved.
    "url",
    "httpaddress",
    "ftpaddress",
    "telnetaddress",
    "mailtoaddress",
    "hostport",
    "httpaddr_1",
    # Previously missing entries.
    "httpaddr_2",
    "login",
    "login_1",
    "hostport_1",
    "hostname",
    "hostname_1",
    "port",
    "path",
    "path_1",
    "search",
    "search_1",
    "user",
    "password",
    "segment",
    "xalphas",
    "xalphas_1",
    "xalpha",
    "digits",
    "digits_1",
    "alpha",
    "digit",
)
# Right-hand sides of the grammar productions.
#
# The trailing "# N" on each entry is the 1-based rule number that
# parsing_table refers to, so rule N is stored at list index N - 1.
# Symbols within a production are separated by single spaces; "eps" is the
# empty production. The last two entries are not strings but the tuples of
# letter / digit characters themselves.
rules = [
    # url
    "httpaddress",                  # 1
    "ftpaddress",                   # 2
    "telnetaddress",                # 3
    "mailtoaddress",                # 4
    # httpaddress
    "http hostport httpaddr_1",     # 5
    # httpaddr_1
    "slash path httpaddr_2",        # 6
    "qmark search",                 # 7
    "eps",                          # 8
    # httpaddr_2
    "qmark search",                 # 9
    "eps",                          # 10
    # ftpaddress
    "ftp login slash path",         # 11
    # telnetaddress
    "telnet login",                 # 12
    # mailtoaddress
    "mailto xalphas at hostname",   # 13
    # login
    "user login_1",                 # 14
    # login_1
    "colon password at hostport",   # 15
    "at hostport",                  # 16
    # hostport
    "hostname hostport_1",          # 17
    # hostport_1
    "colon port",                   # 18
    "eps",                          # 19
    # hostname
    "xalphas hostname_1",           # 20
    # hostname_1
    "dot hostname",                 # 21
    "eps",                          # 22
    # port
    "digits",                       # 23
    # path
    "segment path_1",               # 24
    # path_1
    "slash path",                   # 25
    "eps",                          # 26
    # search
    "xalphas search_1",             # 27
    # search_1
    "plus search",                  # 28
    "eps",                          # 29
    # user
    "xalphas",                      # 30
    # password
    "xalphas",                      # 31
    # segment
    "xalpha segment",               # 32
    "eps",                          # 33
    # xalphas
    "xalpha xalphas_1",             # 34
    # xalphas_1
    "xalphas",                      # 35
    "eps",                          # 36
    # xalpha
    "alpha",                        # 37
    "digit",                        # 38
    # digits
    "digit digits_1",               # 39
    # digits_1
    "digits",                       # 40
    "eps",                          # 41
    # alpha
    terminals_alpha,                # 42
    # digit
    terminals_digits,               # 43
]
# Since there are so many letters and digits, instead of writing them all out
# manually in the rules, we replaced them with placeholders:
#   terminals_alpha  -> letter
#   terminals_digits -> number
# Predictive parsing table.
#
# Outer key: the non-terminal to expand. Inner key: the lookahead token name.
# Value: the 1-based number of the production to apply.
# "letter" and "number" are the placeholder lookaheads for any alphabetic /
# digit character; "$" is presumably the end-of-input marker (confirm against
# the parser driver).
parsing_table = {
    "url": {"http": 1, "ftp": 2, "telnet": 3, "mailto": 4},
    "httpaddress": {"http": 5},
    "httpaddr_1": {"slash": 6, "qmark": 7, "$": 8},
    "httpaddr_2": {"qmark": 9, "$": 10},
    "ftpaddress": {"ftp": 11},
    "telnetaddress": {"telnet": 12},
    "mailtoaddress": {"mailto": 13},
    "login": {"letter": 14, "number": 14},
    "login_1": {"colon": 15, "at": 16},
    "hostport": {"letter": 17, "number": 17},
    "hostport_1": {"colon": 18, "slash": 19, "qmark": 19, "$": 19},
    "hostname": {"letter": 20, "number": 20},
    "hostname_1": {
        "slash": 22,
        "qmark": 22,
        "colon": 22,
        "dot": 21,
        "$": 22,
    },
    "port": {"number": 23},
    "path": {
        "slash": 24,
        "qmark": 24,
        "letter": 24,
        "number": 24,
        "$": 24,
    },
    "path_1": {"slash": 25, "qmark": 26, "$": 26},
    "search": {"letter": 27, "number": 27},
    "search_1": {"plus": 28, "$": 29},
    "user": {"letter": 30, "number": 30},
    "password": {"letter": 31, "number": 31},
    "segment": {
        "letter": 32,
        "number": 32,
        "slash": 33,
        "qmark": 33,
        "$": 33,
    },
    "xalphas": {"letter": 34, "number": 34},
    "xalphas_1": {
        "slash": 36,
        "qmark": 36,
        "plus": 36,
        "colon": 36,
        "at": 36,
        "dot": 36,
        "$": 36,
        "letter": 35,
        "number": 35,
    },
    "xalpha": {"letter": 37, "number": 38},
    "digits": {"number": 39},
    "digits_1": {"number": 40, "$": 41, "slash": 41, "qmark": 41},
    "alpha": {"letter": 42},
    "digit": {"number": 43},
}
# One zero-initialised integer counter per grammar symbol: the 28
# non-terminals (in parsing_table row order) followed by the 10 token names
# produced by terminals_symbols. Presumably used to number parse-tree nodes
# per symbol - confirm against the tree-building code.
parsing_table_tree_counters = dict.fromkeys(
    (
        # Non-terminals.
        "url",
        "httpaddress",
        "httpaddr_1",
        "httpaddr_2",
        "ftpaddress",
        "telnetaddress",
        "mailtoaddress",
        "login",
        "login_1",
        "hostport",
        "hostport_1",
        "hostname",
        "hostname_1",
        "port",
        "path",
        "path_1",
        "search",
        "search_1",
        "user",
        "password",
        "segment",
        "xalphas",
        "xalphas_1",
        "xalpha",
        "digits",
        "digits_1",
        "alpha",
        "digit",
        # Terminal token names.
        "http",
        "ftp",
        "telnet",
        "mailto",
        "slash",
        "qmark",
        "at",
        "dot",
        "colon",
        "plus",
    ),
    0,
)