forked from mikenz/twitter-text-php
/
Regex.php
177 lines (160 loc) · 5.5 KB
/
Regex.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
<?php
/**
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter
*/
/**
* Twitter Regex Abstract Class
*
* Used by subclasses that need to parse tweets.
*
* Originally written by {@link http://github.com/mikenz Mike Cochrane}, this
* is based on code by {@link http://github.com/mzsanford Matt Sanford} and
* heavily modified by {@link http://github.com/ngnpope Nick Pope}.
*
* @author Mike Cochrane <mikec@mikenz.geek.nz>
* @author Nick Pope <nick@nickpope.me.uk>
* @copyright Copyright © 2010, Mike Cochrane, Nick Pope
* @license http://www.apache.org/licenses/LICENSE-2.0 Apache License v2.0
* @package Twitter
*/
abstract class Twitter_Regex {

    /**
     * Expression to match at sign characters.
     *
     * NOTE(review): the class lists '@' twice, which is redundant as written.
     * Presumably the second entry was once a different at-sign variant (e.g.
     * U+FF20 FULLWIDTH COMMERCIAL AT) that was lost in transcoding - confirm
     * against upstream before changing.
     *
     * @var string
     */
    const REGEX_AT_SIGNS = '[@@]';

    /**
     * Expression to match characters that may come before a URL.
     *
     * Matches either the start of the subject, a colon, or any single
     * character other than / " ' : ! =
     *
     * @var string
     */
    const REGEX_URL_CHARS_BEFORE = '(?:[^/"\':!=]|^|\\:)';

    /**
     * Expression to match the domain portion of a URL.
     *
     * One or more non-punctuation, non-space characters (each optionally
     * followed by a dot or hyphen), then a dot, a TLD of two or more letters
     * and an optional ":port" suffix.
     *
     * @var string
     */
    const REGEX_URL_DOMAIN = '(?:[^\\p{P}\\s][\\.-]|[^\\p{P}\\s])+\\.[a-z]{2,}(?::[0-9]+)?';

    /**
     * Expression to match characters that may come in the URL path.
     *
     * Three alternatives are accepted:
     *  1. A parenthesised run of path characters, e.g. "(disambiguation)".
     *  2. An "@...../" segment.
     *  3. A single path character, optionally preceded by a period or comma
     *     (so a period/comma is only consumed when more path follows it).
     *
     * @var string
     */
    const REGEX_URL_CHARS_PATH = '(?:(?:\([a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~]+\))|@[^\\/]+\\/|[\\.\\,]?[a-z0-9!\\*\';:=\\+\\$\\/%#\\[\\]\\-_,~])';

    /**
     * Expression to match characters that may come at the end of the URL path.
     *
     * Valid end-of-path characters: letters, digits, '=', '#' and '/'.
     * Closing parentheses are not listed here; balanced parenthesised
     * segments are matched by REGEX_URL_CHARS_PATH instead.
     *
     * @var string
     */
    const REGEX_URL_CHARS_PATH_END = '[a-z0-9=#\\/]';

    /**
     * Expression to match characters that may come in the URL query string.
     *
     * @var string
     */
    const REGEX_URL_CHARS_QUERY = '[a-z0-9!\\*\'\\(\\);:&=\\+\\$\\/%#\\[\\]\\-_\\.,~]';

    /**
     * Expression to match characters that may come at the end of the URL query
     * string.
     *
     * @var string
     */
    const REGEX_URL_CHARS_QUERY_END = '[a-z0-9_&=#]';

    /**
     * Expression to match a username followed by an optional list.
     *
     * Capture groups:
     *  1. The character before the at sign (or empty at start of subject).
     *  2. The at sign.
     *  3. The username (1-20 characters).
     *  4. The optional "/list-name" suffix.
     *
     * The at-sign class intentionally contains no '|': inside a character
     * class a pipe is a literal, so including it would accept "|name" as a
     * mention.
     *
     * @var string
     */
    const REGEX_USERNAME_LIST = '$([^a-z0-9_]|^)([@@])([a-z0-9_]{1,20})(/[a-z][-_a-z0-9\x80-\xFF]{0,24})?$i';

    /**
     * Expression to match a username mentioned anywhere in a tweet.
     *
     * Capture groups: 1 = preceding character (or empty), 2 = username,
     * 3 = the character following the username (captured by the lookahead,
     * not consumed).
     *
     * @var string
     */
    const REGEX_USERNAME_MENTION = '/(^|[^a-zA-Z0-9_])[@@]([a-zA-Z0-9_]{1,20})(?=(.|$))/';

    /**
     * Expression to match a hashtag.
     *
     * Capture groups: 1 = preceding text, 2 = hash sign(s), 3 = tag text.
     *
     * NOTE(review): '#' appears twice in the hash-sign class; presumably one
     * of them was originally a different hash variant (e.g. U+FF03 FULLWIDTH
     * NUMBER SIGN) - confirm against upstream.
     *
     * @var string
     *
     * @todo Match latin characters with accents.
     */
    const REGEX_HASHTAG = '$(^|[^0-9A-Z&/]+)([##]+)([0-9A-Z_]*[A-Z_]+[a-z0-9_üÀ-ÖØ-öø-ÿ]*)$iu';

    /**
     * Expression to match a single whitespace character in UTF-8 encoded text.
     *
     * The expression is written as explicit UTF-8 byte sequences so that it
     * can be embedded in patterns compiled without the "u" modifier. It is a
     * bare alternation: wrap it in a group before applying a quantifier.
     *
     * Code point      UTF-8 bytes          Name
     *  U+0009-U+000D   09-0D                <control-0009>..<control-000D>
     *  U+0020          20                   SPACE
     *  U+0085          C2 85                <control-0085>
     *  U+00A0          C2 A0                NO-BREAK SPACE
     *  U+1680          E1 9A 80             OGHAM SPACE MARK
     *  U+180E          E1 A0 8E             MONGOLIAN VOWEL SEPARATOR
     *  U+2000-U+200A   E2 80 80 - E2 80 8A  EN QUAD..HAIR SPACE
     *  U+2028          E2 80 A8             LINE SEPARATOR
     *  U+2029          E2 80 A9             PARAGRAPH SEPARATOR
     *  U+202F          E2 80 AF             NARROW NO-BREAK SPACE
     *  U+205F          E2 81 9F             MEDIUM MATHEMATICAL SPACE
     *  U+3000          E3 80 80             IDEOGRAPHIC SPACE
     *
     * U+0085 and U+00A0 are matched as their two-byte encodings; matching the
     * bare bytes 85/A0 would also hit the trailing byte of unrelated
     * characters (for example C3 A0) and would miss a real NO-BREAK SPACE.
     *
     * @var string
     */
    const REGEX_WHITESPACE = '[\x09-\x0D\x20]|\xc2[\x85\xa0]|\xe1\x9a\x80|\xe1\xa0\x8e|\xe2\x80[\x80-\x8a\xa8\xa9\xaf]|\xe2\x81\x9f|\xe3\x80\x80';

    /**
     * Contains the complete valid URL pattern string.
     *
     * Built lazily, once, by the constructor and shared by all instances.
     *
     * @var string The regex pattern for a valid URL.
     */
    protected static $REGEX_VALID_URL = null;

    /**
     * Contains the reply username pattern string.
     *
     * Built lazily, once, by the constructor and shared by all instances.
     *
     * @var string The regex pattern for a reply username.
     */
    protected static $REGEX_REPLY_USERNAME = null;

    /**
     * The tweet to be used in parsing. This should be populated by the
     * constructor of all subclasses.
     *
     * @var string
     */
    protected $tweet = '';

    /**
     * Stores the tweet and assembles the shared composite patterns on first
     * use.
     *
     * The two static patterns are only built while they are still null, so
     * constructing further instances does not rebuild them.
     *
     * @param string $tweet The tweet to parse.
     */
    protected function __construct($tweet) {
        if (self::$REGEX_VALID_URL === null) {
            // '$' is the pattern delimiter; the trailing 'i' makes the match
            // case-insensitive.
            // NOTE(review): compiled without the 'u' modifier, so \p{P} in
            // REGEX_URL_DOMAIN is evaluated per byte - confirm this is the
            // intended behaviour for non-ASCII domains.
            self::$REGEX_VALID_URL = '$('                                 # $1 Complete match
                . '(' . self::REGEX_URL_CHARS_BEFORE . ')'                # $2 Preceding character
                . '('                                                     # $3 Complete URL
                . '(https?://|www\\.)'                                    # $4 Protocol (or www)
                . '(' . self::REGEX_URL_DOMAIN . ')'                      # $5 Domain(s) (and port)
                . '(/' . self::REGEX_URL_CHARS_PATH . '*'                 # $6 URL Path
                . self::REGEX_URL_CHARS_PATH_END . '?)?'
                . '(\\?' . self::REGEX_URL_CHARS_QUERY . '*'              # $7 Query String
                . self::REGEX_URL_CHARS_QUERY_END . ')?'
                . ')'
                . ')$i';
        }

        if (self::$REGEX_REPLY_USERNAME === null) {
            // Optional leading whitespace, an at sign, then the username
            // captured as group 2 (group 1 is the last whitespace character).
            self::$REGEX_REPLY_USERNAME = '/^(' . self::REGEX_WHITESPACE . ')*[@@]([a-zA-Z0-9_]{1,20})/';
        }

        $this->tweet = $tweet;
    }
}