-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape_original.php
128 lines (107 loc) · 4.72 KB
/
scrape_original.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
<?php
/**
 * Reduce a text fragment (possibly containing HTML) to printable ASCII.
 *
 * Processing order:
 *   1. Replace common accented Latin letters (including macron vowels),
 *      typographic quotes, the en dash and the non-breaking space with ASCII
 *      look-alikes.
 *   2. Apply PHP's HTML entity table in reverse (entity => character),
 *      extended with a set of punctuation symbols that are mapped to their
 *      single-byte Windows-1252 codes (removed again in step 5) and the euro
 *      sign, which is spelled out as "euro".
 *   3. Strip HTML tags.
 *   4. Decode any entities that are still present.
 *   5. Turn line breaks into spaces, drop tabs, then delete every byte that
 *      is not printable ASCII (0x20-0x7E).
 *
 * NOTE: transliteration (step 1) runs before entity decoding (steps 2 and 4),
 * so an accented letter that arrives as an entity is decoded too late to be
 * transliterated and is removed by step 5.
 *
 * @param string $text Input text, expected to be UTF-8.
 * @return string Printable-ASCII version of the input.
 */
function cleanString($text) {
    // 1) convert á ô => a o
    $asciiFor = array(
        '/[áàâãªäā]/u' => 'a',
        '/[ÁÀÂÃÄĀ]/u'  => 'A',
        '/[ÍÌÎÏĪ]/u'   => 'I',
        '/[íìîïī]/u'   => 'i',
        '/[éèêëē]/u'   => 'e',
        '/[ÉÈÊËĒ]/u'   => 'E',
        '/[óòôõºöō]/u' => 'o',
        '/[ÓÒÔÕÖŌ]/u'  => 'O',
        '/[úùûüū]/u'   => 'u',
        '/[ÚÙÛÜŪ]/u'   => 'U',
        '/[’‘‹›‚]/u'   => "'",
        '/[“”«»„]/u'   => '"',
    );
    $transliterated = preg_replace(array_keys($asciiFor), array_values($asciiFor), $text);
    // With the /u modifier preg_replace() returns null when the subject is not
    // valid UTF-8. Keep the untouched input in that case instead of losing the
    // whole string; step 5 still removes the non-ASCII bytes.
    if ($transliterated !== null) {
        $text = $transliterated;
    }
    $text = str_replace(
        // "\xC2\xA0" is the UTF-8 encoding of the non-breaking space.
        array("–", "\xC2\xA0", "ç", "Ç", "ñ", "Ñ"),
        array("-", " ", "c", "C", "n", "N"),
        $text
    );

    // 2) Translation CP1252: each symbol below is replaced by the single byte
    //    with the given code; named entities become their character.
    $trans = get_html_translation_table(HTML_ENTITIES);
    $cp1252 = array(
        130 => '‚', // Single Low-9 Quotation Mark
        131 => 'ƒ', // Latin Small Letter F With Hook
        132 => '„', // Double Low-9 Quotation Mark
        133 => '…', // Horizontal Ellipsis
        134 => '†', // Dagger
        135 => '‡', // Double Dagger
        136 => 'ˆ', // Modifier Letter Circumflex Accent
        137 => '‰', // Per Mille Sign
        138 => 'Š', // Latin Capital Letter S With Caron
        139 => '‹', // Single Left-Pointing Angle Quotation Mark
        140 => 'Œ', // Latin Capital Ligature OE
        145 => '‘', // Left Single Quotation Mark
        146 => '’', // Right Single Quotation Mark
        147 => '“', // Left Double Quotation Mark
        148 => '”', // Right Double Quotation Mark
        149 => '•', // Bullet
        150 => '–', // En Dash
        151 => '—', // Em Dash
        152 => '˜', // Small Tilde
        153 => '™', // Trade Mark Sign
        154 => 'š', // Latin Small Letter S With Caron
        155 => '›', // Single Right-Pointing Angle Quotation Mark
        156 => 'œ', // Latin Small Ligature OE
        159 => 'Ÿ', // Latin Capital Letter Y With Diaeresis
    );
    foreach ($cp1252 as $code => $symbol) {
        $trans[chr($code)] = $symbol;
    }
    $trans['euro'] = '€'; // euro currency symbol
    // Replacements are applied one after another in key order, so the sort
    // fixes the order in which entities are decoded.
    ksort($trans);
    $text = str_replace(array_values($trans), array_keys($trans), $text);

    // 3) remove <p>, <br/> ...
    $text = strip_tags($text);

    // 4) decode entities that survived step 2
    $text = html_entity_decode($text);

    // 5) Normalise whitespace BEFORE stripping: the strip below also removes
    //    control characters, so doing it first would glue words together.
    //    Double quotes are required for the escapes to mean real CR/LF/TAB.
    $text = str_replace(
        array("\r\n", "\n", "\r", "\t"),
        array(" ", " ", " ", ""),
        $text
    );
    // Remove everything outside printable ASCII (control bytes, DEL and all
    // bytes >= 0x80, including the Windows-1252 bytes produced in step 2).
    $text = preg_replace('/[^\x20-\x7E]+/', '', $text);

    return $text;
}
/**
 * Download the verbix.com page for a verb and collect the text of every
 * <span> found in its first two layout tables.
 *
 * The tables are selected by their attributes (cellpadding, cellspacing and
 * border all "0", width "100%", height not "48"). The first match is treated
 * as the active voice and the second as the passive voice. Spans are returned
 * in document order, active first; a span with class "notused" contributes an
 * empty string, every other span is passed through cleanString().
 *
 * Side effect: sets the process locale (LC_ALL) to "en_US.utf8".
 *
 * @param string $firstForm Verb form to look up. It is URL-encoded here, so
 *                          pass it raw (not pre-encoded).
 * @return array|false List of cleaned strings, or false when the page cannot
 *                     be fetched or parsed, reports that the verb does not
 *                     exist, or does not contain the two expected tables.
 */
function scrape($firstForm) {
    setlocale(LC_ALL, "en_US.utf8");

    // Encode the verb so spaces, '&', '#' or non-ASCII letters cannot break
    // or alter the query string.
    $url = 'http://www.verbix.com/webverbix/go.php?T1=' . urlencode($firstForm)
        . '&imageField.x=0&imageField.y=0&D1=9&H1=109';

    $output = file_get_contents($url);
    if ($output === false || trim($output) === '') {
        // Network/HTTP failure or empty body: nothing to parse.
        return false;
    }

    // Collect libxml parse warnings internally instead of silencing every
    // error with '@'; restore the caller's setting afterwards.
    $doc = new DOMDocument();
    $previousSetting = libxml_use_internal_errors(true);
    $loaded = $doc->loadHTML($output);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);
    if (!$loaded) {
        return false;
    }

    // The site answers unknown verbs with a page containing this phrase.
    if (stripos($doc->saveHTML(), 'does not exist') !== false) {
        return false;
    }

    // Keep only the tables whose attributes match the conjugation layout.
    $matches = array();
    foreach ($doc->getElementsByTagName("table") as $table) {
        $looksLikeVoiceTable =
            $table->getAttribute("cellpadding") === "0"
            && $table->getAttribute("cellspacing") === "0"
            && $table->getAttribute("border") === "0"
            && $table->getAttribute("width") === "100%"
            && $table->getAttribute("height") !== "48";
        if ($looksLikeVoiceTable) {
            $matches[] = $table;
        }
    }

    // Both voices are required; bail out instead of dereferencing a missing
    // table when the page layout is not the expected one.
    if (count($matches) < 2) {
        return false;
    }

    $extracted = array();
    foreach (array($matches[0], $matches[1]) as $voiceTable) {
        foreach ($voiceTable->getElementsByTagName("tr") as $row) {
            foreach ($row->getElementsByTagName("span") as $span) {
                if ($span->getAttribute("class") === "notused") {
                    // Placeholder keeps positions aligned for unused forms.
                    $extracted[] = "";
                } else {
                    $extracted[] = cleanString($span->nodeValue);
                }
            }
        }
    }

    return $extracted;
}
?>