# scraper.rb — 180 lines (132 loc), 5.06 KB
# (header reconstructed from file-viewer extraction artifacts: the original
# span here was the scraped filename banner and line-number gutter)
# coding: utf-8
# NOTE(review): the encoding magic comment must appear on the first line of the
# file (or immediately after a shebang) to take effect; it previously sat after
# the first `require`, where Ruby silently ignores it. On Ruby >= 2.0 UTF-8 is
# the default source encoding anyway, so this is belt-and-braces.
require 'scraperwiki'
require 'nokogiri'
require 'open-uri'
require 'date'
require 'yaml'
require 'json'

# Index letters to crawl on the Gartner IT glossary. The full run used
# previously was:
#   ['num', 'a', 'b', 'c', ... 'z']
# It is trimmed to the tail letters here, presumably to resume/limit a run.
glossary = ['x', 'y', 'z']

# Scratch state shared by the scraping loops below: the term currently being
# parsed and its abbreviation (e.g. "Structured Query Language" / "SQL").
cur_word = ''
cur_abbr = ''
# --- Webopedia: crawl the category index, then each category's term list ---
# Index page: http://www.webopedia.com/Top_Category.asp
#
# BUG FIX(review): the active URL previously pointed at http://www.abv.bg —
# an unrelated site, almost certainly a debugging leftover (the intended URL
# was commented out directly below it, and every selector in this section
# targets webopedia's markup). With abv.bg nothing was ever scraped here.
url = 'http://www.webopedia.com/Top_Category.asp'
doc = Nokogiri::HTML(URI.open(url))
doc.search('div[@class="subcat-list"]/div/ul/li').each do |item|
  link = item.at('a')
  # Image-only anchors are decoration, not category links; reset scratch
  # state and move on (matches the original else-branch behaviour).
  if link.inner_html.strip.start_with?('<img border=')
    cur_word = ''
    cur_abbr = ''
    next
  end

  category = link.inner_html.strip
  url_inner = 'http://www.webopedia.com' + link.attributes['href'].value
  # href looks like "/<sub_category>/<category>/..."; segment 1 is the
  # sub-category slug.
  sub_category = link.attributes['href'].value.split('/')[1].strip

  # Fetch the category page; on any fetch/parse failure fall back to the
  # list item itself so the inner .search simply yields no terms.
  doc_inner = begin
    Nokogiri::HTML(URI.open(url_inner))
  rescue StandardError
    item
  end

  doc_inner.search('div[@class="browse-list"]/div/ul/li').each do |term_node|
    term_link = term_node.at('a')
    cat_attrib = begin
      term_link.attributes['class'].value
    rescue StandardError
      ''
    end
    # Anchors carrying class="category" are sub-navigation, not glossary terms.
    next if cat_attrib == 'category'

    term = term_link.inner_html.strip
    url_term = 'http://www.webopedia.com' + term_link.attributes['href'].value
    # href segment 2 is the term slug used as the unique key.
    key = begin
      term_link.attributes['href'].value.split('/')[2].strip
    rescue StandardError
      ''
    end

    # Term-page body text/html was never fetched in the original (the fetch
    # was commented out); keep the columns but store them empty.
    data = {
      sub_category: sub_category,
      category: category,
      url_category: url_inner,
      gloss: term,
      url_gloss: url_term,
      gloss_text: '',
      gloss_html: '',
      key: key
    }
    puts data.to_json
    # Positional args: unique_keys, data, table_name, verbose. The original
    # `unique_keys=[...]` form only created throwaway locals.
    ScraperWiki.save_sqlite(['key', 'category'], data, 'webopedia_glossary', 0)
    ScraperWiki.commit
  end
end
# NOTE(review): this whole section used to be wrapped in `if 1 > 0 then ... end`
# — an always-true guard (apparently a manual on/off switch) — which has been
# removed; behaviour is unchanged.

# --- Gartner IT glossary: one index page per starting letter ---
glossary.each do |letter|
  url = "http://www.gartner.com/it-glossary/#{letter}/"
  doc = Nokogiri::HTML(URI.open(url))
  doc.search('div[@id="main-content"]/ul/li').each do |item|
    anchor = item.at('a')
    text = anchor.inner_html.strip
    # Entries look like "Full Name (ABBR)"; pull the abbreviation out of the
    # parentheses when present.
    if text.split('(').count > 1
      cur_word = text.split('(')[0].strip
      cur_abbr = text.split('(')[1].strip.sub(')', '')
    else
      cur_word = anchor.inner_text.strip
      cur_abbr = ''
    end
    data = {
      word_link: anchor.attributes['href'].value,
      word: cur_word,
      abbr: cur_abbr,
      gloss: anchor.inner_text.strip,
      key: letter
    }
    ScraperWiki.save_sqlite(['key', 'gloss'], data, 'it_glossary', 0)
  end
end

# --- Computer Hope: operating-system jargon index ---
url = 'http://www.computerhope.com/jargon/os.htm'
doc = Nokogiri::HTML(URI.open(url))
doc.search('table[@class="mtable2"]/tr/td/p/a').each do |anchor|
  word = anchor.inner_html.strip
  data = {
    word_link: 'http://www.computerhope.com/jargon/' + anchor.attributes['href'].value,
    word: word,
    abbr: '',
    gloss: word,
    # First letter of the term. The original used `.slice!(0)`, which mutated
    # a throwaway copy of the string; `[0]` returns the same character
    # (nil for an empty string, same as slice!).
    key: word[0]
  }
  ScraperWiki.save_sqlite(['key', 'gloss'], data, 'os_glossary', 0)
end

# --- About.com: database administration glossary ---
url = 'http://databases.about.com/od/administration/a/glossary.htm'
doc = Nokogiri::HTML(URI.open(url))
doc.search('div[@id="articlebody"]/a').each do |anchor|
  text = anchor.inner_html.strip
  # Same "Full Name (ABBR)" convention as the Gartner entries above.
  if text.split('(').count > 1
    cur_word = text.split('(')[0].strip
    cur_abbr = text.split('(')[1].strip.sub(')', '')
  else
    cur_word = text
    cur_abbr = ''
  end
  data = {
    word_link: anchor.attributes['href'].value,
    word: cur_word,
    abbr: cur_abbr,
    gloss: text,
    key: text[0]  # first letter; replaces the mutate-a-copy `.slice!(0)` idiom
  }
  ScraperWiki.save_sqlite(['key', 'gloss'], data, 'database_glossary', 0)
end
# MIT
# http://kb.mit.edu/confluence/labels/listlabels-alphaview.action
# http://kb.mit.edu/confluence/labels/listlabels-alphaview.action