/
url.go
89 lines (77 loc) · 1.85 KB
/
url.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
package urls
import (
"github.com/ocmdev/rita/config"
"gopkg.in/mgo.v2"
"gopkg.in/mgo.v2/bson"
)
// GetUrlCollectionScript assembles the pieces used to build the urls
// collection from the HTTP collection named in sysCfg.
//
// It returns, in order:
//   - the source collection name (sysCfg.StructureConfig.HttpTable),
//   - the new collection name (sysCfg.UrlsConfig.UrlsTable),
//   - the index keys desired on the new collection,
//   - a map-reduce job whose output replaces the new collection, and
//   - an aggregation pipeline that lifts the fields nested under "value"
//     to the top level and ends in a $out stage naming the new collection.
func GetUrlCollectionScript(sysCfg *config.SystemConfig) (string, string, []string, mgo.MapReduce, []bson.D) {
	// Collection that is read from, and collection that is (re)created.
	srcName := sysCfg.StructureConfig.HttpTable
	dstName := sysCfg.UrlsConfig.UrlsTable

	// Desired indexes on the new collection.
	indexKeys := []string{"$hashed:url", "-length"}

	// The map step emits one document per source record, keyed by the
	// record's _id, carrying host, uri, uid, the id_resp_h address (as "ip")
	// and the combined host+uri length. The JavaScript text is kept verbatim.
	mapFn := `function(){
var result = {
host: this.host,
uri: this.uri,
uid: this.uid,
ip: this.id_resp_h,
length: new NumberLong(this.host.length+this.uri.length)
};
emit(this._id, result);
}`

	job := mgo.MapReduce{
		Map:    mapFn,
		Reduce: "function(key, values){return values}",
		Out:    bson.M{"replace": dstName},
	}

	// Map-reduce output nests the emitted document under "value"; this stage
	// flattens it. Note that the "url" field is populated from value.host.
	flattenStage := bson.D{
		{Name: "$project", Value: bson.D{
			{Name: "_id", Value: 1},
			{Name: "url", Value: "$value.host"},
			{Name: "uri", Value: "$value.uri"},
			{Name: "ip", Value: "$value.ip"},
			{Name: "length", Value: "$value.length"},
			{Name: "uid", Value: "$value.uid"},
		}},
	}
	outStage := bson.D{
		{Name: "$out", Value: dstName},
	}

	return srcName, dstName, indexKeys, job, []bson.D{flattenStage, outStage}
}
// GetHostnamesAggregationScript assembles the pieces used to build the
// hostnames collection from the urls collection named in sysCfg.
//
// It returns, in order:
//   - the source collection name (sysCfg.UrlsConfig.UrlsTable),
//   - the new collection name (sysCfg.UrlsConfig.HostnamesTable),
//   - the index keys desired on the new collection, and
//   - an aggregation pipeline that groups documents by their "url" field,
//     collects the distinct "ip" values of each group into "ips", renames
//     the group key to "host", and ends in a $out stage naming the new
//     collection.
func GetHostnamesAggregationScript(sysCfg *config.SystemConfig) (string, string, []string, []bson.D) {
	srcName := sysCfg.UrlsConfig.UrlsTable
	dstName := sysCfg.UrlsConfig.HostnamesTable

	// Desired indexes on the new collection.
	indexKeys := []string{"$hashed:host"}

	// Keep only the two fields the grouping needs.
	selectStage := bson.D{
		{Name: "$project", Value: bson.D{
			{Name: "_id", Value: 0},
			{Name: "url", Value: 1},
			{Name: "ip", Value: 1},
		}},
	}

	// One output document per distinct url, with the set of ips seen for it.
	groupStage := bson.D{
		{Name: "$group", Value: bson.D{
			{Name: "_id", Value: "$url"},
			{Name: "ips", Value: bson.D{
				{Name: "$addToSet", Value: "$ip"},
			}},
		}},
	}

	// Expose the group key as "host" and drop _id.
	renameStage := bson.D{
		{Name: "$project", Value: bson.D{
			{Name: "_id", Value: 0},
			{Name: "host", Value: "$_id"},
			{Name: "ips", Value: 1},
		}},
	}

	outStage := bson.D{
		{Name: "$out", Value: dstName},
	}

	stages := []bson.D{selectStage, groupStage, renameStage, outStage}
	return srcName, dstName, indexKeys, stages
}