-
Notifications
You must be signed in to change notification settings - Fork 0
/
clean.py
executable file
·74 lines (64 loc) · 2.01 KB
/
clean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#!/usr/bin/env python3
import re
# Entries that must never end up in the generated hosts file.  Each item is
# written exactly as a fully normalized line looks ("0.0.0.0 <domain>\n"),
# because membership is tested against complete normalized lines.
whitelist = (
    "0.0.0.0 s.youtube.com\n",
    "0.0.0.0 s.click.aliexpress.com\n",
    "0.0.0.0 click.linksynergy.com\n",
    "0.0.0.0 www.googleadservices.com\n",
    "0.0.0.0 googleadservices.com\n",
    "0.0.0.0 geolocation.onetrust.com\n",  # needed by the CBS News video player
    "0.0.0.0 bnc.lt\n",  # Airbnb links
    "0.0.0.0 akamai.net\n",  # CDN
    "0.0.0.0 intel.com\n",
)

# Accumulates every normalized entry gathered from the input files.
hosts = []
def normalize(string):
    """Return a hosts-file line in canonical form, or None if it is dropped.

    Everything from the first "#" onwards is discarded, each run of
    whitespace (tabs included) is collapsed to a single space, and leading
    and trailing whitespace is removed.  "127.0.0.1" is rewritten to
    "0.0.0.0" unless the remaining entry mentions "localhost".

    Args:
        string: One raw line, with or without its trailing newline.

    Returns:
        The cleaned line terminated by exactly one newline, or None when the
        line is blank, comment-only, a lone "0.0.0.0", or present in the
        module-level ``whitelist``.
    """
    # Keep only the part before an inline (or whole-line) comment.
    content = string.partition("#")[0]

    # split()/join collapses tabs and repeated spaces and also trims both
    # ends, so indented entries de-duplicate and match the whitelist.
    content = " ".join(content.split())

    # Decide on the comment-free entry: a comment that merely mentions
    # "localhost" must not prevent the rewrite.
    if "localhost" not in content:
        content = content.replace("127.0.0.1", "0.0.0.0")

    # Nothing left, or an address without any host name: not a valid entry.
    if content in ("", "0.0.0.0"):
        return None

    line = content + "\n"
    if line in whitelist:
        return None
    return line
def _collect(path, prefix=""):
    """Append every usable entry found in *path* to the module-level ``hosts``.

    Args:
        path: File to read, one candidate entry per line.
        prefix: Text put in front of each normalized line; used for lists
            that contain bare domain names without an address.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    with open(path, "r") as f:
        for row in f:
            # Normalize once and reuse the result.
            entry = normalize(row)
            if not entry:
                continue
            entry = prefix + entry
            # Re-check after prefixing: for bare-domain lists normalize() only
            # sees "domain\n", which can never equal a "0.0.0.0 domain\n"
            # whitelist item.
            if entry not in whitelist:
                hosts.append(entry)


# Full hosts-format list, then the bare-domain list, then optional local additions.
_collect("raw-hosts.txt")
_collect("raw-simple.txt", prefix="0.0.0.0 ")
try:
    _collect("custom-hosts.txt")
except FileNotFoundError:
    # The custom list is optional.
    pass

# De-duplicate and write in reverse lexicographic order.
hosts = sorted(set(hosts), reverse=True)

# Count the lines of the previous output for the summary; on a first run there
# is no previous file, which simply means zero earlier records.
prev_count = 0
try:
    with open("hosts", "r") as f:
        prev_count = sum(1 for _ in f)
except FileNotFoundError:
    pass

with open("hosts", "w") as f:
    # Every entry already ends with a newline.
    f.writelines(hosts)

print("Done, total records:", len(hosts), "previously:", prev_count)