/
not_transcluded.py
145 lines (117 loc) · 4.52 KB
/
not_transcluded.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# -*- coding: utf-8 -*-
#
# @file not_transcluded.py
#
# @remark Copyright 2016 Philippe Elie
# @remark Read the file COPYING
#
# @author Philippe Elie
import sys
import os
sys.path.append(os.path.expanduser('~/wikisource'))
from ws_category import domain_urls as urls
from ws_namespaces import index as index_name
from common import db
from gen_stats import all_domain
from common import common_html
import urllib
def filter_result(books, cursor=None):
    """Keep the indexes whose candidate pages show no recent non-bot edit.

    An index is considered only when at least five page ids were collected
    for it.  It is kept when the recentchanges table holds no row with
    rc_bot=0 for any of those page ids; otherwise its title is reported on
    stdout as filtered.

    FIXME (original author): activity is checked only on the pages that are
    not transcluded, so somebody working on pages already transcluded, or on
    red pages without validating them, goes unnoticed.  Another way would be
    to get the page id of the index and use "related changes" filtered to
    the Page: namespace.

    @param books   dict mapping an index title to the list of page ids
                   collected for that index.
    @param cursor  DB-API cursor used for the recentchanges query.  When
                   omitted, the module-level name `cursor` is used, which is
                   what this function always relied on when the file is run
                   as a script.
    @return        list of (page count, index title) tuples sorted in
                   descending order.
    """
    result = []
    for key, page_ids in books.items():
        # FIXME: is >= 5 ok ?
        if len(page_ids) < 5:
            continue

        if cursor is None:
            # Resolved lazily so that inputs which never need the database
            # behave as before even when no global cursor exists.
            cursor = globals()['cursor']

        # Only the placeholders are interpolated into the SQL text; the page
        # ids themselves are passed as bound parameters.
        fmt_strs = ','.join(['%s'] * len(page_ids))
        cursor.execute("""SELECT count(*)
                          FROM recentchanges
                          WHERE rc_bot=0 AND rc_cur_id IN (%s)
                       """ % fmt_strs,
                       page_ids)
        if cursor.fetchone()[0]:
            print("filtered: %s" % key)
        else:
            result.append((len(page_ids), key))

    result.sort(reverse=True)
    return result
def format_html_line(domain, bookname, count):
    """Build the <li> element describing one index.

    The element holds a link to the index page on the wiki, followed by the
    page count, and a "Check pages" link to the checker tool.

    @param domain    wikisource domain code; 'old' is rewritten to 'mul'
                     before the host name and the database name are built.
    @param bookname  index title without its namespace prefix.
    @param count     number of pages reported for this index.
    @return          the HTML fragment as a string.
    """
    # The namespace prefix is looked up with the domain as given, before the
    # 'old' -> 'mul' rewrite below.
    title = index_name['wikisource'][domain] + ':' + bookname
    if domain == 'old':
        domain = 'mul'

    # Percent-encode once and use it in both links: a raw title containing
    # '&', '+', '#' or '"' would otherwise corrupt the checker query string
    # or the surrounding HTML attribute.
    quoted_title = urllib.quote(title)

    fmt = '<a href="//%s.wikisource.org/wiki/%s">%s</a> %d'
    wiki_link = fmt % (domain, quoted_title, bookname, count)

    # checker redirect with a 301 from checker? to checker/? so use
    # directly that url even if it's a bit weird
    fmt = ' — <a href="/checker/?db=%s&title=%s">Check pages</a>'
    check_link = fmt % (db.database_name(domain, 'wikisource'), quoted_title)

    return '<li>' + wiki_link + check_link + '</li>'
def not_transcluded(domain, cursor):
    """Write the HTML report of indexes having pages not transcluded.

    Runs the query below, groups the returned page ids by index title (the
    part of the page title before the first '/'), filters the groups through
    filter_result() and writes ~/tmp/transclusions/<domain>.html.

    @param domain  wikisource domain code, used as key into `urls` and in
                   the output file name.
    @param cursor  DB-API cursor on the database of that domain.
    @return        number of indexes listed in the report.
    """
    # set of Page: in cat 3/4 not transcluded from main
    query = """
SELECT page_title, page_id FROM categorylinks LEFT JOIN page ON page_id=cl_from
WHERE cl_to in (%s, %s) AND page_title NOT IN
(SELECT tl_title FROM templatelinks
WHERE tl_namespace=%s AND tl_from_namespace=0);
"""
    ns = urls[domain][0]
    cat3 = urls[domain][1]
    cat4 = urls[domain][2]
    cursor.execute(query, [cat3, cat4, ns])
    print(cursor.rowcount)

    # A tuple passed to endswith() tests every suffix whatever its length; a
    # fixed five-character slice could only ever match '.djvu'.
    index_suffixes = ('.djvu', '.pdf', '.tif')
    pages_by_index = {}
    for title, page_id in cursor.fetchall():
        title = title.split('/')[0]
        if title.endswith(index_suffixes):
            pages_by_index.setdefault(title, []).append(page_id)

    result = filter_result(pages_by_index)

    out_file = os.path.expanduser('~/tmp/transclusions/%s.html' % domain)
    if os.path.exists(out_file):
        os.remove(out_file)

    title = '%s.wikisource.org not transcluded page' % domain
    head = common_html.get_head(title, html5 = True).encode('utf-8')

    # The with statement closes the file even if formatting a line fails.
    with open(out_file, 'w') as out_fd:
        out_fd.write(head + '\n')
        out_fd.write('<body>\n')
        if result:
            out_fd.write('<ol>\n')
            for count, bookname in result:
                out_fd.write(format_html_line(domain, bookname, count) + '\n')
            out_fd.write('</ol>\n')
        else:
            # This message used to be a bare string expression and was
            # therefore never written to the report.
            out_fd.write("Empty result, no Index meet the criteria to be listed in this file.\n")
        out_fd.write('\n</body>\n</html>\n')

    return len(result)
if __name__ == "__main__":
    # Generate the report of every domain and print the total number of
    # indexes listed.
    tot_count = 0
    for domain in all_domain:
        print(domain)
        #if domain != 'fr':
        #    continue
        conn = db.create_conn(domain = domain, family = 'wikisource')
        try:
            # Keep the name `cursor` at module level: filter_result() can
            # read it from the module globals.
            cursor = db.use_db(conn, domain = domain, family = 'wikisource')
            try:
                tot_count += not_transcluded(domain, cursor)
            finally:
                # Release the cursor even when the report generation fails.
                cursor.close()
        finally:
            conn.close()
    print("total: %s" % tot_count)