In [104]:
import requests
import json
import wikipedia as wk
import csv

import ast
import numpy as np

In [121]:

def get_revisions_by_pageid(pageid, title=None, lat=None, lon=None):
    """Fetch the revision history of one English Wikipedia page.

    Queries the MediaWiki ``revisions`` API for ``pageid`` and keeps
    requesting further batches for as long as the response carries a
    ``continue`` token, so histories longer than one batch (``rvlimit`` =
    500) are returned in full instead of being cut off after the first
    response.

    Args:
        pageid: Page id sent as the ``pageids`` parameter; also used (as a
            string) to look up the page entry in the response.
        title: Value stored under ``'title'`` on every returned revision.
        lat: Value stored under ``'lat'`` on every returned revision.
        lon: Value stored under ``'lon'`` on every returned revision.

    Returns:
        A list of revision dicts with the fields selected by ``rvprop``
        (``timestamp|ids|comment``), each extended with ``title``,
        ``pageid``, ``lat`` and ``lon``. The list is empty when the page
        entry has no ``revisions`` key.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        KeyError: If the response has no entry for ``pageid`` (for example
            an API-level error payload).
    """
    headers = {'content-type': 'text/mediawiki-api-prettyprint-wrapped; charset=utf-8'}
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'prop': 'revisions',
        'pageids': pageid,
        'rvprop': 'timestamp|ids|comment',
        'rvlimit': 500
    }

    revs = []
    while True:
        # The timeout keeps one stalled connection from hanging the crawl.
        req = requests.post(url, params=params, headers=headers, timeout=30)
        req.raise_for_status()
        payload = req.json()

        page = payload['query']['pages'][str(pageid)]
        # A page entry without revisions simply contributes nothing.
        revs.extend(page.get('revisions', []))

        continuation = payload.get('continue')
        if not continuation:
            break
        # Merge the continuation values into the next request, as the API
        # expects, to receive the following batch.
        params.update(continuation)

    for rev in revs:
        rev['title'] = title
        rev['pageid'] = pageid
        rev['lat'] = lat
        rev['lon'] = lon

    return revs

def get_revisions_by_page(page):
    """Return the annotated revisions for one page record.

    ``page`` is a mapping with ``pageid``, ``title``, ``lat`` and ``lon``
    keys; those four values are forwarded to ``get_revisions_by_pageid``
    and its result is returned unchanged.
    """
    return get_revisions_by_pageid(
        page['pageid'],
        title=page['title'],
        lat=page['lat'],
        lon=page['lon']
    )

def to_csv(_json, file_name):
    """Write revision records to ``file_name`` as a UTF-8 encoded CSV file.

    The first row is the header ``pageid, title, lat, lon, timestamp,
    revid``; every record then contributes one row with those fields in
    that order. Any other keys on a record are ignored. A progress line is
    printed for every hundredth record, starting with record 0.

    The file is closed before the function returns, so all rows are
    flushed to disk. The function works on both Python 2 (byte-oriented
    ``csv``) and Python 3 (text-oriented ``csv``).

    Args:
        _json: Iterable of mappings that contain at least the header keys.
        file_name: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a record lacks one of the header keys.
    """
    import sys  # local import: only needed to pick the right file mode

    _headers = ["pageid", "title", "lat", "lon", "timestamp", "revid"]

    if sys.version_info[0] == 2:
        # Python 2's csv module writes byte strings: open in binary mode and
        # encode unicode cells by hand. Byte strings are passed through
        # untouched so non-ASCII bytes are not implicitly decoded as ASCII.
        out = open(file_name, "wb")

        def to_cell(value):
            if isinstance(value, unicode):  # noqa: F821 - Python 2 only
                return value.encode('utf-8')
            return value
    else:
        # Python 3's csv module writes text; newline="" lets the writer
        # control line endings itself.
        out = open(file_name, "w", newline="", encoding="utf-8")

        def to_cell(value):
            return value

    with out:
        writer = csv.writer(out)
        writer.writerow(_headers)

        for i, record in enumerate(_json):
            row = [to_cell(record[h]) for h in _headers]

            if i % 100 == 0:
                print("Progress: %s" % i)
            writer.writerow(row)

def get_pages_in_radius(lat, lon, radius=2500):
    """List English Wikipedia pages geotagged around a coordinate.

    Sends one MediaWiki ``geosearch`` query centred on ``lat``/``lon``.

    Args:
        lat: Latitude of the search centre.
        lon: Longitude of the search centre.
        radius: Value sent as ``gsradius`` (presumably meters, per the
            MediaWiki geosearch API - confirm against its documentation).

    Returns:
        A list of dicts with exactly the keys ``pageid``, ``lat``, ``lon``
        and ``title``, one per search hit.

    Raises:
        requests.HTTPError: If the server answers with an HTTP error status.
        KeyError: If the response lacks ``query``/``geosearch`` (for example
            an API-level error payload).
    """
    headers = {'content-type': 'text/mediawiki-api-prettyprint-wrapped; charset=utf-8'}
    url = 'https://en.wikipedia.org/w/api.php'
    params = {
        'action': 'query',
        'format': 'json',
        'list': 'geosearch',
        'gscoord': str(lat) + "|" + str(lon),
        'gsradius': radius,
        # NOTE(review): the API may cap this limit server-side, in which
        # case dense areas return fewer hits than requested - confirm.
        'gslimit': 10000
    }

    # The timeout keeps one stalled connection from hanging the crawl.
    req = requests.post(url, params=params, headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of parsing an error body as data.
    req.raise_for_status()

    hits = req.json()['query']['geosearch']

    return [
        {
            'pageid': hit['pageid'],
            'lat': hit['lat'],
            'lon': hit['lon'],
            'title': hit['title'],
        } for hit in hits
    ]


def get_pages_in_area(lat1=50.27, long1=30.30, lat2=50.57, long2=30.81,
                      step=0.04, radius=2500):
    """Collect the unique pages found by grid-sampling a bounding box.

    Walks a regular grid of search centres over the box
    ``[lat1, lat2) x [long1, long2)`` and calls ``get_pages_in_radius`` at
    each centre, keeping the first record seen for every ``pageid``.
    Progress is printed roughly every 100 hits as
    ``<unique pages> / <total hits>``.

    The defaults reproduce the box, grid step and radius that were
    previously hard-coded, so calling the function without arguments
    behaves as before.

    Args:
        lat1: Lower latitude bound (inclusive start of the grid).
        long1: Lower longitude bound (inclusive start of the grid).
        lat2: Upper latitude bound (exclusive, as with ``np.arange``).
        long2: Upper longitude bound (exclusive, as with ``np.arange``).
        step: Grid spacing, applied to both latitude and longitude.
        radius: Search radius forwarded to ``get_pages_in_radius``.

    Returns:
        A list of page dicts, one per distinct ``pageid``.
    """
    main_set = {}

    count = 0
    new_count = 0
    for lat in np.arange(lat1, lat2, step):
        for lon in np.arange(long1, long2, step):
            for page in get_pages_in_radius(lat, lon, radius):
                if count != 0 and count % 100 == 0:
                    print("Area search: %s / %s" % (new_count, count))

                # Neighbouring search circles overlap, so the same page can
                # come back several times; keep only its first occurrence.
                if page['pageid'] not in main_set:
                    main_set[page['pageid']] = page
                    new_count += 1

                count += 1

    # Materialise the result so callers get a real list on Python 3 as well
    # (dict.values() is only a view there).
    found = list(main_set.values())
    print("Area pages found: %s" % len(found))
    return found

In [None]:
# Discover every page in the search area, then gather the revision history
# of each page into one flat list.
pages = get_pages_in_area()

print("Pages found: %s" % len(pages))
revs = []

for page_no, page in enumerate(pages):
    # Report progress on every tenth page, skipping the very first one.
    if page_no != 0 and page_no % 10 == 0:
        print("Progress revisions: %s" % page_no)
    revs.extend(get_revisions_by_page(page))

print('Finish! Total revisions: %s' % len(revs))

Area search: 68 / 100
Area search: 153 / 200
Area search: 201 / 300
Area search: 272 / 400

In [103]:
# Write the revisions collected in `revs` to results_v3.csv.
# NOTE(review): this cell's execution count (103) is lower than that of the
# cells defining `to_csv` and `revs` - confirm the notebook still works when
# run top to bottom on a fresh kernel.
to_csv(revs, 'results_v3.csv')

Progress: 0
Progress: 100
Progress: 200
Progress: 300
Progress: 400
Progress: 500
Progress: 600
Progress: 700
Progress: 800
Progress: 900
Progress: 1000
Progress: 1100
Progress: 1200
Progress: 1300
Progress: 1400
