# Genève vu par Wikipédia

In [1]:
import requests, json, random, re, pandas as pd, time
from IPython.core.display import display, HTML

In [2]:
def query(request, limit):
    """Yield successive ``query`` payloads from the English Wikipedia API.

    The API is called repeatedly, feeding the ``continue`` values of each
    response into the next call, until either the response carries no
    ``continue`` section or ``limit`` calls have been made.

    Args:
        request: dict of API parameters (for example
            ``{'list': 'search', 'srsearch': '...'}``). It is copied, not
            modified; ``action=query`` and ``format=json`` are added to the
            copy.
        limit: maximum number of HTTP calls to perform.

    Yields:
        The ``'query'`` member of every JSON response that contains one.

    Raises:
        RuntimeError: when a response contains an ``'error'`` member.
    """
    # Work on a private copy so the caller's dict is left untouched.
    base_request = dict(request)
    base_request['action'] = 'query'
    base_request['format'] = 'json'
    last_continue = {'continue': ''}
    counter = 0
    while True:
        counter += 1
        if counter > limit:
            break
        # Start from the base parameters and overlay the continuation values
        # returned by the previous response.
        req = base_request.copy()
        req.update(last_continue)
        # HTTPS endpoint: avoids the redirect from plain HTTP.
        result = requests.get('https://en.wikipedia.org/w/api.php', params=req).json()
        if 'error' in result:
            # The original raised the undefined name `Error`, which surfaced
            # as a NameError and hid the API's own error payload.
            raise RuntimeError(result['error'])
        if 'warnings' in result:
            print(result['warnings'])
        if 'query' in result:
            yield result['query']
        if 'continue' not in result:
            break
        last_continue = result['continue']

In [3]:
# Gather the search hits for "Genève" from at most three API calls.
search_request = {'list':'search','srsearch': 'Genève', 'srwhat':'text'}
result_list = []

for page in query(search_request, 3):
    print('Got a new result')
    result_list += page['search']

Got a new result
Got a new result
Got a new result


In [4]:
# One row per search hit; preview the first five rows.
df = pd.DataFrame(data=result_list)
df.head(n=5)

Unnamed: 0,ns,size,snippet,timestamp,title,wordcount
0,0,100443,"industriels, Archives d'État de <span class=""s...",2016-11-14T05:47:15Z,Geneva,10419
1,0,5033,"&quot;Le Stade de <span class=""searchmatch"">Ge...",2016-11-26T15:11:48Z,Stade de Genève,366
2,0,4930,Sports. Originally founded as the <span clas...,2016-05-29T23:45:52Z,Lions de Genève,220
3,0,8475,"Vernets. 1963 : Creation of <span class=""searc...",2016-10-26T12:07:18Z,Genève-Servette HC,868
4,0,2465,"Banque cantonale de <span class=""searchmatch"">...",2016-10-25T11:12:48Z,Banque cantonale de Genève,157


In [5]:
# Quick check of the "<month name> <year>" formatting on a fixed date.
parsed_sample = time.strptime("2016-11-28", '%Y-%m-%d')
time.strftime('%B %Y', parsed_sample)

'November 2016'

In [6]:
def get_day(timestamp):
    """Return the month name and year of a timestamp, e.g. ``'November 2016'``.

    Only the text before the first ``'T'`` is parsed, and it must have the
    form ``YYYY-MM-DD``. Despite the function's name, the day of the month is
    not part of the result.
    """
    date_part = timestamp.partition('T')[0]
    parsed = time.strptime(date_part, '%Y-%m-%d')
    return time.strftime('%B %Y', parsed)

In [7]:
# Label every hit with the month and year taken from its timestamp.
df['date'] = df['timestamp'].map(get_day)

In [8]:
# Number of search hits per month label, most frequent first.
df['date'].value_counts()

November 2016     8
September 2016    5
October 2016      5
March 2016        2
July 2016         2
April 2016        1
November 2014     1
September 2015    1
June 2015         1
February 2016     1
January 2016      1
August 2013       1
May 2016          1
Name: date, dtype: int64

In [9]:
# Group the article titles by month label: one {'name', 'words'} record per
# month, in the order groupby yields the groups. The title column is read
# directly instead of walking each group row by row with iterrows().
json_list = [
    {'name': month_label, 'words': month_rows['title'].tolist()}
    for month_label, month_rows in df.groupby('date')
]
json_list

[{'name': 'April 2016', 'words': ['École Japonaise Complémentaire de Genève']},
 {'name': 'August 2013', 'words': ['Geneva University of Music']},
 {'name': 'February 2016', 'words': ["Musée d'ethnographie de Genève"]},
 {'name': 'January 2016', 'words': ['Orchestre de chambre de Genève']},
 {'name': 'July 2016',
  'words': ['Genève-Cornavin railway station', 'Trolleybuses in Geneva']},
 {'name': 'June 2015', 'words': ['Gli Angeli Genève']},
 {'name': 'March 2016',
  'words': ['Urania Genève Sport', 'Racing Club Genève']},
 {'name': 'May 2016', 'words': ['Lions de Genève']},
 {'name': 'November 2014', 'words': ['Conservatoire de Musique de Genève']},
 {'name': 'November 2016',
  'words': ['Geneva',
   'Stade de Genève',
   'Universal Genève',
   'Fêtes de Genève',
   'Centre d’Art Contemporain Genève',
   'Servette FC',
   'Collège Calvin',
   'Canton of Geneva']},
 {'name': 'October 2016',
  'words': ['Genève-Servette HC',
   'Banque cantonale de Genève',
   'Paddle steamer Genève',
 

In [10]:
# Save the search hits, including the derived 'date' column, next to the notebook.
df.to_csv('wikipedia_geneva_dates.csv')

In [11]:
data = []  # node records collected by the loop below and dumped to wiki_titles.json

## return an array with the other nodes to connect to
def import_words(word_list):
    """Return the node names for ``word_list``: each item prefixed with ``'Title.'``."""
    return ['Title.' + word for word in word_list]

def import_authors(author_list):
    """Return the node names for ``author_list``: each item prefixed with ``'Author.'``."""
    return ['Author.' + author for author in author_list]

## create the node data
def generate_word(word, deputee):
    """Build the node record for one title.

    Args:
        word: the title; the node is named ``'Title.' + word``.
        deputee: iterable of author names this title links to (the caller
            in this notebook passes a one-element list).

    Returns:
        A dict with ``name``, ``size`` (always 0) and ``imports`` (the
        ``'Author.'``-prefixed names from ``import_authors``).
    """
    # The original also built an unused `timports` list here, which called
    # import_authors a second time for nothing.
    return {
        "name": "Title." + word,
        "size": 0,
        "imports": import_authors(deputee)
    }

def generate_author(deputee, words):
    """Build the node record for one author.

    The node is named ``'Author.' + deputee``, has ``size`` 0, and its
    ``imports`` are the ``'Title.'``-prefixed names of ``words``.
    """
    linked_titles = import_words(words)
    node = {"name": "Author." + deputee, "size": 0}
    node["imports"] = linked_titles
    return node

# For every month record, add one node per title (each linked to that month),
# followed by the month's own node linked to all of its titles.
for entry in json_list:
    group_name = entry['name']
    group_titles = entry['words']

    data.extend(generate_word(title, [group_name]) for title in group_titles)
    data.append(generate_author(group_name, group_titles))

## export
with open('wiki_titles.json', 'w') as json_file:
    json.dump(data, json_file, indent=4, separators=(',', ': '))

In [12]:
# Embed the visualisation page served from the rospo.local host.
iframe_markup = '<iframe src="http://rospo.local/~paul/Wiki_dataviz/dataviz.html" width="900" height="900"></iframe>'
display(HTML(iframe_markup))

In [14]:
# Shell escape: run `open` on the current directory.
# NOTE(review): `open` is presumably the macOS command; confirm before running elsewhere.
!open .