<a href="https://colab.research.google.com/github/mattiapocci/PhilosopherRank/blob/master/scrapingWikiList.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# ***Imports***

In [None]:
import urllib.request
import requests
import json
from bs4 import BeautifulSoup as bs
import pandas as pd
# Confirmation message shown once all of the imports above have succeeded
print('imports successful')
# Base address of English Wikipedia; the scraping cells below prepend it to the
# href values they extract in order to build full page URLs
starturl = 'https://en.wikipedia.org'

# ***Clustering***

In [None]:
# Define a cluster for every philosophy, so that the clusters can be taken into account when analysing the influences
import pprint
# Wikipedia page listing the philosophies; the links found in its paragraphs
# are followed further down in this cell
url = 'https://en.wikipedia.org/wiki/List_of_philosophies'
# Parse inside a `with` block so the HTTP response is closed as soon as the
# document has been read instead of staying open for the rest of the session
with urllib.request.urlopen(url) as page:
    soup = bs(page, "lxml")

# return a list of titles for a given page
def page_scrape(address):
    """Return the ``title`` attribute of every titled link on a page.

    Args:
        address: URL of the page to download and parse.

    Returns:
        A list holding the ``title`` value of each ``<a>`` element that has
        one. Anchors without a ``title`` attribute are skipped.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    # The context manager closes the HTTP response once the page is parsed,
    # so no connection is left open for each scraped page.
    with urllib.request.urlopen(address) as page:
        soup = bs(page, "lxml")
    titles = (a.get('title') for a in soup.find_all('a'))
    return [title for title in titles if title is not None]

# Maps the title of each philosophy link to the list of link titles found on
# that philosophy's own page
raw_clusters = {}
p_list = soup.find_all('p')
# for every philosophy
for p in p_list:
    for a in p.find_all('a'):
        title = a.get('title')
        link = a.get('href')
        # Validate the anchor BEFORE scraping: otherwise an untitled anchor
        # would be downloaded and stored under the key None (serialised as
        # "null" in the JSON output) even though it is reported as invalid.
        if title is None:
            print(a)
            print('anchor has no title')
            continue
        if link is None:
            print(a)
            print('anchor has no href')
            continue
        try:
            # get a list of titles
            raw_clusters[title] = page_scrape(starturl + link)
        except Exception as err:
            # Best effort: one unreachable page must not abort the whole
            # crawl, but report the real cause instead of blaming the title.
            # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt
            # working during the long crawl.
            print(a)
            print('could not scrape page: ' + str(err))
            continue
        print('Built element with title ' + title)
# at this point raw_clusters will contain a list of titles for every philosophy (for example raw_clusters['stoicism'] will contain [stoic_phil1, stoic_phil2, ...])
pprint.pprint(raw_clusters)

In [None]:
# write clusters to file and download
from google.colab import files
# Persist the clusters as indented JSON, then hand the file to the browser
with open('raw_clusters.json', mode='w', encoding="utf-8") as out_file:
    out_file.write(json.dumps(raw_clusters, indent=4))
files.download('raw_clusters.json')

# ***Biography Table***

In [None]:
# Get data from Wikipedia's biography table (infobox)
# Some philosophers' tables include the fields 'influencers' and 'influenced'
import pprint
def _titles_in(nav_list):
    """Return the non-missing ``title`` values of the links inside *nav_list*."""
    # An empty list stands for "section not present in the infobox"
    if not nav_list:
        return []
    titles = (a.get('title') for a in nav_list.find_all('a'))
    # Links without a title attribute would otherwise end up as None entries
    return [title for title in titles if title is not None]


def bio_table(url):
    """Extract the influence lists from a Wikipedia biography table.

    Args:
        url: Address of the Wikipedia page to download and parse.

    Returns:
        A tuple ``(influencers, influenced)`` of lists of link titles. The
        first ``NavContent`` list of the biography table is read as the
        influencers and the second one as the influenced. A list is empty
        when its section is missing, and both are empty when the page has no
        biography table at all.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be fetched.
    """
    # open url with bs; the response is closed as soon as parsing is done
    with urllib.request.urlopen(url) as page:
        soup = bs(page, "lxml")
    # get biography table
    table = soup.find('table', class_='infobox biography vcard')
    if table is None:
        # Page without a biography table: report "no data" explicitly rather
        # than failing with an AttributeError on None
        return [], []
    # Look the collapsible lists up once and reuse the result
    nav_lists = table.find_all('ul', class_='NavContent')
    # diagnostic output: number of lists found
    print(len(nav_lists))
    # get influencers unordered list
    influencers = nav_lists[0] if len(nav_lists) > 0 else []
    # get influenced unordered list
    influenced = nav_lists[1] if len(nav_lists) > 1 else []
    # diagnostic output: the raw "influenced" list
    print(influenced)
    # We want a list of titles of wikipedia pages
    return _titles_in(influencers), _titles_in(influenced)

# ***Scraping philosophers lists from wikipedia***

In [None]:
# Wikipedia has 4 pages listing philosophers, (A-C), (D-H), (I-Q), (R-Z)
urls = [
    "https://en.wikipedia.org/wiki/List_of_philosophers_(A%E2%80%93C)",
    "https://en.wikipedia.org/wiki/List_of_philosophers_(D%E2%80%93H)",
    "https://en.wikipedia.org/wiki/List_of_philosophers_(I%E2%80%93Q)",
    "https://en.wikipedia.org/wiki/List_of_philosophers_(R%E2%80%93Z)",
]

# Initialising json
# The five lists below are parallel: every processed list item appends exactly
# one entry to each of them, so index i always describes the same item.
# Philosopher
phil = []
# Page url
href = []
# Article
raw = []
# Influence bio-table data
influencers = []
influenced = []
# For every list of philosophers
for url in urls:
    # Parse page with beautiful soup; the response is closed after parsing
    with urllib.request.urlopen(url) as page:
        soup = bs(page, "lxml")
    # Parse the list, unordered list
    for ultag in soup.find_all('ul'):
        # Access the list item
        for litag in ultag.find_all('li'):
            # First link of the list item (None when the item has no link)
            anchor = litag.a
            link = anchor.get('href') if anchor is not None else None
            title = anchor.get('title') if anchor is not None else None

            if link is None:
                # Invalid list item. Reset wikiurl so that the bio-table step
                # below cannot reuse the URL of the previously processed item.
                wikiurl = None
                print('No HREF')
                href.append('NONE')
            else:
                # Get href
                print('HREF: ' + link)
                wikiurl = starturl + link
                print(wikiurl)
                href.append(wikiurl)

            if title is None:
                # No article: without a title there is nothing to query
                raw.append('NONE')
            else:
                try:
                    # Use wikipedia api to get article
                    response = requests.get(
                        'https://en.wikipedia.org/w/api.php',
                        params={
                            'action': 'query',
                            'format': 'json',
                            'titles': title,
                            'prop': 'extracts',
                            'explaintext': True,
                            'exlimit': 'max',
                        },
                        # Without a timeout a single stalled connection would
                        # block the whole crawl indefinitely
                        timeout=30,
                    ).json()
                except (requests.RequestException, ValueError):
                    # Network failure or a body that is not valid JSON
                    raw.append('NONE')
                else:
                    raw.append(response)

            if wikiurl is None:
                # Nothing to look up for an item without a link
                tempinfluencers, tempinfluenced = [], []
            else:
                try:
                    # Get influence bio-table data
                    tempinfluencers, tempinfluenced = bio_table(wikiurl)
                except Exception:
                    # No bio-table available (or the page could not be
                    # fetched). Best effort: keep the row with empty lists.
                    # `Exception` rather than a bare except so that an
                    # interrupt can still stop the crawl.
                    tempinfluencers, tempinfluenced = [], []
            influencers.append(tempinfluencers)
            influenced.append(tempinfluenced)
            # Set philosophers name
            phil.append(litag.text)
            print(litag.text)


# ***Process the result***

Transform the lists into a pandas DataFrame

In [None]:
# One row per scraped list item: start from the philosopher names and attach
# the remaining parallel lists as additional columns
df = pd.DataFrame(phil, columns=['Philosopher']).assign(
    href=href,
    raw=raw,
    influencers=influencers,
    influenced=influenced,
)
df
# Spot-check a few raw API responses
df['raw'].sample(n=3)
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
#    print(df)

Convert the DataFrame to JSON

In [None]:
# Serialise the table as one JSON object keyed by row index
result = df.to_json(orient='index')
from google.colab import files
# Write through a context manager so the file is flushed and closed before the
# download is triggered
with open('result.json', 'w', encoding='utf-8') as f:
    f.write(result)
files.download('result.json')

Transform the result

In [None]:
# We want to transform the result
# INPUT: json with fields [id][raw], [id][influencers], [id][influenced] -> we have to extract pageid, philosophers and article from raw (which is the raw response from the wikipedia api)
# OUTPUT: json with fields [pageid], [philosopher], [article], [table] -> [table] will contain influencers and influenced (biography table data)
with open('result.json', 'r', encoding='utf-8') as f:
    contents = json.load(f)
# prints for logging purposes; guarded because entry '67' may be missing or may
# hold an invalid response, and a debug print must not abort the whole cell
try:
    print(contents['67']['raw']['query']['pages'])
    print(contents['67']['raw'])
except (KeyError, TypeError):
    print('Sample entry 67 is not available')
# json_list is a list of dictionaries
json_list = []
# for each entry
for entry in contents.values():
    try:
        pages = entry['raw']['query']['pages']
        # The first key of the pages mapping is used as the page id
        pageid = list(pages)[0]
        page_info = pages[pageid]
        # Key order ('table' first) determines the field order in the output
        json_object = {
            'table': {
                'influencers': entry['influencers'],
                'influenced': entry['influenced'],
            },
            'pageid': pageid,
            'philosopher': page_info['title'],
            'article': page_info['extract'],
        }
    except (KeyError, TypeError, IndexError):
        # raw is the 'NONE' placeholder, or the response lacks the query,
        # pages, title or extract fields
        print('Invalid page')
    else:
        json_list.append(json_object)
# write and download final result
with open('final_result.json', 'w', encoding='utf-8') as f:
    json.dump(json_list, f, ensure_ascii=False, indent=4)
files.download('final_result.json')