# Using Wikipedia API to get Artificial Intelligence articles

In [1]:
import sqlite3
import wptools
import re
from bs4 import BeautifulSoup

In [2]:
category = 'Category:Artificial_intelligence'
depth = 1

In [11]:
class crawl_wikipedia:
    def __init__(self, db_file):
        self.categories=[]
        #self.pageids=[]
        self.count=0
        # Create db
        self.conn = sqlite3.connect(db_file)
        c = self.conn.cursor()
        c.execute('''CREATE TABLE IF NOT EXISTS content
            (pageid text, url text, content text)''')
        self.conn.commit()
        self.cursor = self.conn.cursor()
     
    def save_page_content(self, pageid, url, content):
        self.cursor.execute("INSERT INTO content VALUES (?,?,?)",
            (pageid, url, content))
        self.conn.commit()
        
    def get_page_content(self, columns='*'):
        # TODO - Add in ability to select single columns
        output = []
        for row in self.cursor.execute("SELECT url FROM content"):
            output.append(row)
            print(row)
        return row
            
    def get_categories_and_members(self, category, depth):
        print('Checking for subcategories of {} at depth {}'.format(category, depth))
        if depth:
            cat = wptools.category(category)
            cat_members = cat.get_members()
            # First let's save any members (pages) for this category
            if 'members' in cat_members.data.keys():
                for cat_member in cat_members.data['members']:
                    # print('Appending {} to pageids'.format(cat_member['pageid']))                   
                    #self.pageids.append(cat_member['pageid'])
                    page = wptools.page(pageid=cat_member['pageid']).get_parse()
                    # Remove <ref> and other HTML syntax
                    text = BeautifulSoup(page.data['wikitext'], 'html.parser').get_text()
                    # Remove other markup such as [[...]] and {{...}}
                    clean_content = re.sub(r"\s*{.*}\s*|\s*\[.*\]\s*", " ", text)
                    # Get URL in wikipedia
                    url = page.get_query().data['url']
                    # Now store
                    print('Saving pageid {} / url {}'.format(cat_member['pageid'], url))
                    self.save_page_content(cat_member['pageid'], url, clean_content)
            # Now iterate through any subcategories
            if 'subcategories' in cat_members.data.keys():
                subcats = cat_members.data['subcategories']               
                for subcat in subcats: 
                    self.categories.append(subcat)
                    self.count += 1
                    #print('Appending {} / count = {}'.format(subcat['title'], self.count))
                    self.get_categories_and_members(subcat['title'], depth - 1)
            

In [12]:
crawler = crawl_wikipedia('content.db')

In [None]:
crawler.get_categories_and_members(category, depth)

In [13]:
len(crawler.categories)

0

In [15]:
x=crawler.get_page_content()

('https://en.wikipedia.org/wiki/Artificial_intelligence',)
('https://en.wikipedia.org/wiki/Outline_of_artificial_intelligence',)
('https://en.wikipedia.org/wiki/List_of_artificial_intelligence_projects',)
('https://en.wikipedia.org/wiki/List_of_programming_languages_for_artificial_intelligence',)
('https://en.wikipedia.org/wiki/0music',)
('https://en.wikipedia.org/wiki/3D_reconstruction_from_multiple_images',)
('https://en.wikipedia.org/wiki/20Q',)
('https://en.wikipedia.org/wiki/ACROSS_Project',)
('https://en.wikipedia.org/wiki/Action_selection',)
('https://en.wikipedia.org/wiki/Admissible_heuristic',)
('https://en.wikipedia.org/wiki/ADS-AC',)
('https://en.wikipedia.org/wiki/Agent_systems_reference_model',)
('https://en.wikipedia.org/wiki/AgentSheets',)
('https://en.wikipedia.org/wiki/A.I._Artificial_Intelligence',)
('https://en.wikipedia.org/wiki/AI-complete',)
('https://en.wikipedia.org/wiki/AIVA',)
('https://en.wikipedia.org/wiki/Alesis_Artificial_Intelligence',)
('https://en.wikip

('https://en.wikipedia.org/wiki/Wearable_technology',)
('https://en.wikipedia.org/wiki/Wikirating',)
('https://en.wikipedia.org/wiki/Wizard_of_Oz_experiment',)
('https://en.wikipedia.org/wiki/World_Wide_Web',)
('https://en.wikipedia.org/wiki/Smartwatch',)
('https://en.wikipedia.org/wiki/Knowledge_engineering',)
('https://en.wikipedia.org/wiki/Collaborative_information_seeking',)
('https://en.wikipedia.org/wiki/Collaborative_innovation_network',)
('https://en.wikipedia.org/wiki/Conceptualization_(information_science)',)
('https://en.wikipedia.org/wiki/Conference_on_Semantics_in_Healthcare_and_Life_Sciences',)
('https://en.wikipedia.org/wiki/D3web',)
('https://en.wikipedia.org/wiki/Data_%26_Knowledge_Engineering',)
('https://en.wikipedia.org/wiki/Decision_support_system',)
('https://en.wikipedia.org/wiki/Frame_language',)
('https://en.wikipedia.org/wiki/Integrated_Operations_in_the_High_North',)
('https://en.wikipedia.org/wiki/Intelligent_decision_support_system',)
('https://en.wikipedia

In [16]:
x

('https://en.wikipedia.org/wiki/Zeuthen_strategy',)

In [288]:
page = wptools.page(pageid=1648132).get_parse()

en.wikipedia.org (parse) 1648132
Weak AI (en) data
{
  pageid: 1648132
  parsetree: <str(3573)> <root><template><title>Use dmy dates</tit...
  requests: <list(1)> parse
  title: Weak AI
  wikibase: Q17097955
  wikidata_url: https://www.wikidata.org/wiki/Q17097955
  wikitext: <str(2870)> {{Use dmy dates|date=January 2015}}'''Weak...
}


In [289]:
page.get_parse()

+ parse results in cache


<wptools.page.WPToolsPage at 0x7f5b8430bb00>

In [290]:
text = page.data['wikitext']

In [291]:
page.get_query().data['url']

en.wikipedia.org (query) Weak AI
Weak AI (en) data
{
  assessments: <dict(5)> Linguistics, Systems, Computing, Technolo...
  extext: <str(1261)> **Weak artificial intelligence** ( **weak AI...
  extract: <str(1305)> <p><b>Weak artificial intelligence</b> (<b>...
  label: Weak AI
  length: 2,893
  links: <list(13)> Artificial general intelligence, Artificial in...
  modified: <dict(1)> page
  pageid: 1648132
  parsetree: <str(3573)> <root><template><title>Use dmy dates</tit...
  random: Arturo Mas
  redirects: <list(4)> {'pageid': 7869823, 'ns': 0, 'title': 'Weak...
  requests: <list(2)> parse, query
  title: Weak AI
  url: https://en.wikipedia.org/wiki/Weak_AI
  url_raw: https://en.wikipedia.org/wiki/Weak_AI?action=raw
  watchers: 33
  wikibase: Q17097955
  wikidata_url: https://www.wikidata.org/wiki/Q17097955
  wikitext: <str(2870)> {{Use dmy dates|date=January 2015}}'''Weak...
}


'https://en.wikipedia.org/wiki/Weak_AI'

In [292]:
from bs4 import BeautifulSoup

In [293]:
soup = BeautifulSoup(text, 'html.parser')

In [294]:
txt2 = soup.get_text()

In [295]:
import re
re.sub(r"\s*{.*}\s*|\s*\[.*\]\s*", " ", txt2)

' \'\'\'Weak artificial intelligence\'\'\' (\'\'\'weak AI\'\'\'), also known as \'\'\'narrow AI\'\'\',io9.com mentions narrow AI. Published 1 April 2013, retrieved 16 February 2014: http://io9.com/how-much-longer-before-our-first-ai-catastrophe-464043243AI researcher Ben Goertzel explains why he became interested in AGI instead of narrow AI. Published 18 Oct 2013. Retrieved 16 February 2014. http://intelligence.org/2013/10/18/ben-goertzel/TechCrunch discusses AI App building regarding Narrow AI. Published 16 Oct 2015, retrieved 17 Oct 2015. https://techcrunch.com/2015/10/15/machine-learning-its-the-hard-problems-that-are-valuable/ is (a machine with the ability to apply intelligence to any problem, rather than just one specific problem). All currently existing systems considered artificial intelligence of any sort are weak AI at most. , on his blog in 2010, stated Siri was "VERY narrow and brittle" evidenced by annoying results if you ask questions outside the limits of the application