# Webpage search and download - simple example

**Install the specifically required packages:**

    conda install beautifulsoup4
    conda install urllib3

This is an example of how to download files from a simple webpage (Wikipedia). Its purpose is to teach the basic principles of doing this with urllib3 and BeautifulSoup in Python.

In [1]:
import shutil

In [2]:
import re, os

In [3]:
import urllib3
from bs4 import BeautifulSoup

## Download one file from a Wikipedia page

In [4]:
# address of the page whose images we want to find and download
url = "https://en.wikipedia.org/wiki/Cat_intelligence"

In [5]:
# set up a urllib3 PoolManager; it is the object used to send the HTTP request
http = urllib3.PoolManager()

In [6]:
# send a GET request for the page; the body is available as response.data
response = http.request('GET', url)

# urllib3 returns error responses instead of raising, so fail loudly here
# rather than parsing an error page and ending up with no usable images
if response.status != 200:
    raise RuntimeError('GET {} failed with HTTP status {}'.format(url, response.status))



In [7]:
# parse the response body with BeautifulSoup; naming the parser explicitly
# avoids bs4's "no parser was explicitly specified" warning and keeps the
# result independent of which optional parsers happen to be installed
soup = BeautifulSoup(response.data, 'html.parser')

In [8]:
# display the parsed document (this echoes the page's HTML)
soup

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>Cat intelligence - Wikipedia</title>
<script>document.documentElement.className=document.documentElement.className.replace(/(^|\s)client-nojs(\s|$)/,"$1client-js$2");RLCONF={"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"Cat_intelligence","wgTitle":"Cat intelligence","wgCurRevisionId":904358062,"wgRevisionId":904358062,"wgArticleId":6648008,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with self-published sources","Articles with self-published sources from June 2013","Wikipedia articles needing page number citations from June 2013","All articles with failed verification","Articles with failed verification from February 2012","CS1 maint: Multiple names: authors list","Webarchive template wayback links","All articles lacking reliable references","Articles lacking reli

In [9]:
# collect the src attribute of every <img> tag on the page;
# src=True skips tags without a src attribute, so the list never contains
# None (which would break the string handling in later cells)
images = [img['src'] for img in soup.find_all('img', src=True)]

In [10]:
# inspect the collected src values
images

['//upload.wikimedia.org/wikipedia/en/thumb/b/b4/Ambox_important.svg/40px-Ambox_important.svg.png',
 '//upload.wikimedia.org/wikipedia/commons/thumb/6/66/An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg/220px-An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Cat_brain.jpg/220px-Cat_brain.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/4/48/RedCat_8727.jpg/220px-RedCat_8727.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/28px-Cat03.jpg',
 '//upload.wikimedia.org/wikipedia/commons/thumb/8/89/Symbol_book_class2.svg/16px-Symbol_book_class2.svg.png',
 '//upload.wikimedia.org/wikipedia/en/thumb/4/48/Folder_Hexagonal_Icon.svg/16px-Folder_Hexagonal_Icon.svg.png',
 '//upload.wikimedia.org/wikipedia/en/thumb/f/fd/Portal-puzzle.svg/16px-Portal-puzzle.svg.png',
 '//upload.wikimedia.org/wikipedia/en/thumb/4/48/Folder_Hexagonal_Icon.svg/16px-Folder_Hexagonal_Icon.svg.png',


In [11]:
# url for one image: the src values are scheme-relative (they start with '//'),
# so the scheme is prefixed to obtain an absolute URL
url2 = 'https:'+images[1]

In [12]:
# check the assembled absolute URL
url2

'https://upload.wikimedia.org/wikipedia/commons/thumb/6/66/An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg/220px-An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg'

In [13]:
# local file name for the download; url2 points to a JPEG image,
# so the file gets a matching extension
filename = 'testfile.jpg'

In [14]:
c = urllib3.PoolManager()

# preload_content=False streams the body, so the image is copied to disk in
# chunks rather than being loaded into memory first; the `with` statement
# closes the response when the block exits, so no extra release call is needed
with c.request('GET', url2, preload_content=False) as resp:
    # urllib3 returns error responses instead of raising; check the status
    # before opening the file so an error page is never saved as an image
    if resp.status != 200:
        raise RuntimeError('GET {} failed with HTTP status {}'.format(url2, resp.status))
    with open(filename, 'wb') as out_file:
        shutil.copyfileobj(resp, out_file)



## Download filelist

Now we repeat the download for a list of files.

In [15]:
# keep only the images whose src contains 'cat' (case-insensitive)
# and store them as absolute https URLs
cat_pattern = re.compile(r'cat', re.IGNORECASE)
image_list = []
for image in images:
    if cat_pattern.search(image) is None:
        continue
    urlx = 'https:' + image
    image_list.append(urlx)

In [16]:
# inspect the filtered, absolute image URLs
image_list

['https://upload.wikimedia.org/wikipedia/commons/thumb/6/66/An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg/220px-An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/e/e6/Cat_brain.jpg/220px-Cat_brain.jpg',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/4/48/RedCat_8727.jpg/220px-RedCat_8727.jpg',
 'https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/28px-Cat03.jpg']

In [17]:
# a URL can be broken into its components by splitting at every '/'
first_image_url = image_list[0]
words = first_image_url.split('/')
words

['https:',
 '',
 'upload.wikimedia.org',
 'wikipedia',
 'commons',
 'thumb',
 '6',
 '66',
 'An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg',
 '220px-An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg']

In [18]:
# the second-to-last component of this thumbnail URL is the original file name
words[-2]

'An_up-close_picture_of_a_curious_male_domestic_shorthair_tabby_cat.jpg'

In [19]:
# make a folder for the downloaded pictures; exist_ok=True keeps the cell safe
# to re-run and avoids the check-then-create race of os.path.exists + os.mkdir
folder = 'catpictures'
os.makedirs(folder, exist_ok=True)

In [20]:
# download all selected pictures into `folder`
c = urllib3.PoolManager()

for image in image_list:
    # for the thumbnail URLs collected above, the second-to-last component is
    # the original file name (the last one is the size-prefixed thumbnail name)
    words = image.split('/')
    filename = os.path.join(folder, words[-2])
    # stream each body to disk; the `with` statement closes every response,
    # so nothing is left to release after the loop
    with c.request('GET', image, preload_content=False) as resp:
        # urllib3 returns error responses instead of raising; check the status
        # before opening the file so an error page is never saved as an image
        if resp.status != 200:
            raise RuntimeError('GET {} failed with HTTP status {}'.format(image, resp.status))
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(resp, out_file)

