In [3]:
from IPython.display import Image
from IPython.core.display import HTML 
Image(url= "https://upload.wikimedia.org/wikipedia/commons/b/b9/CRISP-DM_Process_Diagram.png",width=600,height=600)

# Web scraping for Data Collection

Web scraping is one popular method to collect data from the web. This method requires some knowledge of html & **xpath**.

When you collect the data from websites, you need to be mindful about the website's terms and conditions. Two examples below are created for educational purpose.

# XPath
- Xpath is a query language to access specific contents or data on the web page (web scraping)
- Xpath is used to collect data from webpages

# XPath Tutorial

- http://zvon.org/comp/r/tut-XPath_1.html#Pages~List_of_XPaths
- http://zvon.org/xxl/XPathTutorial/General/examples.html
- We'll learn Xpath and Web scraping by doing

## This is a web page (http://sfbay.craigslist.org/search/npo)

<img src="images\craiglist.gif">

## This is the html page of the above webpage

<img src="images\craiglist_title.gif">

## Some basic terms to know for web scraping

- nodes (or elements): html tags such as div, section, span, a, p, etc.
- attributes: more information included inside html tags, such as class, href, datetime, title, data-id, etc.
- children: the nodes or elements in lower hierarchical levels (e.g., a is a child of span, section is a child of div)
- parent: opposite of children

## Python modules used in this lecture 

1. For web scraping, you need to use a python moduel (or package) called **Request** to **access HTML file or web page** http://docs.python-requests.org/en/latest/

2. Then, **HTML and XML need to be processed** using **lxml**, a Python package http://lxml.de/
3. **csv** and **pandas** Python modules for **saving data in csv files**

# Example 1
- http://sfbay.craigslist.org/search/npo

In [4]:
import requests
from lxml import html
import csv
import pandas as pd

# visit the webpage and access the web contents
r = requests.get('http://sfbay.craigslist.org/search/npo')
data = html.fromstring(r.text)

In [5]:
# xpath: collect the specific data you're looking for
title = data.xpath("//p[@class='result-info']/a/text()")


print title

['Soccer Coordinator for Hayward SCORES--Soccer (hayward)', 'Mental Health: Full-time Care Manager, evening/weekend shifts', 'Mental Health Care Manager - Relief/On Call', '2 days fulltime with benefits, Residential Relief Manager Support/Care', 'HEALTH CARE COORDINATOR', 'Weekend job, 2nd job, Caregiver for adult with intellectual disability', 'Make a difference, Caregiver for adults with intellectual disability.', 'Sr. Administrative Assistant', 'Galileo Learning looking for year-round Tech Museum program manager!', 'Job Coach - Part Time/Substitute', 'Community Support Facilitator - Santa Clara', 'Bilingual Linkage Clinician', 'Bookkeeper (Temporary Position)', "Grant Writer - Support a nonprofit children's agency!", 'Clinical Director-Residential Treatment Center', 'Administrative Associate', "Bilingual Classroom Therapist - Children's Therapeutic Preschool", 'Clinical Training and Quality Assurance (QA) Director', 'Associate Director of Support Services', 'Property Manager (Suppor

In [6]:
#xpath: collect the url information
web_url = data.xpath("//p/a/@href")
web_url 

['https://sfbay.craigslist.org/eby/npo/d/soccer-coordinator-for/6319462765.html',
 'https://sfbay.craigslist.org/nby/npo/d/mental-health-full-time-care/6319110625.html',
 'https://sfbay.craigslist.org/nby/npo/d/mental-health-care-manager/6319105549.html',
 'https://sfbay.craigslist.org/nby/npo/d/2-days-fulltime-with-benefits/6319001908.html',
 'https://sfbay.craigslist.org/nby/npo/d/health-care-coordinator/6318995526.html',
 'https://sfbay.craigslist.org/nby/npo/d/weekend-job-2nd-job-caregiver/6318990570.html',
 'https://sfbay.craigslist.org/nby/npo/d/make-difference-caregiver-for/6318979163.html',
 'https://sfbay.craigslist.org/eby/npo/d/sr-administrative-assistant/6317265137.html',
 'https://sfbay.craigslist.org/sby/npo/d/galileo-learning-looking-for/6317256378.html',
 'https://sfbay.craigslist.org/sby/npo/d/job-coach-part-time-substitute/6317223846.html',
 'https://sfbay.craigslist.org/sby/npo/d/community-support-facilitator/6317220415.html',
 'https://sfbay.craigslist.org/sfc/npo/d

In [7]:
# combine two columns
r = zip(title, web_url)
r
# changing the data more like Excel format
r_df = pd.DataFrame(r)
r_df
r_df.to_csv('r.csv',encoding = "utf-8")
r_df



Unnamed: 0,0,1
0,Soccer Coordinator for Hayward SCORES--Soccer ...,https://sfbay.craigslist.org/eby/npo/d/soccer-...
1,"Mental Health: Full-time Care Manager, evening...",https://sfbay.craigslist.org/nby/npo/d/mental-...
2,Mental Health Care Manager - Relief/On Call,https://sfbay.craigslist.org/nby/npo/d/mental-...
3,"2 days fulltime with benefits, Residential Rel...",https://sfbay.craigslist.org/nby/npo/d/2-days-...
4,HEALTH CARE COORDINATOR,https://sfbay.craigslist.org/nby/npo/d/health-...
5,"Weekend job, 2nd job, Caregiver for adult with...",https://sfbay.craigslist.org/nby/npo/d/weekend...
6,"Make a difference, Caregiver for adults with i...",https://sfbay.craigslist.org/nby/npo/d/make-di...
7,Sr. Administrative Assistant,https://sfbay.craigslist.org/eby/npo/d/sr-admi...
8,Galileo Learning looking for year-round Tech M...,https://sfbay.craigslist.org/sby/npo/d/galileo...
9,Job Coach - Part Time/Substitute,https://sfbay.craigslist.org/sby/npo/d/job-coa...


# Example 2

* http://www.pythonscraping.com/pages/warandpeace.html

In [8]:
import requests
from lxml import html
import csv
import pandas as pd

r = requests.get('http://www.pythonscraping.com/pages/warandpeace.html')
data = html.fromstring(r.text)
print data

<Element html at 0x114e28940>


In [9]:
#get green texts
green = data.xpath("//span[@class='green']/text()")
green

['Anna\nPavlovna Scherer',
 'Empress Marya\nFedorovna',
 'Prince Vasili Kuragin',
 'Anna Pavlovna',
 'St. Petersburg',
 'the prince',
 'Anna Pavlovna',
 'Anna Pavlovna',
 'the prince',
 'the prince',
 'the prince',
 'Prince Vasili',
 'Anna Pavlovna',
 'Anna Pavlovna',
 'the prince',
 'Wintzingerode',
 'King of Prussia',
 'le Vicomte de Mortemart',
 'Montmorencys',
 'Rohans',
 'Abbe Morio',
 'the Emperor',
 'the prince',
 'Prince Vasili',
 'Dowager Empress Marya Fedorovna',
 'the baron',
 'Anna Pavlovna',
 'the Empress',
 'the Empress',
 "Anna Pavlovna's",
 'Her Majesty',
 'Baron\nFunke',
 'The prince',
 'Anna\nPavlovna',
 'the Empress',
 'The prince',
 'Anatole',
 'the prince',
 'The prince',
 'Anna\nPavlovna',
 'Anna Pavlovna']

Some data cleaning is needed (e.g., Anna\nPavlovna Scherer). \n means new line. We need to remove them. This is an important process (**data cleansing or transformation**) in data analytics. 

In [10]:
# get green texts
para_green = data.xpath("//span[@class='green']/text()")
for i in para_green:
   print i.replace('\n', ' ')

Anna Pavlovna Scherer
Empress Marya Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron Funke
The prince
Anna Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna Pavlovna
Anna Pavlovna


**print** (in the above statement) does not save anything. To save the extracted data, do the following (add brakets to the for loop)

In [11]:
para_green = [ i.replace('\n', ' ') for i in para_green ]
para_green

['Anna Pavlovna Scherer',
 'Empress Marya Fedorovna',
 'Prince Vasili Kuragin',
 'Anna Pavlovna',
 'St. Petersburg',
 'the prince',
 'Anna Pavlovna',
 'Anna Pavlovna',
 'the prince',
 'the prince',
 'the prince',
 'Prince Vasili',
 'Anna Pavlovna',
 'Anna Pavlovna',
 'the prince',
 'Wintzingerode',
 'King of Prussia',
 'le Vicomte de Mortemart',
 'Montmorencys',
 'Rohans',
 'Abbe Morio',
 'the Emperor',
 'the prince',
 'Prince Vasili',
 'Dowager Empress Marya Fedorovna',
 'the baron',
 'Anna Pavlovna',
 'the Empress',
 'the Empress',
 "Anna Pavlovna's",
 'Her Majesty',
 'Baron Funke',
 'The prince',
 'Anna Pavlovna',
 'the Empress',
 'The prince',
 'Anatole',
 'the prince',
 'The prince',
 'Anna Pavlovna',
 'Anna Pavlovna']

In [12]:
# get red texts
red = data.xpath("//span[@class = 'red']/text()")
red

["Well, Prince, so Genoa and Lucca are now just family estates of the\nBuonapartes. But I warn you, if you don't tell me that this means war,\nif you still try to defend the infamies and horrors perpetrated by\nthat Antichrist- I really believe he is Antichrist- I will have\nnothing more to do with you and you are no longer my friend, no longer\nmy 'faithful slave,' as you call yourself! But how do you do? I see\nI have frightened you- sit down and tell me all the news.",
 'If you have nothing better to do, Count [or Prince], and if the\nprospect of spending an evening with a poor invalid is not too\nterrible, I shall be very charmed to see you tonight between 7 and 10-\nAnnette Scherer.',
 'Heavens! what a virulent attack!',
 "First of all, dear friend, tell me how you are. Set your friend's\nmind at rest,",
 'Can one be well while suffering morally? Can one be calm in times\nlike these if one has any feeling?',
 'You are\nstaying the whole evening, I hope?',
 "And the fete at the Eng

In [13]:
# get red texts
for i in red:
    print i.replace('\n', ' ')



Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, if you don't tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that Antichrist- I really believe he is Antichrist- I will have nothing more to do with you and you are no longer my friend, no longer my 'faithful slave,' as you call yourself! But how do you do? I see I have frightened you- sit down and tell me all the news.
If you have nothing better to do, Count [or Prince], and if the prospect of spending an evening with a poor invalid is not too terrible, I shall be very charmed to see you tonight between 7 and 10- Annette Scherer.
Heavens! what a virulent attack!
First of all, dear friend, tell me how you are. Set your friend's mind at rest,
Can one be well while suffering morally? Can one be calm in times like these if one has any feeling?
You are staying the whole evening, I hope?
And the fete at the English ambassador's? Today is Wednesday.

In [14]:
# get all texts
alltexts = data.xpath("//div/text()")
alltexts

['\n"',
 '"\n',
 '\nIt was in July, 1805, and the speaker was the well-known ',
 ', maid of honor and favorite of the ',
 '. With these words she greeted ',
 ', a man\nof high rank and importance, who was the first to arrive at her\nreception. ',
 ' had had a cough for some days. She was, as\nshe said, suffering from la grippe; grippe being then a new word in\n',
 ', used only by the elite.\n',
 '\nAll her invitations without exception, written in French, and\ndelivered by a scarlet-liveried footman that morning, ran as follows:\n',
 '\n"',
 '"\n',
 '\n"',
 '" replied ',
 ', not in the\nleast disconcerted by this reception. He had just entered, wearing\nan embroidered court uniform, knee breeches, and shoes, and had\nstars on his breast and a serene expression on his flat face. He spoke\nin that refined French in which our grandfathers not only spoke but\nthought, and with the gentle, patronizing intonation natural to a\nman of importance who had grown old in society and at court. He w

## text() vs text_content()
- text_content() - Returns the text content of the element, **including the text content of its children**, with no markup.

In [15]:
# get all inside span tag
allspan = data.xpath("//span/text()")
for i in allspan:
    print i.replace('\n', ' ')

Well, Prince, so Genoa and Lucca are now just family estates of the Buonapartes. But I warn you, if you don't tell me that this means war, if you still try to defend the infamies and horrors perpetrated by that Antichrist- I really believe he is Antichrist- I will have nothing more to do with you and you are no longer my friend, no longer my 'faithful slave,' as you call yourself! But how do you do? I see I have frightened you- sit down and tell me all the news.
Anna Pavlovna Scherer
Empress Marya Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
If you have nothing better to do, Count [or Prince], and if the prospect of spending an evening with a poor invalid is not too terrible, I shall be very charmed to see you tonight between 7 and 10- Annette Scherer.
Heavens! what a virulent attack!
the prince
Anna Pavlovna
First of all, dear friend, tell me how you are. Set your friend's mind at rest,
Can one be well while suffering morally? Can one be calm in times like these if one

In [16]:
# get the content of 1st span
para_1 = data.xpath("//span[1]/text()")
para_1


["Well, Prince, so Genoa and Lucca are now just family estates of the\nBuonapartes. But I warn you, if you don't tell me that this means war,\nif you still try to defend the infamies and horrors perpetrated by\nthat Antichrist- I really believe he is Antichrist- I will have\nnothing more to do with you and you are no longer my friend, no longer\nmy 'faithful slave,' as you call yourself! But how do you do? I see\nI have frightened you- sit down and tell me all the news.",
 'Wintzingerode',
 'le Vicomte de Mortemart',
 'Anatole']

In [17]:
# get the content of 2nd span
para_2 = data.xpath("//span[2]/text()")
para_2

['Anna\nPavlovna Scherer', 'King of Prussia', 'Montmorencys']

# Example 3 

- https://en.wikipedia.org/wiki/Kansas_State_University

In [18]:
import requests
from lxml import html
import csv
import pandas as pd

r = requests.get('https://en.wikipedia.org/wiki/Kansas_State_University')
data = html.fromstring(r.text)

In [19]:
# get main texts
maintexts = data.xpath("//p/text()")
maintexts

[', commonly shortened to ',
 ' or ',
 ', is a ',
 ' ',
 ' with its main ',
 ' in ',
 ', ',
 ", United States. Kansas State was opened as the state's ",
 u' in 1863 \u2013 the first public institution of higher learning in the state of Kansas.',
 ' It had a record high enrollment of 24,766 students for the Fall 2014 semester.',
 ' are in ',
 ' and ',
 '. The Kansas State University Polytechnic Campus in Salina is home to the College of Technology and Aviation. The Olathe Innovation Campus is the academic research presence within the Kansas Bioscience Park, where graduate students participate in research bioenergy, animal health, plant science and food safety and security.',
 'The university is classified as a research university with highest research activity (R1) by the ',
 ". Kansas State's academic offerings are administered through nine colleges, including the ",
 ' and the ',
 " in Salina. Graduate degrees offered include 65 master's degree programs and 45 doctoral degrees.",
 'Ka

In [20]:
# get main texts
maintexts = data.xpath("//p/text()")
for i in maintexts:
    maintexts_df = i.replace('\n', ' ')
maintexts_df


', the first Hispanic female president of an American public university.'

In [21]:
# get the list of Fraternities
fra = data.xpath("//td[1]/ul/li/a/text()")
fra

['Acacia',
 'Alpha Gamma Rho',
 'Alpha Kappa Lambda',
 'Alpha Phi Alpha',
 'Alpha Phi Omega',
 'Alpha Tau Omega',
 'Alpha Chi Sigma',
 'Beta Sigma Psi',
 'Beta Theta Pi',
 'Delta Chi',
 'Delta Lambda Phi',
 'Delta Sigma Phi',
 'Delta Upsilon',
 'FarmHouse',
 'Iota Phi Theta',
 'Kappa Alpha Psi',
 'Kappa Sigma',
 'Lambda Chi Alpha',
 'Omega Psi Phi',
 'Phi Beta Sigma',
 'Phi Delta Theta',
 'Phi Gamma Delta',
 'Phi Kappa Theta',
 'Pi Kappa Alpha',
 'Pi Kappa Phi',
 'Sigma Alpha Epsilon',
 'Sigma Chi',
 'Sigma Lambda Beta',
 'Sigma Nu',
 'Sigma Phi Epsilon',
 'Sigma Pi',
 'Tau Kappa Epsilon',
 'Theta Xi',
 'Triangle']

In [22]:
# get the list of Fraternities URLs
fra_url= data.xpath("//td[1]/ul/li/a/@href")
fra_url

['/wiki/Acacia_Fraternity',
 '/wiki/Alpha_Gamma_Rho',
 '/wiki/Alpha_Kappa_Lambda',
 '/wiki/Alpha_Phi_Alpha',
 '/wiki/Alpha_Phi_Omega',
 '/wiki/Alpha_Tau_Omega',
 '/wiki/Alpha_Chi_Sigma',
 '/wiki/Beta_Sigma_Psi',
 '/wiki/Beta_Theta_Pi',
 '/wiki/Delta_Chi',
 '/wiki/Delta_Lambda_Phi',
 '/wiki/Delta_Sigma_Phi',
 '/wiki/Delta_Upsilon',
 '/wiki/FarmHouse',
 '/wiki/Iota_Phi_Theta',
 '/wiki/Kappa_Alpha_Psi',
 '/wiki/Kappa_Sigma',
 '/wiki/Lambda_Chi_Alpha',
 '/wiki/Omega_Psi_Phi',
 '/wiki/Phi_Beta_Sigma',
 '/wiki/Phi_Delta_Theta',
 '/wiki/Phi_Gamma_Delta',
 '/wiki/Phi_Kappa_Theta',
 '/wiki/Pi_Kappa_Alpha',
 '/wiki/Pi_Kappa_Phi',
 '/wiki/Sigma_Alpha_Epsilon',
 '/wiki/Sigma_Chi',
 '/wiki/Sigma_Lambda_Beta',
 '/wiki/Sigma_Nu',
 '/wiki/Sigma_Phi_Epsilon',
 '/wiki/Sigma_Pi',
 '/wiki/Tau_Kappa_Epsilon',
 '/wiki/Theta_Xi',
 '/wiki/Triangle_Fraternity']

In [23]:
import csv
import pandas as pd

fra = data.xpath("//td[1]/ul/li/a/text()")
fra_url= data.xpath("//td[1]/ul/li/a/@href")

#combining two columns
fra_data = zip(fra, fra_url)
#chaning the data format to excel-like
fra_data = pd.DataFrame(fra_data)
#finally saving the data
fra_data.to_csv("data/output_fradata.csv")

IOError: [Errno 2] No such file or directory: 'data/output_fradata.csv'

In [26]:
# get the sororities list, their urls

sor = data.xpath("//td[2]/ul/li/a/text()")
sor_url = data.xpath("//td[2]/ul/li/a/@href")

sor_data = zip(sor, sor_url)
sor_data_df = pd.DataFrame(sor_data)
print sor_data_df





                     0                                                  1
0      Alpha Chi Omega                              /wiki/Alpha_Chi_Omega
1       Alpha Delta Pi                               /wiki/Alpha_Delta_Pi
2    Alpha Gamma Delta                            /wiki/Alpha_Gamma_Delta
3    Alpha Kappa Alpha                            /wiki/Alpha_Kappa_Alpha
4       Alpha Xi Delta                               /wiki/Alpha_Xi_Delta
5       Beta Sigma Chi  /w/index.php?title=Beta_Sigma_Chi&action=edit&...
6            Chi Omega                                    /wiki/Chi_Omega
7    Delta Delta Delta                            /wiki/Delta_Delta_Delta
8    Delta Sigma Theta                            /wiki/Delta_Sigma_Theta
9       Gamma Phi Beta                               /wiki/Gamma_Phi_Beta
10    Gamma Rho Lambda                             /wiki/Gamma_Rho_Lambda
11   Kappa Alpha Theta                            /wiki/Kappa_Alpha_Theta
12         Kappa Delta                

In [25]:
# save the data
sor = data.xpath("//td[2]/ul/li/a/text()")
sor_url = data.xpath("//td[2]/ul/li/a/@href")

sor_data = zip(sor, sor_url)
sor_data_df = pd.DataFrame(sor_data)
sor_data_df.to_csv("data/output_sordata.csv")



IOError: [Errno 2] No such file or directory: 'data/output_sordata.csv'

# Example 4

- http://econpy.pythonanywhere.com/ex/001.html

In [25]:
# visit the webpage and access the web contents
r = requests.get('http://econpy.pythonanywhere.com/ex/001.html')
data = html.fromstring(r.text)

In [26]:
#create a list of buyers:
b = data.xpath("//div[@title = 'buyer-name']/text()")


#create a list of prices
p = data.xpath("//span[@class = 'item-price']/text()")



# combine two columns
buyer_info = zip(b,p)
buyer_info

# changing the data more like Excel format
buyer_info_df = pd.DataFrame(buyer_info)
buyer_info_df


Unnamed: 0,0,1
0,Carson Busses,$29.95
1,Earl E. Byrd,$8.37
2,Patty Cakes,$15.26
3,Derri Anne Connecticut,$19.25
4,Moe Dess,$19.25
5,Leda Doggslife,$13.99
6,Dan Druff,$31.57
7,Al Fresco,$8.49
8,Ido Hoe,$14.47
9,Howie Kisses,$15.86


### Simple data analysis

In [27]:
# view the first five row
buyer_info_df.head()
buyer_info_df.tail()
buyer_info_df.iloc[[2]]
buyer_info_df.iloc[2:5]

Unnamed: 0,0,1
2,Patty Cakes,$15.26
3,Derri Anne Connecticut,$19.25
4,Moe Dess,$19.25


In [28]:
#find out the dataset
buyer_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
0    20 non-null object
1    20 non-null object
dtypes: object(2)
memory usage: 392.0+ bytes


#### Data types in Python

There are different **data types**:
    
1. object : string (e.g., last name, non-numeric values)
2. int : integer values
3. float : decimal values

In [29]:
#remove dollar sign
buyer_info_df[1] = buyer_info_df[1].str.replace('$', '')
buyer_info_df.head()

Unnamed: 0,0,1
0,Carson Busses,29.95
1,Earl E. Byrd,8.37
2,Patty Cakes,15.26
3,Derri Anne Connecticut,19.25
4,Moe Dess,19.25


In [30]:
buyer_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
0    20 non-null object
1    20 non-null object
dtypes: object(2)
memory usage: 392.0+ bytes


In [31]:
#convert object to number
buyer_info_df[1] = pd.to_numeric(buyer_info_df[1])
buyer_info_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
0    20 non-null object
1    20 non-null float64
dtypes: float64(1), object(1)
memory usage: 392.0+ bytes


0 and 1 are the default column names. You can change the column names.

In [32]:
df=buyer_info_df.rename(columns={0: 'Buyers', 1: 'Prices'})
df.head()

Unnamed: 0,Buyers,Prices
0,Carson Busses,29.95
1,Earl E. Byrd,8.37
2,Patty Cakes,15.26
3,Derri Anne Connecticut,19.25
4,Moe Dess,19.25


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
Buyers    20 non-null object
Prices    20 non-null float64
dtypes: float64(1), object(1)
memory usage: 392.0+ bytes


In [34]:
# summary statistics
df.describe()

Unnamed: 0,Prices
count,20.0
mean,21.8635
std,24.024161
min,5.68
25%,10.855
50%,15.13
75%,19.25
max,114.07


In [35]:
# simple descriptive statistics
df['Tax_Amounts'] = df['Prices'] * 0.0825
df.head()

Unnamed: 0,Buyers,Prices,Tax_Amounts
0,Carson Busses,29.95,2.470875
1,Earl E. Byrd,8.37,0.690525
2,Patty Cakes,15.26,1.25895
3,Derri Anne Connecticut,19.25,1.588125
4,Moe Dess,19.25,1.588125


In [46]:
df['Totals'] = df['Prices'] + df['Tax_Amounts']
df.head()

Unnamed: 0,Buyers,Prices,Tax_Amounts,Totals
0,Carson Busses,29.95,2.470875,32.420875
1,Earl E. Byrd,8.37,0.690525,9.060525
2,Patty Cakes,15.26,1.25895,16.51895
3,Derri Anne Connecticut,19.25,1.588125,20.838125
4,Moe Dess,19.25,1.588125,20.838125


In [47]:
df.describe()

Unnamed: 0,Prices,Tax_Amounts,Totals
count,20.0,20.0,20.0
mean,21.8635,1.803739,23.667239
std,24.024161,1.981993,26.006154
min,5.68,0.4686,6.1486
25%,10.855,0.895538,11.750538
50%,15.13,1.248225,16.378225
75%,19.25,1.588125,20.838125
max,114.07,9.410775,123.480775


In [51]:
# who paid most?
df.sort_values('Prices', ascending=False)
# another way
top = df.sort_values('Prices', ascending=False)
top.head()


Unnamed: 0,Buyers,Prices,Tax_Amounts,Totals
18,Rose Tattoo,114.07,9.410775,123.480775
14,Ave Sectomy,50.85,4.195125,55.045125
6,Dan Druff,31.57,2.604525,34.174525
0,Carson Busses,29.95,2.470875,32.420875
3,Derri Anne Connecticut,19.25,1.588125,20.838125


In [49]:
# who paid least?
df.sort_values('Prices', ascending=True)

Unnamed: 0,Buyers,Prices,Tax_Amounts,Totals
16,Bobbi Soks,5.68,0.4686,6.1486
13,Ben D. Rules,7.5,0.61875,8.11875
1,Earl E. Byrd,8.37,0.690525,9.060525
7,Al Fresco,8.49,0.700425,9.190425
19,Moe Tell,10.09,0.832425,10.922425
10,Len Lease,11.11,0.916575,12.026575
5,Leda Doggslife,13.99,1.154175,15.144175
15,Gary Shattire,14.26,1.17645,15.43645
8,Ido Hoe,14.47,1.193775,15.663775
17,Sheila Takya,15.0,1.2375,16.2375


# Example 5
- https://en.wikipedia.org/wiki/List_of_current_NBA_team_rosters

In [52]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_current_NBA_team_rosters')
data = html.fromstring(r.text)

In [53]:
# get names
name = data.xpath('//tr/td[3]/a/text()')
# get schools (or from)
school = data.xpath('//tr/td[7]/a/text()')

finaldata = zip(name, school)
finaldata

[('Allen, Kadeem', 'Arizona'),
 ('Baynes, Aron', 'Washington State'),
 ('Bird, Jabari', 'UC Berkeley'),
 ('Brown, Jaylen', 'UC Berkeley'),
 ('Hayward, Gordon', 'Butler'),
 ('Horford, Al', 'Florida'),
 ('Irving, Kyrie', 'Duke'),
 ('Larkin, Shane', 'Miami (FL)'),
 ('Morris, Marcus', 'Kansas'),
 ('Nader, Abdel', 'Iowa State'),
 ('Ojeleye, Semi', 'Southern Methodist'),
 ('Rozier, Terry', 'Louisville'),
 ('Smart, Marcus', 'Oklahoma State'),
 ('Tatum, Jayson', 'Duke'),
 ('Theis, Daniel', 'Germany'),
 ('Yabusele, Guerschon', 'France'),
 ('Acy, Quincy', 'Baylor'),
 ('Allen, Jarrett', 'Texas'),
 ('Booker, Trevor', 'Clemson'),
 ('Carroll, DeMarre', 'Missouri'),
 ('Crabbe, Allen', 'UC Berkeley'),
 ('Dinwiddie, Spencer', 'Colorado'),
 ('Doyle, Milton', 'Loyola (IL)'),
 ('Harris, Joe', 'Virginia'),
 ('Hollis-Jefferson, Rondae', 'Arizona'),
 ('Kilpatrick, Sean', 'Cincinnati'),
 ('LeVert, Caris', 'Michigan'),
 ('Lin, Jeremy', 'Harvard'),
 ('Mozgov, Timofey', 'Russia'),
 ('Ouattara, Yakuba', 'Ghana'),

The data collected does not contain all the players on the website. What is clear here is two players (**Temple, Garrett; Tolliver, Anthony**) are missing. Also, the player names and their colleges do not match.

The following players' **from** information is included in not **a** tag but **td[7]** tag

- Felício, Cristiano
- Kanter, Enes

In [54]:
name = data.xpath('//tr/td[3]/a/text()')
#multiple Xpaths using |
school = data.xpath('//tr/td[7]/a/text() | //tr/td[7]/text()')

finaldata = zip(name, school)
finaldata

[('Allen, Kadeem', 'Arizona'),
 ('Baynes, Aron', 'Washington State'),
 ('Bird, Jabari', 'UC Berkeley'),
 ('Brown, Jaylen', 'UC Berkeley'),
 ('Hayward, Gordon', 'Butler'),
 ('Horford, Al', 'Florida'),
 ('Irving, Kyrie', 'Duke'),
 ('Larkin, Shane', 'Miami (FL)'),
 ('Morris, Marcus', 'Kansas'),
 ('Nader, Abdel', 'Iowa State'),
 ('Ojeleye, Semi', 'Southern Methodist'),
 ('Rozier, Terry', 'Louisville'),
 ('Smart, Marcus', 'Oklahoma State'),
 ('Tatum, Jayson', 'Duke'),
 ('Theis, Daniel', 'Germany'),
 ('Yabusele, Guerschon', 'France'),
 ('Acy, Quincy', 'Baylor'),
 ('Allen, Jarrett', 'Texas'),
 ('Booker, Trevor', 'Clemson'),
 ('Carroll, DeMarre', 'Missouri'),
 ('Crabbe, Allen', 'UC Berkeley'),
 ('Dinwiddie, Spencer', 'Colorado'),
 ('Doyle, Milton', 'Loyola (IL)'),
 ('Harris, Joe', 'Virginia'),
 ('Hollis-Jefferson, Rondae', 'Arizona'),
 ('Kilpatrick, Sean', 'Cincinnati'),
 ('LeVert, Caris', 'Michigan'),
 ('Lin, Jeremy', 'Harvard'),
 ('Mozgov, Timofey', 'Russia'),
 ('Ouattara, Yakuba', 'Ghana'),

In [55]:
df = pd.DataFrame(finaldata)
df.head()

Unnamed: 0,0,1
0,"Allen, Kadeem",Arizona
1,"Baynes, Aron",Washington State
2,"Bird, Jabari",UC Berkeley
3,"Brown, Jaylen",UC Berkeley
4,"Hayward, Gordon",Butler


In [56]:
df=df.rename(columns={0: 'Players', 1: 'Schools'})
df.head()

Unnamed: 0,Players,Schools
0,"Allen, Kadeem",Arizona
1,"Baynes, Aron",Washington State
2,"Bird, Jabari",UC Berkeley
3,"Brown, Jaylen",UC Berkeley
4,"Hayward, Gordon",Butler


In [57]:
# most successful schools
df.groupby('Schools').count()

Unnamed: 0_level_0,Players
Schools,Unnamed: 1_level_1
Advanced Prep International (TX),1
Alabama,1
Alabama–Birmingham,1
American School (GRE),1
Argentina,2
Arizona,13
Arizona State,1
Arkansas,3
Baylor,5
Belmont,1


In [58]:
pop = df.groupby('Schools').count()
pop.sort_values('Players', ascending=False)

Unnamed: 0_level_0,Players
Schools,Unnamed: 1_level_1
Kentucky,27
Duke,20
North Carolina,17
Kansas,16
UCLA,14
Arizona,13
Texas,10
Florida,9
Syracuse,9
Washington,9


# Example 6

- http://www.basketball-reference.com/draft/
- http://www.basketball-reference.com/draft/BAA_1947.html    

In [60]:
r = requests.get('http://www.basketball-reference.com/draft/BAA_1947.html')
data = html.fromstring(r.text)

In [61]:
player = data.xpath("//td[@data-stat='player']/@csk")
player

['McNeeley,Clifton',
 'Selbo,Glen',
 'Ehlers,Bulbs',
 'Dropo,Walt',
 'Holub,Dick',
 'Crossin,Chink',
 'Underman,Jack',
 'Huston,Paul',
 "O'Keefe,Dick",
 'Killick,Larry',
 'Bolyard,Robert',
 'Gainer,Elmer',
 'Gallatin,Harry',
 'Hamilton,Scotty',
 'Hampton,Hugh',
 'Jake,Bob',
 'Raynor,Charles',
 'Reiser,Chick',
 'Rusinko,John',
 'Alemeida,Bob',
 'Biasatti,Hank',
 'Ezersky,Johnny',
 'Felt,George',
 'Hewson,Jack',
 'Kelly,John',
 'Petrovick,George',
 'Stump,Gene',
 'Bishop,Ralph',
 'Darden,Jimmy',
 'Decker,Hank',
 'Phillip,Andy',
 'Pollard,Jim',
 'Schadler,Ben',
 'Smith,Don',
 'Stone,Jack',
 'Vance,Gene',
 'Duncan,Andy',
 'Evans,Ray',
 'Golub,Ed',
 'Head,Garland',
 'Livingston,Ron',
 'Miller,Dan',
 'Misaka,Wat',
 'Reichert,Carl',
 'Tomlinson,Tom',
 'Butz,Norman',
 'Kaeding,Jim',
 'Koffenberger,Ed',
 'Pollard,Jim',
 'Alamo,Bob',
 'Brown,George',
 'Ives,Dick',
 'Knoche,Herman',
 'Nagy,Fritz',
 'Walton,Jack',
 'Barry,Joe',
 'Furey,Dick',
 'Hubbard,Bob',
 'Joyce,Bob',
 'Lipscomb,Roy',
 'Mills,

In [62]:
pts_per_g = data.xpath("//td[@data-stat='pts_per_g']/text()")
pts_per_g

['3.2',
 '8.1',
 '10.5',
 '4.7',
 '3.6',
 '5.2',
 '1.5',
 '13.0',
 '10.4',
 '1.0',
 '7.6',
 '2.7',
 '5.3',
 '8.1',
 '9.1',
 '13.2',
 '1.5',
 '0.8',
 '8.3',
 '5.5',
 '2.3',
 '13.2',
 '5.1',
 '3.6',
 '0.0',
 '13.2',
 '10.2',
 '10.9',
 '2.7',
 '4.1',
 '7.7',
 '5.4',
 '2.3',
 '4.9']

In [63]:
print len(player), len(pts_per_g)

80 34


Note: 80 players, but points-per-game are available for 34 players. Thus, combining these two columns is NOT working. 

In [64]:
finaldata = zip(player, pts_per_g)
finaldata

[('McNeeley,Clifton', '3.2'),
 ('Selbo,Glen', '8.1'),
 ('Ehlers,Bulbs', '10.5'),
 ('Dropo,Walt', '4.7'),
 ('Holub,Dick', '3.6'),
 ('Crossin,Chink', '5.2'),
 ('Underman,Jack', '1.5'),
 ('Huston,Paul', '13.0'),
 ("O'Keefe,Dick", '10.4'),
 ('Killick,Larry', '1.0'),
 ('Bolyard,Robert', '7.6'),
 ('Gainer,Elmer', '2.7'),
 ('Gallatin,Harry', '5.3'),
 ('Hamilton,Scotty', '8.1'),
 ('Hampton,Hugh', '9.1'),
 ('Jake,Bob', '13.2'),
 ('Raynor,Charles', '1.5'),
 ('Reiser,Chick', '0.8'),
 ('Rusinko,John', '8.3'),
 ('Alemeida,Bob', '5.5'),
 ('Biasatti,Hank', '2.3'),
 ('Ezersky,Johnny', '13.2'),
 ('Felt,George', '5.1'),
 ('Hewson,Jack', '3.6'),
 ('Kelly,John', '0.0'),
 ('Petrovick,George', '13.2'),
 ('Stump,Gene', '10.2'),
 ('Bishop,Ralph', '10.9'),
 ('Darden,Jimmy', '2.7'),
 ('Decker,Hank', '4.1'),
 ('Phillip,Andy', '7.7'),
 ('Pollard,Jim', '5.4'),
 ('Schadler,Ben', '2.3'),
 ('Smith,Don', '4.9')]

In [65]:
pts_per_g = data.xpath("//td[@data-stat='pts_per_g']")
for i in pts_per_g:
    print i.text_content()


3.2
8.1

10.5
4.7

3.6
5.2


1.5
13.0




10.4


1.0
7.6

2.7


5.3

8.1

9.1
13.2
1.5
0.8

8.3
5.5





2.3





13.2




5.1



3.6






0.0
13.2



10.2

10.9

2.7
4.1
7.7

5.4
2.3
4.9


In [66]:
pts_per_g = [ i.text_content() for i in pts_per_g ]
pts_per_g

['',
 '3.2',
 '8.1',
 '',
 '10.5',
 '4.7',
 '',
 '3.6',
 '5.2',
 '',
 '',
 '1.5',
 '13.0',
 '',
 '',
 '',
 '',
 '10.4',
 '',
 '',
 '1.0',
 '7.6',
 '',
 '2.7',
 '',
 '',
 '5.3',
 '',
 '8.1',
 '',
 '9.1',
 '13.2',
 '1.5',
 '0.8',
 '',
 '8.3',
 '5.5',
 '',
 '',
 '',
 '',
 '',
 '2.3',
 '',
 '',
 '',
 '',
 '',
 '13.2',
 '',
 '',
 '',
 '',
 '5.1',
 '',
 '',
 '',
 '3.6',
 '',
 '',
 '',
 '',
 '',
 '',
 '0.0',
 '13.2',
 '',
 '',
 '',
 '10.2',
 '',
 '10.9',
 '',
 '2.7',
 '4.1',
 '7.7',
 '',
 '5.4',
 '2.3',
 '4.9']

In [67]:
finaldata = zip(player, pts_per_g)
finaldata

[('McNeeley,Clifton', ''),
 ('Selbo,Glen', '3.2'),
 ('Ehlers,Bulbs', '8.1'),
 ('Dropo,Walt', ''),
 ('Holub,Dick', '10.5'),
 ('Crossin,Chink', '4.7'),
 ('Underman,Jack', ''),
 ('Huston,Paul', '3.6'),
 ("O'Keefe,Dick", '5.2'),
 ('Killick,Larry', ''),
 ('Bolyard,Robert', ''),
 ('Gainer,Elmer', '1.5'),
 ('Gallatin,Harry', '13.0'),
 ('Hamilton,Scotty', ''),
 ('Hampton,Hugh', ''),
 ('Jake,Bob', ''),
 ('Raynor,Charles', ''),
 ('Reiser,Chick', '10.4'),
 ('Rusinko,John', ''),
 ('Alemeida,Bob', ''),
 ('Biasatti,Hank', '1.0'),
 ('Ezersky,Johnny', '7.6'),
 ('Felt,George', ''),
 ('Hewson,Jack', '2.7'),
 ('Kelly,John', ''),
 ('Petrovick,George', ''),
 ('Stump,Gene', '5.3'),
 ('Bishop,Ralph', ''),
 ('Darden,Jimmy', '8.1'),
 ('Decker,Hank', ''),
 ('Phillip,Andy', '9.1'),
 ('Pollard,Jim', '13.2'),
 ('Schadler,Ben', '1.5'),
 ('Smith,Don', '0.8'),
 ('Stone,Jack', ''),
 ('Vance,Gene', '8.3'),
 ('Duncan,Andy', '5.5'),
 ('Evans,Ray', ''),
 ('Golub,Ed', ''),
 ('Head,Garland', ''),
 ('Livingston,Ron', ''),
 (

In [68]:
df = pd.DataFrame(finaldata)
df

Unnamed: 0,0,1
0,"McNeeley,Clifton",
1,"Selbo,Glen",3.2
2,"Ehlers,Bulbs",8.1
3,"Dropo,Walt",
4,"Holub,Dick",10.5
5,"Crossin,Chink",4.7
6,"Underman,Jack",
7,"Huston,Paul",3.6
8,"O'Keefe,Dick",5.2
9,"Killick,Larry",


# Recommended Reading (& Resources)

- http://docs.python-guide.org/en/latest/scenarios/scrape/
- http://lxml.de/xpathxslt.html
- http://www.w3schools.com/xml/xpath_intro.asp
- http://infohost.nmt.edu/tcc/help/pubs/pylxml/web/index.html

# Appendix: Web Crawling (Collecting data from multiple webpages) - MIS670 

### Example: http://sfbay.craigslist.org/search/npo?s=

In [69]:
a = "http://sfbay.craigslist.org/search/npo?s="
count = 0
for i in range(1,5):
    print '%s%s' %(a,count)
    count = count + 100    

http://sfbay.craigslist.org/search/npo?s=0
http://sfbay.craigslist.org/search/npo?s=100
http://sfbay.craigslist.org/search/npo?s=200
http://sfbay.craigslist.org/search/npo?s=300


In [70]:
a = "http://sfbay.craigslist.org/search/npo?s="
count = 0
for i in range(1,3):
    webpage = '%s%s' %(a,count)
    count = count + 100      
    r = requests.get(webpage)
    data = html.fromstring(r.text)
    
    # xpath: collect the specific data you're looking for
    title = data.xpath("//p[@class='result-info']/a/text()")
    url = data.xpath("//p[@class='result-info']/a/@href")
    
    print title, url

['JOB DEVELOPER', 'Defend Immigrant Rights on behalf of the ACLU! Earn $13-$17/hour', 'Executive Director', 'COMMUNITY WORK SUPERVISOR - PART TIME', 'Clinical Training and Quality Assurance (QA) Director', 'Supportive Living Attendant - South Bay Area', 'Case Manager III- SF Outpatient', 'Work for Amnesty International- Help Protect Human Rights! (Oakland)', 'Work for Amnesty International- Help Protect Human Rights! (Oakland)', 'Work for Amnesty International-Help Protect Human Rights! (San Jose)', u'\u25b6Gain Experience making social change\u25b6 STOP TRUMP! Up to $640/wk', 'Chapter 510 | Literary Arts Nonprofit | Development & Events Manager', 'Peer Mentor - Transitions', 'Parent Partner - VTP', 'Part-Time Mock Trial Coordinator', 'Mental Health Counselor - Avalon', 'Mental Health Counselor - On-Call Avalon', 'Mental Health Therapist - Transitions', 'Clinical Social Worker Avalon', 'Maintenance/Facility Technician', 'Director of The Music School', 'Become an Activist - GET PAID TO 

['Assistant Director - Progress Foundation La Posada', 'Program Director - Seniors- Progress Foundation', 'Assistant Director -- Shrader House', 'Assistant Director- Progress Sonoma', 'Housing Navigator -Multiple Positions Available #:HW04281701', 'Intensive Care Coordinator', 'Project Manager, CA Food is Medicine Pilot Project', 'Property Supervisor', 'Maintenance II - Great Benefits!', 'Make a Difference - Join the ESBA Team! -  Paid Training - ABA Therapy', 'Admissions Counselor', 'Hiring Immediately for Outreach Staff -- --  Speak for the Trees!', 'Licensed Vocational Nurse - FTE w/Benefits (New Crisis Residential)', 'Clinician I - New Crisis Residential', 'Licensed Psychiatric Technician - IOP-120 Outpatient (FTE w/Benefits)', 'Training Coordinator', u'\u25baGIRLS ROCK\u25c4 Women Warriors Needed to Help Fight HIV  $$', u'\u25baGIRLS ROCK\u25c4 Women Warriors Needed to Help Fight HIV  $$', 'Crisis Residential Specialist - New Crisis Residential', u'\u25baGIRLS ROCK\u25c4 Women War

In [71]:
# creating an empty list (or data storage)
finaldata = []

a = "http://sfbay.craigslist.org/search/npo?s="
count = 0
for i in range(1,3):
    webpage = '%s%s' %(a,count)
    count = count + 100      
    r = requests.get(webpage)
    data = html.fromstring(r.text)
    
    # xpath: collect the specific data you're looking for
    title = data.xpath("//p[@class='result-info']/a/text()")
    url = data.xpath("//p[@class='result-info']/a/@href")
    
    crawleddata = zip(title, url)
    
    for row in crawleddata:
        finaldata.append(row)    

finaldata = pd.DataFrame(finaldata)       
finaldata.to_csv("data/output_craiglist.csv", encoding="utf-8")

IOError: [Errno 2] No such file or directory: 'data/output_craiglist.csv'

### Example: http://econpy.pythonanywhere.com/ex/001

In [None]:
# creating an empty list (or data storage)
finaldata = []

a = "http://econpy.pythonanywhere.com/ex/00"
count = 1
b = ".html"
for i in range(1,5):
    webpage = '%s%s%s' %(a,count,b)
    count = count + 1      
    r = requests.get(webpage)
    data = html.fromstring(r.text)
    
    #create a list of buyers:
    buyers = data.xpath('//div[@title="buyer-name"]/text()')
    #create a list of prices
    prices = data.xpath('//span[@class="item-price"]/text()')

    crawleddata = zip(buyers, prices)
    
    for row in crawleddata:
        finaldata.append(row)    

finaldata = pd.DataFrame(finaldata)       
finaldata.to_csv("data/output_buyersprices.csv", encoding="utf-8")

### Example: http://www.basketball-reference.com/draft/BAA_1947.html

In [72]:
# creating an empty list (or data storage)
finaldata = []

a = "http://www.basketball-reference.com/draft/BAA_"
count = 1947
b = ".html"
for i in range(1,5):
    webpage = '%s%s%s' %(a,count,b)
    count = count + 1      
    r = requests.get(webpage)
    data = html.fromstring(r.text)
    
    player = data.xpath("//td[@data-stat='player']/@csk")
    pts_per_g = data.xpath("//td[@data-stat='pts_per_g']")
    pts_per_g = [ i.text_content() for i in pts_per_g ]

    crawleddata = zip(player, pts_per_g)
    
    for row in crawleddata:
        finaldata.append(row)    

finaldata = pd.DataFrame(finaldata)       
finaldata.to_csv("data/output_basketball.csv", encoding="utf-8")

IOError: [Errno 2] No such file or directory: 'data/output_basketball.csv'