Reading HTML Tables
    - Read and parse HTML tables from websites into a list of DataFrame objects to work with

In [50]:
!pip3 install lxml



In [51]:
import pandas as pd

Parsing raw HTML strings 
    - read_html(): useful panda method 
        - reads html tables from given URL, file like object, or raw string containing html 
        - returns a list of DataFrame objects

In [52]:
# read following html_string into a DataFrame

html_string = """
<table>
    <thead>
      <tr>
        <th>Order date</th>
        <th>Region</th> 
        <th>Item</th>
        <th>Units</th>
        <th>Unit cost</th>
      </tr>
    </thead>
    <tbody>
      <tr>
        <td>1/6/2018</td>
        <td>East</td> 
        <td>Pencil</td>
        <td>95</td>
        <td>1.99</td>
      </tr>
      <tr>
        <td>1/23/2018</td>
        <td>Central</td> 
        <td>Binder</td>
        <td>50</td>
        <td>19.99</td>
      </tr>
      <tr>
        <td>2/9/2018</td>
        <td>Central</td> 
        <td>Pencil</td>
        <td>36</td>
        <td>4.99</td>
      </tr>
      <tr>
        <td>3/15/2018</td>
        <td>West</td> 
        <td>Pen</td>
        <td>27</td>
        <td>19.99</td>
      </tr>
    </tbody>
</table>
"""

In [53]:
dfs = pd.read_html(html_string)

  dfs = pd.read_html(html_string)


In [54]:
# read_html returned one DataFrame object:

len(dfs)

1

In [55]:
df = dfs[0]

df

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


In [56]:
# Previous DataFrame looks similar to the raw HTML table
# now have a DataFrame object -> can apply pandas operation

df.shape

(4, 5)

In [57]:
df.loc[df['Region'] == 'Central']

# location Regions that are Central

Unnamed: 0,Order date,Region,Item,Units,Unit cost
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


In [58]:
df.loc[df['Units'] > 35]

Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99


Defining Header
    - pandas automatically finds the header thanks to tag 
    - but often will find wrong or incomplete tables
        - > make the read_html method parse the tables in a wrong way without the proper headers

In [59]:
#  fix them we can use the header parameter.

html_string = """
<table>
  <tr>
    <td>Order date</td>
    <td>Region</td> 
    <td>Item</td>
    <td>Units</td>
    <td>Unit cost</td>
  </tr>
  <tr>
    <td>1/6/2018</td>
    <td>East</td> 
    <td>Pencil</td>
    <td>95</td>
    <td>1.99</td>
  </tr>
  <tr>
    <td>1/23/2018</td>
    <td>Central</td> 
    <td>Binder</td>
    <td>50</td>
    <td>19.99</td>
  </tr>
  <tr>
    <td>2/9/2018</td>
    <td>Central</td> 
    <td>Pencil</td>
    <td>36</td>
    <td>4.99</td>
  </tr>
  <tr>
    <td>3/15/2018</td>
    <td>West</td> 
    <td>Pen</td>
    <td>27</td>
    <td>19.99</td>
  </tr>
</table>
"""

In [60]:
pd.read_html(html_string)[0]

  pd.read_html(html_string)[0]


Unnamed: 0,0,1,2,3,4
0,Order date,Region,Item,Units,Unit cost
1,1/6/2018,East,Pencil,95,1.99
2,1/23/2018,Central,Binder,50,19.99
3,2/9/2018,Central,Pencil,36,4.99
4,3/15/2018,West,Pen,27,19.99


In [61]:
# pass the row number to use as header using the header parameter

pd.read_html(html_string, header=0)[0]

  pd.read_html(html_string, header=0)[0]


Unnamed: 0,Order date,Region,Item,Units,Unit cost
0,1/6/2018,East,Pencil,95,1.99
1,1/23/2018,Central,Binder,50,19.99
2,2/9/2018,Central,Pencil,36,4.99
3,3/15/2018,West,Pen,27,19.99


Parsing HTML tables from the web 
    - Parse html tables directly from an URL
    - read_html method with URL as parameter

In [62]:
# simple example
html_url = "https://www.basketball-reference.com/leagues/NBA_2019_per_game.html"

In [63]:
nba_tables = pd.read_html(html_url)

In [64]:
len(nba_tables)


1

In [65]:
# work with the only one table found:

nba = nba_tables[0]

In [66]:
nba.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Álex Abrines,SG,25,OKC,31,2,19.0,1.8,5.1,...,0.923,0.2,1.4,1.5,0.6,0.5,0.2,0.5,1.7,5.3
1,2,Quincy Acy,PF,28,PHO,10,0,12.3,0.4,1.8,...,0.7,0.3,2.2,2.5,0.8,0.1,0.4,0.4,2.4,1.7
2,3,Jaylen Adams,PG,22,ATL,34,1,12.6,1.1,3.2,...,0.778,0.3,1.4,1.8,1.9,0.4,0.1,0.8,1.3,3.2
3,4,Steven Adams,C,25,OKC,80,80,33.4,6.0,10.1,...,0.5,4.9,4.6,9.5,1.6,1.5,1.0,1.7,2.6,13.9
4,5,Bam Adebayo,C,21,MIA,82,28,23.3,3.4,5.9,...,0.735,2.0,5.3,7.3,2.2,0.9,0.8,1.5,2.5,8.9


In [67]:
# Complex example: 
# use requests module to get HTML code from URL to parse into DataFrame objects

# given URL has multiple tables about The Simpsons TV show

# goal: keep the table with information about each season
import requests

html_url = "https://en.wikipedia.org/wiki/The_Simpsons"

In [68]:
r = requests.get(html_url)

wiki_tables = pd.read_html(r.text, header=0)

  wiki_tables = pd.read_html(r.text, header=0)


In [69]:
len(wiki_tables)

50

In [70]:
simpsons = wiki_tables[1]

In [74]:
simpsons.head()

Unnamed: 0,Cast members,Cast members.1,Cast members.2,Cast members.3,Cast members.4,Cast members.5,Cast members.6,Cast members.7,Cast members.8
2,"Homer Simpson, Abe Simpson, Krusty the Clown, ...","Marge Simpson, Patty and Selma Bouvier, additi...","Bart Simpson, Maggie Simpson, various characters",Lisa Simpson,"Moe Szyslak, Chief Wiggum, Apu Nahasapeemapeti...","Ned Flanders, Mr. Burns, Dr. Hibbert (1990–202...",,,


In [72]:
simpsons.drop([0, 1], inplace=True)

#simpsons: This is assumed to be a Pandas DataFrame that contains data.

# .drop([0, 1], inplace=True): This line of code calls the drop() method on the simpsons DataFrame
# drop():  remove rows or columns from a DataFrame
# [0, 1]: specifies to drop rows with index 0 and 1 from the DataFrame
#         -> These are the row labels.
# inplace=True: means DataFrame is modified in-place
#         -> rows with index 0 and 1 are removed from original simpsons DataFrame



In [None]:
simpsons.set_index('Season', inplace=True)


Save to CSV file

In [75]:
simpsons.head()

Unnamed: 0,Cast members,Cast members.1,Cast members.2,Cast members.3,Cast members.4,Cast members.5,Cast members.6,Cast members.7,Cast members.8
2,"Homer Simpson, Abe Simpson, Krusty the Clown, ...","Marge Simpson, Patty and Selma Bouvier, additi...","Bart Simpson, Maggie Simpson, various characters",Lisa Simpson,"Moe Szyslak, Chief Wiggum, Apu Nahasapeemapeti...","Ned Flanders, Mr. Burns, Dr. Hibbert (1990–202...",,,


In [76]:
simpsons.to_csv('out.csv')

In [None]:
pd.read_csv('out.csv', index_col="Season").head()