# Scraping with Pandas

In [1]:
import pandas as pd

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [11]:
url = 'https://animalcrossing.fandom.com/wiki/Fish_(New_Horizons)'

In [12]:
tables = pd.read_html(url)
tables[2]

Unnamed: 0,Name,Image,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Bitterling,,900,River,1,All day,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
1,Pale chub,,200,River,1,9 AM - 4 PM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
2,Crucian carp,,160,River,2,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
3,Dace,,240,River,3,4 PM - 9 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
4,Carp,,300,Pond,4,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Suckerfish,,1500,Sea,4 (Fin),All day,-,-,-,-,-,✓,✓,✓,✓,-,-,-
76,Football fish,,2500,Sea,4,4 PM - 9 AM,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
77,Oarfish,,9000,Sea,6,All day,✓,✓,✓,✓,✓,-,-,-,-,-,-,✓
78,Barreleye,,15000,Sea,2,9 PM - 4 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓


What we get in return is a list of dataframes for any tabular data that Pandas found.

In [13]:
type(tables)

list

We can slice off any of those dataframes that we want using normal indexing.

In [14]:
df = tables[2]
del df["Image"]
df

Unnamed: 0,Name,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Bitterling,900,River,1,All day,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
1,Pale chub,200,River,1,9 AM - 4 PM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
2,Crucian carp,160,River,2,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
3,Dace,240,River,3,4 PM - 9 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
4,Carp,300,Pond,4,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Suckerfish,1500,Sea,4 (Fin),All day,-,-,-,-,-,✓,✓,✓,✓,-,-,-
76,Football fish,2500,Sea,4,4 PM - 9 AM,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
77,Oarfish,9000,Sea,6,All day,✓,✓,✓,✓,✓,-,-,-,-,-,-,✓
78,Barreleye,15000,Sea,2,9 PM - 4 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓


Set the index to the `State` column

In [15]:
df.set_index('Name', inplace=True)
df.head()

Unnamed: 0_level_0,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Bitterling,900,River,1,All day,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
Pale chub,200,River,1,9 AM - 4 PM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
Crucian carp,160,River,2,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
Dace,240,River,3,4 PM - 9 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
Carp,300,Pond,4,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓


## Changing plus and minus to bool

In [16]:
for i in range(len(df)):
    df["Jan"] = df["Jan"].replace("✓", True)
    df["Feb"] = df["Feb"].replace("✓", True)
    df["Mar"] = df["Mar"].replace("✓", True)
    df["Apr"] = df["Apr"].replace("✓", True)
    df["May"] = df["May"].replace("✓", True)
    df["Jun"] = df["Jun"].replace("✓", True)
    df["Jul"] = df["Jul"].replace("✓", True)
    df["Aug"] = df["Aug"].replace("✓", True)
    df["Sep"] = df["Sep"].replace("✓", True)
    df["Oct"] = df["Oct"].replace("✓", True)
    df["Nov"] = df["Nov"].replace("✓", True)
    df["Dec"] = df["Dec"].replace("✓", True)
    
    df["Jan"] = df["Jan"].replace("-", False)
    df["Feb"] = df["Feb"].replace("-", False)
    df["Mar"] = df["Mar"].replace("-", False)
    df["Apr"] = df["Apr"].replace("-", False)
    df["May"] = df["May"].replace("-", False)
    df["Jun"] = df["Jun"].replace("-", False)
    df["Jul"] = df["Jul"].replace("-", False)
    df["Aug"] = df["Aug"].replace("-", False)
    df["Sep"] = df["Sep"].replace("-", False)
    df["Oct"] = df["Oct"].replace("-", False)
    df["Nov"] = df["Nov"].replace("-", False)
    df["Dec"] = df["Dec"].replace("-", False)

df

Unnamed: 0_level_0,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Bitterling,900,River,1,All day,True,True,True,False,False,False,False,False,False,False,True,True
Pale chub,200,River,1,9 AM - 4 PM,True,True,True,True,True,True,True,True,True,True,True,True
Crucian carp,160,River,2,All day,True,True,True,True,True,True,True,True,True,True,True,True
Dace,240,River,3,4 PM - 9 AM,True,True,True,True,True,True,True,True,True,True,True,True
Carp,300,Pond,4,All day,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Suckerfish,1500,Sea,4 (Fin),All day,False,False,False,False,False,True,True,True,True,False,False,False
Football fish,2500,Sea,4,4 PM - 9 AM,True,True,True,False,False,False,False,False,False,False,True,True
Oarfish,9000,Sea,6,All day,True,True,True,True,True,False,False,False,False,False,False,True
Barreleye,15000,Sea,2,9 PM - 4 AM,True,True,True,True,True,True,True,True,True,True,True,True


## DataFrames as HTML

Pandas also had a `to_html` method that we can use to generate HTML tables from DataFrames.

In [11]:
html_table = df.to_html()
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Image</th>\n      <th>Price</th>\n      <th>Location</th>\n      <th>Shadow Size</th>\n      <th>Time</th>\n      <th>Jan</th>\n      <th>Feb</th>\n      <th>Mar</th>\n      <th>Apr</th>\n      <th>May</th>\n      <th>Jun</th>\n      <th>Jul</th>\n      <th>Aug</th>\n      <th>Sep</th>\n      <th>Oct</th>\n      <th>Nov</th>\n      <th>Dec</th>\n    </tr>\n    <tr>\n      <th>Name</th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n      <th></th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>Bitterling</th>\n      <td>NaN</td>\n      <td>900</td>\n      <td>River</td>\n      <td>1</td>\n      <td>All day</td>\n      <td>✓</td>\n      <td>✓</t

You may have to strip unwanted newlines to clean up the table.

In [12]:
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Image</th>      <th>Price</th>      <th>Location</th>      <th>Shadow Size</th>      <th>Time</th>      <th>Jan</th>      <th>Feb</th>      <th>Mar</th>      <th>Apr</th>      <th>May</th>      <th>Jun</th>      <th>Jul</th>      <th>Aug</th>      <th>Sep</th>      <th>Oct</th>      <th>Nov</th>      <th>Dec</th>    </tr>    <tr>      <th>Name</th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>      <th></th>    </tr>  </thead>  <tbody>    <tr>      <th>Bitterling</th>      <td>NaN</td>      <td>900</td>      <td>River</td>      <td>1</td>      <td>All day</td>      <td>✓</td>      <td>✓</td>      <td>✓</td>      <td>-</td>      <td>-</td>      <td>-</td>      <td>-</td>      <td>-</td>      

You can also save the table directly to a file.

In [13]:
df.to_html('fish_data.html')

UnicodeEncodeError: 'charmap' codec can't encode character '\u2713' in position 977: character maps to <undefined>

In [None]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html