# Scraping with Pandas

In [1]:
import pandas as pd

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [57]:
url = 'https://animalcrossing.fandom.com/wiki/Bugs_(New_Horizons)'

In [58]:
tables = pd.read_html(url)
tables[3]

Unnamed: 0,Name,Image,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Common butterfly,,160,Flying,4 AM - 7 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
1,Yellow butterfly,,160,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,✓,✓,-,-
2,Tiger butterfly,,240,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,✓,✓,✓,-,-,-
3,Peacock butterfly,,2500,Flying by Hybrid Flowers,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,-,-,-,-
4,Common bluebottle,,300,Flying,4 AM - 7 PM,-,-,-,✓,✓,✓,✓,✓,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Pill bug,,250,Hitting Rocks,11 PM - 4 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
76,Centipede,,300,Hitting Rocks,4 PM - 11 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
77,Spider,,600,Shaking Trees,7 PM - 8 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
78,Tarantula,,8000,On the Ground,7 PM - 4 AM,✓,✓,✓,✓,-,-,-,-,-,-,✓,✓


What we get in return is a list of dataframes for any tabular data that Pandas found.

In [59]:
type(tables)

list

We can slice off any of those dataframes that we want using normal indexing.

In [60]:
df = tables[3]
del df['Image']
df

Unnamed: 0,Name,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Common butterfly,160,Flying,4 AM - 7 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
1,Yellow butterfly,160,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,✓,✓,-,-
2,Tiger butterfly,240,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,✓,✓,✓,-,-,-
3,Peacock butterfly,2500,Flying by Hybrid Flowers,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,-,-,-,-
4,Common bluebottle,300,Flying,4 AM - 7 PM,-,-,-,✓,✓,✓,✓,✓,-,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Pill bug,250,Hitting Rocks,11 PM - 4 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
76,Centipede,300,Hitting Rocks,4 PM - 11 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
77,Spider,600,Shaking Trees,7 PM - 8 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
78,Tarantula,8000,On the Ground,7 PM - 4 AM,✓,✓,✓,✓,-,-,-,-,-,-,✓,✓


Set the index to the `State` column

In [61]:
df.set_index('Name', inplace=True)
df.head()

Unnamed: 0_level_0,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Common butterfly,160,Flying,4 AM - 7 PM,✓,✓,✓,✓,✓,✓,-,-,✓,✓,✓,✓
Yellow butterfly,160,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,✓,✓,-,-
Tiger butterfly,240,Flying,4 AM - 7 PM,-,-,✓,✓,✓,✓,✓,✓,✓,-,-,-
Peacock butterfly,2500,Flying by Hybrid Flowers,4 AM - 7 PM,-,-,✓,✓,✓,✓,-,-,-,-,-,-
Common bluebottle,300,Flying,4 AM - 7 PM,-,-,-,✓,✓,✓,✓,✓,-,-,-,-


In [None]:
# given a string, seperate into two columns (start and end), 

In [62]:
for i in range(len(df)):
    df["Jan"] = df["Jan"].replace("✓", True)
    df["Feb"] = df["Feb"].replace("✓", True)
    df["Mar"] = df["Mar"].replace("✓", True)
    df["Apr"] = df["Apr"].replace("✓", True)
    df["May"] = df["May"].replace("✓", True)
    df["Jun"] = df["Jun"].replace("✓", True)
    df["Jul"] = df["Jul"].replace("✓", True)
    df["Aug"] = df["Aug"].replace("✓", True)
    df["Sep"] = df["Sep"].replace("✓", True)
    df["Oct"] = df["Oct"].replace("✓", True)
    df["Nov"] = df["Nov"].replace("✓", True)
    df["Dec"] = df["Dec"].replace("✓", True)
    
    df["Jan"] = df["Jan"].replace("-", False)
    df["Feb"] = df["Feb"].replace("-", False)
    df["Mar"] = df["Mar"].replace("-", False)
    df["Apr"] = df["Apr"].replace("-", False)
    df["May"] = df["May"].replace("-", False)
    df["Jun"] = df["Jun"].replace("-", False)
    df["Jul"] = df["Jul"].replace("-", False)
    df["Aug"] = df["Aug"].replace("-", False)
    df["Sep"] = df["Sep"].replace("-", False)
    df["Oct"] = df["Oct"].replace("-", False)
    df["Nov"] = df["Nov"].replace("-", False)
    df["Dec"] = df["Dec"].replace("-", False)
    

In [63]:
df

Unnamed: 0_level_0,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Common butterfly,160,Flying,4 AM - 7 PM,True,True,True,True,True,True,False,False,True,True,True,True
Yellow butterfly,160,Flying,4 AM - 7 PM,False,False,True,True,True,True,False,False,True,True,False,False
Tiger butterfly,240,Flying,4 AM - 7 PM,False,False,True,True,True,True,True,True,True,False,False,False
Peacock butterfly,2500,Flying by Hybrid Flowers,4 AM - 7 PM,False,False,True,True,True,True,False,False,False,False,False,False
Common bluebottle,300,Flying,4 AM - 7 PM,False,False,False,True,True,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pill bug,250,Hitting Rocks,11 PM - 4 PM,True,True,True,True,True,True,False,False,True,True,True,True
Centipede,300,Hitting Rocks,4 PM - 11 PM,True,True,True,True,True,True,False,False,True,True,True,True
Spider,600,Shaking Trees,7 PM - 8 AM,True,True,True,True,True,True,True,True,True,True,True,True
Tarantula,8000,On the Ground,7 PM - 4 AM,True,True,True,True,False,False,False,False,False,False,True,True


In [65]:
df['Time'] = df.Time.replace("All day","12 AM - 11 PM")
df

Unnamed: 0_level_0,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Common butterfly,160,Flying,4 AM - 7 PM,True,True,True,True,True,True,False,False,True,True,True,True
Yellow butterfly,160,Flying,4 AM - 7 PM,False,False,True,True,True,True,False,False,True,True,False,False
Tiger butterfly,240,Flying,4 AM - 7 PM,False,False,True,True,True,True,True,True,True,False,False,False
Peacock butterfly,2500,Flying by Hybrid Flowers,4 AM - 7 PM,False,False,True,True,True,True,False,False,False,False,False,False
Common bluebottle,300,Flying,4 AM - 7 PM,False,False,False,True,True,True,True,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pill bug,250,Hitting Rocks,11 PM - 4 PM,True,True,True,True,True,True,False,False,True,True,True,True
Centipede,300,Hitting Rocks,4 PM - 11 PM,True,True,True,True,True,True,False,False,True,True,True,True
Spider,600,Shaking Trees,7 PM - 8 AM,True,True,True,True,True,True,True,True,True,True,True,True
Tarantula,8000,On the Ground,7 PM - 4 AM,True,True,True,True,False,False,False,False,False,False,True,True


In [34]:
df.Time.unique()

array(['4 AM - 7 PM', '8 AM - 7 PM', '4 AM - 5 PM', '5 PM - 8 AM',
       '8 AM - 5 PM', '8 AM - 4 PM', '7 PM - 4 AM', '12 AM - 11 PM',
       '4 AM - 8 AM & 4 PM - 7 PM', '7 PM - 8 AM', '11 PM - 8 AM',
       '4 AM - 8 AM & 5 PM - 7 PM', '5 PM - 4 AM', '11 PM - 4 PM',
       '4 PM - 11 PM'], dtype=object)

In [37]:

# startampm = df.Time.str.split()[75][1]
for i in range(len(df)):
    if df["Time"].str.split()[i][1] == 'AM':
        df['start_time'] = (df.Time.str.split()[i][0])
    else:
        df['start_time'] = (int(df.Time.str.split()[i][0]) + 12)

df

Unnamed: 0_level_0,Image,Price,Location,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec,start_time
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Common butterfly,,160,Flying,4 AM - 7 PM,True,True,True,True,True,True,False,False,True,True,True,True,19
Yellow butterfly,,160,Flying,4 AM - 7 PM,False,False,True,True,True,True,False,False,True,True,False,False,19
Tiger butterfly,,240,Flying,4 AM - 7 PM,False,False,True,True,True,True,True,True,True,False,False,False,19
Peacock butterfly,,2500,Flying by Hybrid Flowers,4 AM - 7 PM,False,False,True,True,True,True,False,False,False,False,False,False,19
Common bluebottle,,300,Flying,4 AM - 7 PM,False,False,False,True,True,True,True,True,False,False,False,False,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Pill bug,,250,Hitting Rocks,11 PM - 4 PM,True,True,True,True,True,True,False,False,True,True,True,True,19
Centipede,,300,Hitting Rocks,4 PM - 11 PM,True,True,True,True,True,True,False,False,True,True,True,True,19
Spider,,600,Shaking Trees,7 PM - 8 AM,True,True,True,True,True,True,True,True,True,True,True,True,19
Tarantula,,8000,On the Ground,7 PM - 4 AM,True,True,True,True,False,False,False,False,False,False,True,True,19


## DataFrames as HTML

Pandas also had a `to_html` method that we can use to generate HTML tables from DataFrames.

In [None]:
html_table = df.to_html()
html_table

You may have to strip unwanted newlines to clean up the table.

In [None]:
html_table.replace('\n', '')

You can also save the table directly to a file.

In [None]:
df.to_html('fish_data.html')

In [None]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html