In [1]:
import pandas as pd

# Scraping with Pandas

We can use the `read_html` function in Pandas to automatically scrape any tabular data from a page.

In [2]:
# Page to scrape: the "Fish (New Horizons)" article on the Animal Crossing fandom wiki
url = 'https://animalcrossing.fandom.com/wiki/Fish_(New_Horizons)'

In [3]:
# Scrape every HTML table found at the url into a list of DataFrames
tables = pd.read_html(io=url)
# Show the table at index 2, which is the one used in the rest of the notebook
tables[2]

Unnamed: 0,Name,Image,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Bitterling,,900,River,1,All day,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
1,Pale chub,,200,River,1,9 AM - 4 PM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
2,Crucian carp,,160,River,2,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
3,Dace,,240,River,3,4 PM - 9 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
4,Carp,,300,Pond,4,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Suckerfish,,1500,Sea,4 (Fin),All day,-,-,-,-,-,✓,✓,✓,✓,-,-,-
76,Football fish,,2500,Sea,4,4 PM - 9 AM,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
77,Oarfish,,9000,Sea,6,All day,✓,✓,✓,✓,✓,-,-,-,-,-,-,✓
78,Barreleye,,15000,Sea,2,9 PM - 4 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓


# Data Transformation

In [4]:
# Storing the third table from the url (index 2) in a df, without the image column.
# drop() returns a new DataFrame, so the scraped table held in `tables` is left
# untouched and this cell can be re-run without raising a KeyError on "Image".
df = tables[2].drop(columns=["Image"])
# displaying the df
df

Unnamed: 0,Name,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Bitterling,900,River,1,All day,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
1,Pale chub,200,River,1,9 AM - 4 PM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
2,Crucian carp,160,River,2,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
3,Dace,240,River,3,4 PM - 9 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
4,Carp,300,Pond,4,All day,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Suckerfish,1500,Sea,4 (Fin),All day,-,-,-,-,-,✓,✓,✓,✓,-,-,-
76,Football fish,2500,Sea,4,4 PM - 9 AM,✓,✓,✓,-,-,-,-,-,-,-,✓,✓
77,Oarfish,9000,Sea,6,All day,✓,✓,✓,✓,✓,-,-,-,-,-,-,✓
78,Barreleye,15000,Sea,2,9 PM - 4 AM,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓,✓


In [5]:
# Month columns that hold the availability marks
month_columns = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                 "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

# Replacing check marks with True and minus signs with False.
# One mapping applied in a loop replaces twelve copy-pasted statements,
# so the marks only need to be changed in a single place.
availability_flags = {"✓": True, "-": False}
for month in month_columns:
    df[month] = df[month].replace(availability_flags)

# Capitalizing the name column
df["Name"] = df["Name"].str.title()

# Displaying df
df

Unnamed: 0,Name,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
0,Bitterling,900,River,1,All day,True,True,True,False,False,False,False,False,False,False,True,True
1,Pale Chub,200,River,1,9 AM - 4 PM,True,True,True,True,True,True,True,True,True,True,True,True
2,Crucian Carp,160,River,2,All day,True,True,True,True,True,True,True,True,True,True,True,True
3,Dace,240,River,3,4 PM - 9 AM,True,True,True,True,True,True,True,True,True,True,True,True
4,Carp,300,Pond,4,All day,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,Suckerfish,1500,Sea,4 (Fin),All day,False,False,False,False,False,True,True,True,True,False,False,False
76,Football Fish,2500,Sea,4,4 PM - 9 AM,True,True,True,False,False,False,False,False,False,False,True,True
77,Oarfish,9000,Sea,6,All day,True,True,True,True,True,False,False,False,False,False,False,True
78,Barreleye,15000,Sea,2,9 PM - 4 AM,True,True,True,True,True,True,True,True,True,True,True,True


## Fish ID df

In [6]:
# Path of the ID csv
filepath = "ids.csv"

# Load the whole ID table into a df
id_df = pd.read_csv(filepath)

# Keep only the rows whose Type is "Fish"
is_fish = id_df['Type'] == 'Fish'
fish_id = id_df[is_fish]

## Merging ID and fish df


In [7]:
# Only the ID and Name columns are needed from the id df
id_lookup = fish_id[["ID", "Name"]]

# Left-join the IDs onto the fish df by Name (every fish row is kept),
# then use the ID as the index
fish_id_merge = df.merge(id_lookup, how='left', on='Name').set_index("ID")

# Displaying the df
fish_id_merge

Unnamed: 0_level_0,Name,Price,Location,Shadow size,Time,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
F01,Bitterling,900,River,1,All day,True,True,True,False,False,False,False,False,False,False,True,True
F02,Pale Chub,200,River,1,9 AM - 4 PM,True,True,True,True,True,True,True,True,True,True,True,True
F03,Crucian Carp,160,River,2,All day,True,True,True,True,True,True,True,True,True,True,True,True
F04,Dace,240,River,3,4 PM - 9 AM,True,True,True,True,True,True,True,True,True,True,True,True
F05,Carp,300,Pond,4,All day,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
F76,Suckerfish,1500,Sea,4 (Fin),All day,False,False,False,False,False,True,True,True,True,False,False,False
F77,Football Fish,2500,Sea,4,4 PM - 9 AM,True,True,True,False,False,False,False,False,False,False,True,True
F78,Oarfish,9000,Sea,6,All day,True,True,True,True,True,False,False,False,False,False,False,True
F79,Barreleye,15000,Sea,2,9 PM - 4 AM,True,True,True,True,True,True,True,True,True,True,True,True


## Splitting dataframe

In [15]:
# Creating a df for the fish.
# Columns are selected by name rather than by position, so the split does not
# silently pick the wrong columns if the column order ever changes.
fish_df = fish_id_merge[["Name", "Price", "Location", "Shadow size"]]

# Displaying the fish df
fish_df

Unnamed: 0_level_0,Name,Price,Location,Shadow size
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F01,Bitterling,900,River,1
F02,Pale Chub,200,River,1
F03,Crucian Carp,160,River,2
F04,Dace,240,River,3
F05,Carp,300,Pond,4
...,...,...,...,...
F76,Suckerfish,1500,Sea,4 (Fin)
F77,Football Fish,2500,Sea,4
F78,Oarfish,9000,Sea,6
F79,Barreleye,15000,Sea,2


In [9]:
# Creating a df for the months fish are available.
# The month columns are selected by name rather than by position, so the split
# does not depend on how many columns precede them.
fish_months_df = fish_id_merge[["Jan", "Feb", "Mar", "Apr", "May", "Jun",
                                "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]]

# Displaying fish months df
fish_months_df

Unnamed: 0_level_0,Jan,Feb,Mar,Apr,May,Jun,Jul,Aug,Sep,Oct,Nov,Dec
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
F01,True,True,True,False,False,False,False,False,False,False,True,True
F02,True,True,True,True,True,True,True,True,True,True,True,True
F03,True,True,True,True,True,True,True,True,True,True,True,True
F04,True,True,True,True,True,True,True,True,True,True,True,True
F05,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...
F76,False,False,False,False,False,True,True,True,True,False,False,False
F77,True,True,True,False,False,False,False,False,False,False,True,True
F78,True,True,True,True,True,False,False,False,False,False,False,True
F79,True,True,True,True,True,True,True,True,True,True,True,True


In [14]:
# Creating a df for the months fish are available
fish_time_df = fish_id_merge.iloc[:,4:5]

# Displaying fish months df
fish_time_df

Unnamed: 0_level_0,Time
ID,Unnamed: 1_level_1
F01,All day
F02,9 AM - 4 PM
F03,All day
F04,4 PM - 9 AM
F05,All day
...,...
F76,All day
F77,4 PM - 9 AM
F78,All day
F79,9 PM - 4 AM


## DataFrames as HTML

Pandas also has a `to_html` method that we can use to generate HTML tables from DataFrames.

In [10]:
# Render the df as an HTML <table> string
html_table = df.to_html()
# Display the raw HTML string
html_table

'<table border="1" class="dataframe">\n  <thead>\n    <tr style="text-align: right;">\n      <th></th>\n      <th>Name</th>\n      <th>Price</th>\n      <th>Location</th>\n      <th>Shadow size</th>\n      <th>Time</th>\n      <th>Jan</th>\n      <th>Feb</th>\n      <th>Mar</th>\n      <th>Apr</th>\n      <th>May</th>\n      <th>Jun</th>\n      <th>Jul</th>\n      <th>Aug</th>\n      <th>Sep</th>\n      <th>Oct</th>\n      <th>Nov</th>\n      <th>Dec</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>Bitterling</td>\n      <td>900</td>\n      <td>River</td>\n      <td>1</td>\n      <td>All day</td>\n      <td>True</td>\n      <td>True</td>\n      <td>True</td>\n      <td>False</td>\n      <td>False</td>\n      <td>False</td>\n      <td>False</td>\n      <td>False</td>\n      <td>False</td>\n      <td>False</td>\n      <td>True</td>\n      <td>True</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>Pale Chub</td>\n      <td>200</td>\n      <td>River</td>\n

You may have to strip unwanted newlines to clean up the table.

In [11]:
# Show the HTML with the newline characters removed.
# The result is only displayed here; html_table itself is not reassigned.
html_table.replace('\n', '')

'<table border="1" class="dataframe">  <thead>    <tr style="text-align: right;">      <th></th>      <th>Name</th>      <th>Price</th>      <th>Location</th>      <th>Shadow size</th>      <th>Time</th>      <th>Jan</th>      <th>Feb</th>      <th>Mar</th>      <th>Apr</th>      <th>May</th>      <th>Jun</th>      <th>Jul</th>      <th>Aug</th>      <th>Sep</th>      <th>Oct</th>      <th>Nov</th>      <th>Dec</th>    </tr>  </thead>  <tbody>    <tr>      <th>0</th>      <td>Bitterling</td>      <td>900</td>      <td>River</td>      <td>1</td>      <td>All day</td>      <td>True</td>      <td>True</td>      <td>True</td>      <td>False</td>      <td>False</td>      <td>False</td>      <td>False</td>      <td>False</td>      <td>False</td>      <td>False</td>      <td>True</td>      <td>True</td>    </tr>    <tr>      <th>1</th>      <td>Pale Chub</td>      <td>200</td>      <td>River</td>      <td>1</td>      <td>9 AM - 4 PM</td>      <td>True</td>      <td>True</td>      <td>True</td

You can also save the table directly to a file.

In [12]:
# Passing a path writes the HTML table to that file (fish_data.html)
df.to_html('fish_data.html')

In [13]:
# OSX Users can run this to open the file in a browser, 
# or you can manually find the file and open it in the browser
!open table.html

The file /Users/jeremybrent/Documents/Rutgers/Projects/etl_project/table.html does not exist.
