# Caspar's notebook

Scrape Pokemon names, sprite image URLs, and types from the pokemondb.net evolution page.

In [35]:
# ! pip install beautifulsoup4 requests pandas


In [37]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# URL to scrape
url = 'https://pokemondb.net/evolution'

# Send a GET request to the URL. An explicit timeout makes a stalled
# connection raise instead of blocking the kernel indefinitely.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    soup = BeautifulSoup(response.content, 'html.parser')
    infocard_lists = soup.find_all(class_='infocard-list-evo')

    # One dict per infocard; converted to a DataFrame in a single step below.
    data = []

    # Walk every evolution list, then every infocard inside it.
    for infocard_list in infocard_lists:
        for pokemon in infocard_list.find_all(class_='infocard'):
            # Image URL = first whitespace-separated token of the <source>
            # tag's srcset. A missing tag, missing attribute, or empty srcset
            # all yield None rather than raising.
            source_tag = pokemon.find('source')
            srcset_parts = source_tag.get('srcset', '').split() if source_tag else []
            image_url = srcset_parts[0] if srcset_parts else None

            # Infocards without an 'ent-name' link are recorded as "Unknown".
            # NOTE(review): these are presumably non-Pokemon cards such as the
            # evolution arrows — confirm; later cells filter them out by name.
            name_tag = pokemon.find('a', class_='ent-name')
            name = name_tag.text if name_tag else "Unknown"

            # find_all returns an empty result when nothing matches, so the
            # comprehension already produces [] for cards without types.
            types = [type_tag.text for type_tag in pokemon.find_all(class_='itype')]

            data.append({
                'name': name,
                'image': image_url,
                'types': ', '.join(types)
            })

    # Convert the list of records to a DataFrame
    pokemon_df = pd.DataFrame(data)

    # Save the raw (uncleaned) scrape to a CSV file
    pokemon_df.to_csv('pokemon_data.csv', index=False)

    # Display the first few rows of the DataFrame
    print(pokemon_df.head())
else:
    # On failure pokemon_df is NOT defined, so the cells below cannot run.
    print("Failed to scrape the page")


        name  \
0  Bulbasaur   
1    Unknown   
2    Ivysaur   
3    Unknown   
4   Venusaur   

                                                                  image  \
0  https://img.pokemondb.net/sprites/home/normal/2x/avif/bulbasaur.avif   
1                                                                  None   
2    https://img.pokemondb.net/sprites/home/normal/2x/avif/ivysaur.avif   
3                                                                  None   
4   https://img.pokemondb.net/sprites/home/normal/2x/avif/venusaur.avif   

           types  
0  Grass, Poison  
1                 
2  Grass, Poison  
3                 
4  Grass, Poison  


In [38]:
# pd.set_option('display.max_colwidth', None)  # or use a specific large number instead of None

# pokemon_df.head(20)


Unnamed: 0,name,image,types
0,Bulbasaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/bulbasaur.avif,"Grass, Poison"
1,Unknown,,
2,Ivysaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/ivysaur.avif,"Grass, Poison"
3,Unknown,,
4,Venusaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/venusaur.avif,"Grass, Poison"
5,Charmander,https://img.pokemondb.net/sprites/home/normal/2x/avif/charmander.avif,Fire
6,Unknown,,
7,Charmeleon,https://img.pokemondb.net/sprites/home/normal/2x/avif/charmeleon.avif,Fire
8,Unknown,,
9,Charizard,https://img.pokemondb.net/sprites/home/normal/2x/avif/charizard.avif,"Fire, Flying"


In [39]:
# Print the first few image URLs to check their full length.
# The loop variable is intentionally not named `url`, so it does not overwrite
# the page URL assigned in the scraping cell.
for sprite_url in pokemon_df['image'].head(10):
    print(sprite_url)


https://img.pokemondb.net/sprites/home/normal/2x/avif/bulbasaur.avif
None
https://img.pokemondb.net/sprites/home/normal/2x/avif/ivysaur.avif
None
https://img.pokemondb.net/sprites/home/normal/2x/avif/venusaur.avif
https://img.pokemondb.net/sprites/home/normal/2x/avif/charmander.avif
None
https://img.pokemondb.net/sprites/home/normal/2x/avif/charmeleon.avif
None
https://img.pokemondb.net/sprites/home/normal/2x/avif/charizard.avif


In [40]:
# https://img.pokemondb.net/artwork/ivysaur.jpg


Clean the scraped data

1 - duplicates

In [41]:
# Flag every row that repeats an earlier row (all columns compared),
# then pull those rows out for inspection.
is_repeat = pokemon_df.duplicated()
duplicate_rows = pokemon_df.loc[is_repeat]

print(duplicate_rows)


           name image          types
3       Unknown  None               
6       Unknown  None               
8       Unknown  None               
11      Unknown  None               
13      Unknown  None               
...         ...   ...            ...
1561  Armarouge  None  Fire, Psychic
1562    Unknown  None               
1563  Ceruledge  None    Fire, Ghost
1565    Unknown  None               
1568    Unknown  None               

[690 rows x 3 columns]


In [42]:
# Keep only the rows whose name is not the 'Unknown' placeholder
has_real_name = pokemon_df['name'].ne('Unknown')
filtered_df = pokemon_df.loc[has_real_name]

# Among the named rows, select those that repeat an earlier row
duplicate_rows_filtered = filtered_df.loc[filtered_df.duplicated()]

# Show the repeated rows
print(duplicate_rows_filtered)


           name  \
50       Raichu   
52       Raichu   
98    Vileplume   
100   Bellossom   
144   Poliwrath   
...         ...   
1506   Maushold   
1508   Maushold   
1510   Maushold   
1561  Armarouge   
1563  Ceruledge   

                                                                         image  \
50           https://img.pokemondb.net/sprites/home/normal/2x/avif/raichu.avif   
52    https://img.pokemondb.net/sprites/home/normal/2x/avif/raichu-alolan.avif   
98        https://img.pokemondb.net/sprites/home/normal/2x/avif/vileplume.avif   
100       https://img.pokemondb.net/sprites/home/normal/2x/avif/bellossom.avif   
144       https://img.pokemondb.net/sprites/home/normal/2x/avif/poliwrath.avif   
...                                                                        ...   
1506                                                                      None   
1508                                                                      None   
1510                             

In [43]:
# Keep only rows that have a usable image and a real name.
# All three conditions are evaluated on pokemon_df and applied in one step, so
# the boolean mask always shares the index of the frame it selects from
# (no mask from one frame applied to an already-filtered copy).
keep_mask = (
    pokemon_df['image'].notna()                             # drop rows with no image
    & (pokemon_df['image'] != 'no_image_available.jpg')     # drop the placeholder image
    & (pokemon_df['name'] != 'Unknown')                     # drop rows with no name link
)
filtered_df = pokemon_df.loc[keep_mask]

print(filtered_df.head())


         name  \
0   Bulbasaur   
2     Ivysaur   
4    Venusaur   
5  Charmander   
7  Charmeleon   

                                                                   image  \
0   https://img.pokemondb.net/sprites/home/normal/2x/avif/bulbasaur.avif   
2     https://img.pokemondb.net/sprites/home/normal/2x/avif/ivysaur.avif   
4    https://img.pokemondb.net/sprites/home/normal/2x/avif/venusaur.avif   
5  https://img.pokemondb.net/sprites/home/normal/2x/avif/charmander.avif   
7  https://img.pokemondb.net/sprites/home/normal/2x/avif/charmeleon.avif   

           types  
0  Grass, Poison  
2  Grass, Poison  
4  Grass, Poison  
5           Fire  
7           Fire  


In [44]:
# pokemon_df = pokemon_df.drop_duplicates()
# Preview the first rows of the cleaned frame (displayed as the cell's last expression).
# NOTE(review): the drop_duplicates() call above is commented out, so rows that
# repeat an earlier row appear to still be present in filtered_df — confirm this is intended.
filtered_df.head()


Unnamed: 0,name,image,types
0,Bulbasaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/bulbasaur.avif,"Grass, Poison"
2,Ivysaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/ivysaur.avif,"Grass, Poison"
4,Venusaur,https://img.pokemondb.net/sprites/home/normal/2x/avif/venusaur.avif,"Grass, Poison"
5,Charmander,https://img.pokemondb.net/sprites/home/normal/2x/avif/charmander.avif,Fire
7,Charmeleon,https://img.pokemondb.net/sprites/home/normal/2x/avif/charmeleon.avif,Fire


check missing values

standardize data formats

In [45]:
# # Ensure names are capitalized
# pokemon_df['name'] = pokemon_df['name'].str.capitalize()


remove unnecessary columns

In [46]:
# Example: Drop a column if needed
# pokemon_df = pokemon_df.drop(columns=['unnecessary_column'])


split columns if needed

In [47]:
# Example: If you have a column that needs to be split
# pokemon_df[['part1', 'part2']] = pokemon_df['combined_column'].str.split(' ', expand=True)


Save to csv

In [50]:
# Write the cleaned rows to disk; the DataFrame index is left out of the file.
output_csv = 'pokemon_data_from_pokemondb_net_evolution_page.csv'
filtered_df.to_csv(output_csv, index=False)


# OK now for the preprocessing bit