## FOOTBALL LOGO SCRAPER
##### For more compelling and effortless visualizations, I had the idea of scraping team logos and extracting a color palette from each one to find its most dominant colors. <br> This gives me team-relevant coloring for various purposes. <br> I'll use Wikipedia to scrape the logos, then ColorThief and Pillow to get the color palettes.

In [37]:
%%time
import sqlite3
import pandas as pd
import json

# load the project configuration
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

db_path = config.get('statsbomb_db_path') # get the relevant db path from config

# read scrape search query (done before connecting so a missing file
# cannot leave an open connection behind)
with open('data/query/scrape_search.sql') as inserts:
    scrape_search = inserts.read()

# run the query and always release the connection, even if the query fails
conn = sqlite3.connect(db_path)
try:
    df_scrape = pd.read_sql_query(scrape_search, conn)
finally:
    conn.close()
df_scrape

CPU times: user 21.5 ms, sys: 1.72 ms, total: 23.3 ms
Wall time: 23.2 ms


Unnamed: 0,home_team_name,team_id,home_team_country_name
0,AC Milan,243,Italy
1,AFC Bournemouth,28,England
2,AS Monaco,129,Monaco
3,AS Roma,229,Italy
4,ATK Mohun Bagan,7282,India
...,...,...,...
196,West Bromwich Albion,27,England
197,West Ham United,40,England
198,Wolfsburg,179,Germany
199,Wolverhampton Wanderers,46,England


In [38]:
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import unquote  # import the unquote function


def get_wikipedia_main_page_link(search_query):
    """
    Run a Google search for ``search_query`` and return the first result link
    that references an English Wikipedia article.

    Every ``<a>`` tag of the result page is inspected. A link qualifies when
    its href contains ``en.wikipedia.org/wiki/`` and does not point to a
    ``File:``, ``Category:`` or ``Portal:`` page. The returned value is the
    part of the href from the first ``http`` up to the next ``&`` (or the end
    of the href), with percent-encoding decoded.

    Note: the match is a substring test on the raw href, so a Google
    redirect or image-result URL that merely embeds a Wikipedia URL is
    returned as well.

    Parameters
    ----------
    search_query : str
        Text sent to Google as the ``q`` parameter.

    Returns
    -------
    str or None
        The extracted link, or None when no qualifying link is found.
    """
    # google search url
    google_search_url = 'https://www.google.com/search'
    params = {'q': search_query}

    # perform google search
    response = requests.get(google_search_url, params=params)

    # parse html content
    soup = BeautifulSoup(response.text, 'html.parser')

    # find all links in the search result
    for link in soup.find_all('a'):
        href = link.get('href')

        # anchors without an href attribute yield None; skip them
        if not href or 'en.wikipedia.org/wiki/' not in href:
            continue

        # exclude links to non-article pages (images, categories, files)
        if re.search(r'/wiki/(File|Category|Portal):', href):
            continue

        # extract the actual link from the Google redirect url
        start = href.find('http')  # find the start of url
        if start == -1:
            # no absolute url embedded in this href, nothing usable to return
            continue
        end = href.find('&', start)  # find the end of url
        if end == -1:
            # no trailing query parameters: the url runs to the end of href
            end = len(href)

        # decode any percent-encoded characters
        return unquote(href[start:end])

    # return none if no main wikipedia page is found
    return None

print(len(df_scrape.home_team_name))

# one Google query per team, e.g. "Football club soccer AC Milan , Italy logo"
search_queries = list("Football club soccer " +df_scrape.home_team_name + " , " +df_scrape.home_team_country_name+ " logo")
linklist = []  # (team name, team id, link) for teams where a link was found
nonelist = []  # same tuples, with link None, for teams without a hit

# iterate over the search queries and get wikipedia main page links
# (`team_id` rather than `id`, so the builtin is not shadowed notebook-wide)
for search, name, team_id in zip(search_queries, df_scrape.home_team_name, df_scrape.team_id):
    wiki_link = get_wikipedia_main_page_link(search)
    tup = (name, team_id, wiki_link)
    if wiki_link is not None:
        linklist.append(tup)
    else:
        nonelist.append(tup)
    print(f"Query: {search}, Wikipedia Main Page Link: {wiki_link}")
    time.sleep(.3)  # pause between requests to avoid flooding Google

linklist[:10], len(nonelist), nonelist[:10]

201
Query: Football club soccer AC Milan , Italy logo, Wikipedia Main Page Link: https://en.wikipedia.org/wiki/AC_Milan
Query: Football club soccer AFC Bournemouth , England logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/e/e5/AFC_Bournemouth_%282013%29.svg/800px-AFC_Bournemouth_%282013%29.svg.png
Query: Football club soccer AS Monaco , Monaco logo, Wikipedia Main Page Link: https://en.wikipedia.org/wiki/AS_Monaco_FC
Query: Football club soccer AS Roma , Italy logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/f/f7/AS_Roma_logo_%282017%29.svg/1200px-AS_Roma_logo_%282017%29.svg.png
Query: Football club soccer ATK Mohun Bagan , India logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/3/32/Mohun_Bagan_Super_Giant.svg/640px-Mohun_Bagan_Super_Giant.svg.png
Query: Football club soccer Ajax , Ne

([('AC Milan', 243, 'https://en.wikipedia.org/wiki/AC_Milan'),
  ('AFC Bournemouth',
   28,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/e/e5/AFC_Bournemouth_%282013%29.svg/800px-AFC_Bournemouth_%282013%29.svg.png'),
  ('AS Monaco', 129, 'https://en.wikipedia.org/wiki/AS_Monaco_FC'),
  ('AS Roma',
   229,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/f/f7/AS_Roma_logo_%282017%29.svg/1200px-AS_Roma_logo_%282017%29.svg.png'),
  ('ATK Mohun Bagan',
   7282,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/3/32/Mohun_Bagan_Super_Giant.svg/640px-Mohun_Bagan_Super_Giant.svg.png'),
  ('Ajax', 806, 'https://en.wikipedia.org/wiki/AFC_Ajax'),
  ('Albacete',
   608,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/3/37/Albacete_balompie.svg/1200px-Albacete_balompie.svg.png'),
  ('Almería',
   403,
   'https://www.google.com/imgres?i

In [39]:
import os

# function to extract image URLs from Wikipedia pages
def get_wikipedia_image_url(page_url):
    """
    Return the URL of the first image inside the infobox of a Wikipedia page.

    The page at ``page_url`` is fetched and the first ``<table>`` with class
    ``infobox`` is searched for its first ``<img>`` tag.

    Parameters
    ----------
    page_url : str
        URL of the page to fetch.

    Returns
    -------
    str or None
        Absolute image URL, or None when the page has no infobox, the infobox
        has no image, or the image tag has no ``src``.
    """
    from urllib.parse import urljoin

    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return None

    image = infobox.find('img')
    if image is None:
        return None

    src = image.get('src')
    if not src:
        return None

    # protocol-relative source ("//host/path"): pin the scheme to https
    if src.startswith('//'):
        return 'https:' + src

    # absolute sources come back unchanged; site-relative ones are resolved
    # against the page url instead of being blindly prefixed with "https:"
    return urljoin(page_url, src)

# function to download image from a URL
def download_image(image_url, file_name):
    """
    Fetch ``image_url`` and store the raw response body at ``file_name``.

    The request is sent with a browser-like User-Agent header.

    Returns True when the server answered with HTTP 200 and the file was
    written; returns False otherwise, in which case nothing is written.
    """
    browser_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    reply = requests.get(image_url, headers=browser_headers)

    # anything other than a plain 200 counts as a failed download
    if reply.status_code != 200:
        return False

    with open(file_name, 'wb') as target:
        target.write(reply.content)
    return True


# directory to save images
image_dir = 'data/logo'
os.makedirs(image_dir, exist_ok=True)

# Structured data: one record per team, keyed by team id
structured_data = {}

# (`team_id` rather than `id`, so the builtin is not shadowed notebook-wide)
for team, team_id, link in linklist:
    data = {'team_name': team, 'team_id': team_id , 'original_link': link, 'image_url': None, 'download_status': False, 'file_path': None}
    if 'wikipedia.org/wiki' in link:
        # article page: take the first infobox image
        image_url = get_wikipedia_image_url(link)
    elif 'imgurl=' in link:
        # Google image-result link: the image address is the imgurl parameter
        image_url = link.split('imgurl=')[1].split('&')[0]
        image_url = requests.utils.unquote(image_url)
    else:
        # unrecognised link shape: record the team without an image
        image_url = None

    data['image_url'] = image_url
    if image_url:
        file_extension = image_url.split('.')[-1].split('?')[0]
        # spaces become underscores; a path separator in a team name would
        # otherwise point into a sub-directory that does not exist
        sanitized_team_name = team.replace(' ', '_').replace('/', '_')
        file_name = os.path.join(image_dir, sanitized_team_name + '.' + file_extension)
        data['download_status'] = download_image(image_url, file_name)
        data['file_path'] = file_name if data['download_status'] else None
    structured_data[team_id] = data

import json

# Destination for the per-team records collected above
output_file = 'team_logo_data.json'

# Dump the records as 4-space indented JSON
with open(output_file, mode='w') as json_sink:
    json.dump(structured_data, json_sink, indent=4)

print(f"Data saved to {output_file}")

Data saved to team_logo_data.json


In [52]:
import cairosvg
from PIL import Image
from colorthief import ColorThief
import io
import json
import os

# function to convert .svg to .png
def convert_svg_to_png(svg_path, png_path):
    """Render the SVG at ``svg_path`` to a PNG written to ``png_path`` via cairosvg."""
    render_options = {'url': svg_path, 'write_to': png_path}
    cairosvg.svg2png(**render_options)

# function to get the main colors from the logo
def get_main_colors(image_path, num_colors):
    """
    Extract a color palette from a logo file using ColorThief.

    SVG files are first rendered to a PNG stored next to the original (same
    path with the trailing ``.svg`` swapped for ``.png``); the palette is then
    computed from that PNG.

    Parameters
    ----------
    image_path : str
        Path to the logo image.
    num_colors : int
        Passed to ColorThief as ``color_count``.

    Returns
    -------
    The palette returned by ``ColorThief.get_palette`` (called with
    ``quality=1``).
    """
    if image_path and image_path.lower().endswith('.svg'):
        # swap only the trailing extension; str.replace would also rewrite
        # any ".svg" occurring earlier in the path (e.g. in a directory name)
        png_path = image_path[:-len('.svg')] + '.png'
        convert_svg_to_png(image_path, png_path)
        image_path = png_path

    buffer = io.BytesIO()
    # the context manager closes the underlying file handle once the pixel
    # data has been re-encoded into the in-memory buffer
    with Image.open(image_path) as img:
        # NOTE(review): flattening RGBA to RGB drops the alpha channel before
        # ColorThief sees the image, so transparent areas are treated as
        # ordinary pixels - confirm this is intended.
        rgb_img = img.convert('RGB') if img.mode == 'RGBA' else img
        rgb_img.save(buffer, format='PNG')
    buffer.seek(0)

    color_thief = ColorThief(buffer)
    palette = color_thief.get_palette(color_count=num_colors, quality=1)

    return palette

# function to convert rgb to hex
def rgb_to_hex(rgb):
    """Format the first three components of ``rgb`` as a lowercase ``#rrggbb`` string."""
    red, green, blue = rgb[0], rgb[1], rgb[2]
    return "#{:02x}{:02x}{:02x}".format(red, green, blue)

# load the per-team records from the json file
with open('team_logo_data.json', 'r') as source:
    team_logo_data = json.load(source)

# attach a six-color hex palette to every team that has a stored logo path
for team_info in team_logo_data.values():
    logo_path = team_info.get('file_path')
    if not logo_path:
        continue
    team_info['palette'] = [rgb_to_hex(color) for color in get_main_colors(logo_path, 6)]

# write the enriched records back, overwriting the same json file
with open('team_logo_data.json', 'w') as sink:
    json.dump(team_logo_data, sink, indent=4)