## FOOTBALL LOGO SCRAPER
##### To make visualizations more compelling with less effort, I had the idea of scraping team logos and extracting a color palette from each one to find its most dominant colors. <br> This gives me team-relevant coloring for various purposes. <br> I'll use Wikipedia to scrape the logos, then ColorThief and Pillow to get the color palette.

In [10]:
%%time
import sqlite3
import pandas as pd
import json

# load the notebook configuration (it holds the path to the SQLite database)
with open('../config.json', 'r') as config_file:
    config = json.load(config_file)

db_path = config.get('statsbomb_db_path') # get the relevant db path from config
if not db_path:
    # fail with a clear message instead of an opaque error from sqlite3.connect(None)
    raise KeyError("'statsbomb_db_path' is missing or empty in ../config.json")

# read scrape search query (done before connecting so a missing file opens no connection)
with open('data/query/scrape_search.sql') as inserts:
    scrape_search = inserts.read()

conn = sqlite3.connect(db_path)
try:
    df_scrape = pd.read_sql_query(scrape_search, conn)
finally:
    # close the connection even when the query raises
    conn.close()
df_scrape

CPU times: user 22.9 ms, sys: 3.01 ms, total: 25.9 ms
Wall time: 28 ms


Unnamed: 0,home_team_name,team_id,home_team_country_name
0,AC Milan,243,Italy
1,AFC Bournemouth,28,England
2,AS Monaco,129,Monaco
3,AS Roma,229,Italy
4,ATK Mohun Bagan,7282,India
...,...,...,...
196,West Bromwich Albion,27,England
197,West Ham United,40,England
198,Wolfsburg,179,Germany
199,Wolverhampton Wanderers,46,England


In [11]:
import requests
from bs4 import BeautifulSoup
import re
import time
from urllib.parse import unquote  # import the unquote function


def get_wikipedia_main_page_link(search_query, timeout=10):
    """
    this function takes a search query, performs a Google search,
    and returns the first result link that references an English Wikipedia article.

    links whose href mentions a File:, Category: or Portal: page are skipped.
    the returned value is the first URL embedded in the matching href, cut at the
    first '&' and percent-decoded. NOTE: when the matching href is a Google image
    result (`.../imgres?imgurl=...`) that merely references a Wikipedia article,
    the returned URL is that image-result URL, not the article itself.

    search_query: text sent to Google as the `q` parameter.
    timeout: seconds to wait for Google before requests raises a timeout error.
    returns: the extracted URL as a string, or None if no link qualifies.
    """
    # google search url
    google_search_url = 'https://www.google.com/search'
    params = {'q': search_query}

    # perform google search; the timeout keeps one stalled request from hanging the loop
    response = requests.get(google_search_url, params=params, timeout=timeout)

    # parse html content
    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors that carry no href attribute (link.get would give None)
    for link in soup.find_all('a', href=True):
        href = link['href']

        # check if the link references a main wikipedia page
        if 'en.wikipedia.org/wiki/' not in href:
            continue
        # exclude links to non-article pages (images, categories, files)
        if re.search(r'/wiki/(File|Category|Portal):', href):
            continue

        # extract the actual link from the Google redirect url
        start = href.find('http')  # find the start of url
        if start == -1:
            # no absolute url inside this href, nothing usable to return
            continue
        end = href.find('&', start)  # find the end of url
        # when there is no '&' the url runs to the end of the href
        wikipedia_link = href[start:] if end == -1 else href[start:end]

        # decode any percent-encoded characters
        return unquote(wikipedia_link)

    # return none if no main wikipedia page is found
    return None

print(len(df_scrape.home_team_name))

# Example usage
search_queries = list("Football club soccer " +df_scrape.home_team_name + " , " +df_scrape.home_team_country_name+ " logo")
zib = zip(df_scrape.home_team_name, df_scrape.home_team_country_name, df_scrape.team_id)
linklist = list()  # (team name, team id, link) for teams where a link was found
nonelist = list()  # same tuple shape for teams where the search returned nothing

# iterate over the search queries and get wikipedia main page links
# (team_id instead of id, so the builtin id() is not shadowed in the kernel)
for name, country, team_id in zib:
    search = "Football club soccer " + name + " , " + country + " logo"
    wiki_link = get_wikipedia_main_page_link(search)
    tup = (name, team_id, wiki_link)
    if wiki_link is not None:
        linklist.append(tup)
    else:
        nonelist.append(tup)
    print(f"Query: {search}, Wikipedia Main Page Link: {wiki_link}")
    time.sleep(.3)  # avoid timeouts & crashes

linklist[:10], len(nonelist), nonelist[:10]

201
Query: Football club soccer AC Milan , Italy logo, Wikipedia Main Page Link: https://en.wikipedia.org/wiki/AC_Milan
Query: Football club soccer AFC Bournemouth , England logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/e/e5/AFC_Bournemouth_%282013%29.svg/800px-AFC_Bournemouth_%282013%29.svg.png
Query: Football club soccer AS Monaco , Monaco logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/c/cf/LogoASMonacoFC2021.svg/1200px-LogoASMonacoFC2021.svg.png
Query: Football club soccer AS Roma , Italy logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/f/f7/AS_Roma_logo_%282017%29.svg/1200px-AS_Roma_logo_%282017%29.svg.png
Query: Football club soccer ATK Mohun Bagan , India logo, Wikipedia Main Page Link: https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/2/26/Offic

([('AC Milan', 243, 'https://en.wikipedia.org/wiki/AC_Milan'),
  ('AFC Bournemouth',
   28,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/e/e5/AFC_Bournemouth_%282013%29.svg/800px-AFC_Bournemouth_%282013%29.svg.png'),
  ('AS Monaco',
   129,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/c/cf/LogoASMonacoFC2021.svg/1200px-LogoASMonacoFC2021.svg.png'),
  ('AS Roma',
   229,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/f/f7/AS_Roma_logo_%282017%29.svg/1200px-AS_Roma_logo_%282017%29.svg.png'),
  ('ATK Mohun Bagan',
   7282,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/2/26/Official_ATK_Logo.png'),
  ('Ajax', 806, 'https://en.wikipedia.org/wiki/AFC_Ajax'),
  ('Albacete',
   608,
   'https://www.google.com/imgres?imgurl=https://upload.wikimedia.org/wikipedia/en/thumb/3/37/Albacete_balompie.svg/1200px-Albacete_balompie.svg.png'

In [28]:
# new
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import cairosvg
from PIL import Image
import io
import json

# define columns and read
# the color lookup csv is read without a header row, so the column names are supplied here
columns = [
    "color",
    "color_name",
    "hex",
    "R",
    "G",
    "B",
]
colors_df = pd.read_csv(
    'data/colors.csv',
    header=None,
    names=columns,
)

# function to convert .svg to .png
def convert_svg_to_png(svg_path, png_path):
    """Render the SVG found at `svg_path` to a PNG written to `png_path` using cairosvg."""
    render_options = {'url': svg_path, 'write_to': png_path}
    cairosvg.svg2png(**render_options)

def find_closest_color(rgb_color, colors_df):
    """
    find the row of `colors_df` whose (R, G, B) is nearest to `rgb_color`.

    rgb_color: sequence/array whose first three items are the R, G, B values.
    colors_df: DataFrame with at least the columns "R", "G", "B" and "color_name".
    returns: a pd.Series holding the original color (Original_R/G/B) and the
             closest palette entry (Matched_Color, Matched_R/G/B).
    """
    # calculate euclidean distance
    distances = np.sqrt(((colors_df[["R", "G", "B"]] - rgb_color) ** 2).sum(axis=1))
    # argmin gives the POSITION of the smallest distance, which is what .iloc needs;
    # idxmin would give an index label and pick the wrong row on a non-default index
    closest_position = distances.argmin()
    closest_color = colors_df.iloc[closest_position]
    # return original color and closest match
    return pd.Series({"Original_R": rgb_color[0],
                      "Original_G": rgb_color[1],
                      "Original_B": rgb_color[2],
                      "Matched_Color": closest_color['color_name'],
                      "Matched_R": closest_color['R'],
                      "Matched_G": closest_color['G'],
                      "Matched_B": closest_color['B']})

def _load_visible_rgb_pixels(img_path):
    """Return an (N, 3) array, in R, G, B order, of the pixels that count as part of the image."""
    with Image.open(img_path) as img:
        if img.mode in ['RGBA', 'LA']:
            # Pillow delivers channels as R, G, B, A; keep pixels that are not fully transparent
            array = np.array(img.convert('RGBA'))
            return array[:, :, :3][array[:, :, 3] != 0]

    # no alpha channel: read with OpenCV, which decodes to B, G, R order
    array = cv2.imread(img_path)
    if array is None:
        raise ValueError(f"could not read image: {img_path}")
    rgb = array[:, :, :3][:, :, ::-1]  # BGR -> RGB so both branches agree
    # mask for non-black pixels: a pixel is kept when ANY channel is non-zero
    # (np.all would also discard saturated colors such as pure red)
    return rgb[np.any(rgb != 0, axis=-1)]


def get_color_palette(img_path, min_share=0.05):
    """
    extract the dominant colors of an image, snapped to the entries of `colors_df`.

    an .svg input is first rendered to a .png next to it. every visible pixel color
    is mapped to its closest entry in `colors_df`, pixel counts are summed per
    matched entry, and entries covering more than `min_share` of the visible
    pixels are returned, most frequent first.

    img_path: path to the image file (.svg is converted via convert_svg_to_png).
    min_share: minimum fraction of visible pixels a matched color needs (default 5%).
    returns: list of (R, G, B) tuples of plain ints; empty if no pixel is visible.
    raises: ValueError if a non-alpha image cannot be decoded by OpenCV.
    """
    if img_path.endswith('.svg'):
        # swap only the trailing extension, not every '.svg' occurring in the path
        png_path = img_path[:-len('.svg')] + '.png'
        convert_svg_to_png(img_path, png_path)
        img_path = png_path

    visible_rgb = _load_visible_rgb_pixels(img_path)
    if visible_rgb.size == 0:
        # fully transparent / fully black image: nothing to build a palette from
        return []

    # distinct colors and how many pixels carry each of them
    unique_rgb, counts = np.unique(visible_rgb, axis=0, return_counts=True)

    # find closest matches (rows come back in the same order as unique_rgb)
    matches_df = pd.DataFrame([find_closest_color(color, colors_df) for color in unique_rgb])
    matches_df['Original_Count'] = counts

    # group by matched RGB colors, sum the original counts, most frequent first
    sorted_df = (
        matches_df
        .groupby(['Matched_R', 'Matched_G', 'Matched_B'])
        .agg({'Original_Count': 'sum'})
        .sort_values(by='Original_Count', ascending=False)
        .reset_index()
    )

    # keep only matched colors that cover more than min_share of the counted pixels
    threshold = min_share * sorted_df['Original_Count'].sum()
    filtered_sorted_df = sorted_df[sorted_df['Original_Count'] > threshold]

    # extract RGB values as tuples of plain python ints
    rgb_values = [
        tuple(int(value) for value in row)
        for row in filtered_sorted_df[["Matched_R", "Matched_G", "Matched_B"]].values
    ]

    return rgb_values



In [29]:
import os
import json

# function to extract image urls from wiki pages
def get_wikipedia_image_url(page_url, timeout=10):
    """
    return the url of the first image inside the page's infobox table.

    page_url: url of the page to fetch.
    timeout: seconds to wait for the page before requests raises a timeout error.
    returns: absolute image url, or None when the page has no infobox table,
             the infobox has no <img>, or the <img> has no src.
    """
    # local import: the notebook only imports `unquote` from urllib.parse
    from urllib.parse import urljoin

    response = requests.get(page_url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'html.parser')

    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return None

    image = infobox.find('img')
    if image is None or not image.get('src'):
        return None

    # urljoin resolves protocol-relative ('//host/...'), root-relative and absolute
    # src values alike; blindly prefixing 'https:' is only right for the first kind
    return urljoin(page_url, image['src'])

# function to download image from a url
def download_image(image_url, file_name, timeout=30):
    """
    download `image_url` and write the response body to `file_name`.

    image_url: url to fetch (sent with a browser-like User-Agent header).
    file_name: destination path, opened in binary write mode.
    timeout: seconds to wait for the server before giving up.
    returns: True if the server answered 200 and the file was written,
             False on any other status code or on a network error/timeout.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        response = requests.get(image_url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # network failure or timeout: report it as a failed download
        return False
    if response.status_code != 200:
        return False
    with open(file_name, 'wb') as file:
        file.write(response.content)
    return True

# function to convert rgb to hex
def rgb_to_hex(rgb):
    """Format the first three items of `rgb` as a '#rrggbb' lowercase hex string."""
    red, green, blue = rgb[0], rgb[1], rgb[2]
    hex_pairs = [format(channel, "02x") for channel in (red, green, blue)]
    return "#" + "".join(hex_pairs)

# directory to save images
image_dir = 'data/logo'
os.makedirs(image_dir, exist_ok=True)

# one record per team, keyed by team id
structured_data = {}

# team_id instead of id, so the builtin id() is not shadowed in the kernel
for team, team_id, link in linklist:
    data = {'team_name': team, 'team_id': team_id , 'original_link': link, 'image_url': None, 
            'download_status': False, 'file_path': None, 'palette_hex': None}
    
    if 'wikipedia.org/wiki' in link:
        # article link: take the image from the page
        image_url = get_wikipedia_image_url(link)
    elif 'imgurl=' in link:
        # google image-result link: the image url is carried in the imgurl parameter
        image_url = link.split('imgurl=')[1].split('&')[0]
        image_url = requests.utils.unquote(image_url)
    else:
        # unknown link shape: record the team without an image instead of crashing
        image_url = None

    data['image_url'] = image_url

    if image_url:
        file_extension = image_url.split('.')[-1].split('?')[0]
        # spaces become underscores; '/' would otherwise be read as a sub-directory
        sanitized_team_name = team.replace(' ', '_').replace('/', '_')
        print(sanitized_team_name)
        file_name = os.path.join(image_dir, sanitized_team_name + '.' + file_extension)
        data['download_status'] = download_image(image_url, file_name)
        if data['download_status']:
            # only analyse files that were actually written by this run
            data['file_path'] = file_name
            rgb_values = get_color_palette(file_name)
            data['palette_hex'] = [rgb_to_hex(color) for color in rgb_values]
    structured_data[team_id] = data

# file to save the structured data
output_file = 'team_logo_data.json'

# save structured data to a json file
with open(output_file, 'w') as file:
    json.dump(structured_data, file, indent=4)

print(f"Data saved to {output_file}")

AC_Milan
AFC_Bournemouth
AS_Monaco
AS_Roma
ATK_Mohun_Bagan
Ajax
Albacete
Almería
Angers
Argentina
Argentina_U20
Arsenal
Aston_Villa
Atalanta
Athletic_Club
Augsburg
Australia
Austria
Barcelona
Bastia
Bayer_Leverkusen
Bayern_Munich
Belgium
Bengaluru
Birmingham_City
Blackburn_Rovers
Boca_Juniors
Bologna
Bolton_Wanderers
Bordeaux
Borussia_Dortmund
Borussia_Mönchengladbach
Brazil
CD_Numancia_de_Soria
Caen
Cameroon
Canada
Carpi
Celta_Vigo
Charlton_Athletic
Chelsea
Chennaiyin
Chievo
Colombia
Costa_Rica
Croatia
Crystal_Palace
Czech_Republic
Czechoslovakia
Cádiz
Córdoba_CF
Darmstadt_98
Denmark
Deportivo_Alavés
East_Bengal
Ecuador
Egypt
Eibar
Eintracht_Frankfurt
Elche
Empoli
England
Espanyol
Everton
FC_Köln
FC_Porto
FSV_Mainz_05
Finland
Fiorentina
France
Frosinone
Fulham
Gazélec_Ajaccio
Genoa
German_DR
Germany
Getafe
Ghana
Gimnàstic_Tarragona
Girona
Goa
Granada
Guingamp
Hamburger_SV
Hannover_96
Hellas_Verona
Hertha_Berlin
Hoffenheim
Huesca
Hungary
Hyderabad
Hércules
Iceland
Ingolstadt
Inter_Mila