## FOOTBALL LOGO SCRAPER
##### For more compelling and effortless visualization applications, I had the idea of scraping team logos and extracting a color palette from each one to find its most dominant colors. <br> This will give me team-relevant coloring for various purposes. <br> I'll use Wikipedia to scrape the logos, then ColorThief and Pillow to get the color palette.

In [1]:
%%time
import sqlite3
import pandas as pd
# Build the list of teams to scrape logos for: one row per
# (competition, team, country) that appears as either the home or the away
# side of a match in a men's competition.
conn = sqlite3.connect("/Users/muratseyhan/football-analytics/data/statsbomb.db")
try:
    # NOTE(review): q1 is read here but not used anywhere in the visible
    # cells -- kept in case a later cell relies on it; confirm and drop if not.
    with open('/Users/muratseyhan/football-analytics/data/full_season_matches.sql') as inserts:
        q1 = inserts.read()

    # The two SELECTs are UNIONed, so the result keeps the column names of the
    # first one: "home_team_name" / "home_team_country_name" hold away teams
    # as well.
    df_scrape = pd.read_sql_query('''
SELECT   m.competition_name, home_team_name, home_team_country_name
FROM MATCH m
LEFT JOIN competition c
ON c.competition_id = m.competition_id
WHERE c.competition_gender = 'male'
GROUP BY m.competition_name, home_team_country_name, home_team_name

UNION

SELECT   m.competition_name, away_team_name, away_team_country_name
FROM MATCH m
LEFT JOIN competition c
ON c.competition_id = m.competition_id
WHERE c.competition_gender = 'male'
GROUP BY m.competition_name, away_team_country_name, away_team_name
ORDER BY 1,2,3
''', conn)
finally:
    # Always release the database handle, even if reading the SQL file or
    # running the query raised.
    conn.close()
df_scrape

CPU times: user 1.79 s, sys: 69.6 ms, total: 1.86 s
Wall time: 302 ms


Unnamed: 0,competition_name,home_team_name,home_team_country_name
0,1. Bundesliga,Augsburg,Germany
1,1. Bundesliga,Bayer Leverkusen,Germany
2,1. Bundesliga,Bayern Munich,Germany
3,1. Bundesliga,Borussia Dortmund,Germany
4,1. Bundesliga,Borussia Mönchengladbach,Germany
...,...,...,...
232,UEFA Euro,Wales,Wales
233,UEFA Europa League,Bayern Munich,Germany
234,UEFA Europa League,Juventus,Italy
235,UEFA Europa League,Napoli,Italy


In [None]:
import requests
from bs4 import BeautifulSoup
import re
import time

def get_wikipedia_main_page_link(search_query):
    """
    Return the first English-Wikipedia article link found in a Google search.

    The query is sent to Google's search endpoint and every anchor of the
    returned HTML is inspected in document order. Links into the File,
    Category and Portal namespaces are skipped.

    Args:
        search_query: Text to search for.

    Returns:
        The Wikipedia URL as a string (taken from the anchor's href, starting
        at the first "http" and cut at the following "&" if there is one), or
        None when no matching link is present.

    Raises:
        requests.RequestException: If the search request fails or does not
            answer within the timeout.
    """
    # Google Search URL
    google_search_url = 'https://www.google.com/search'
    params = {'q': search_query}

    # A timeout keeps a stalled connection from blocking the whole scrape.
    response = requests.get(google_search_url, params=params, timeout=10)

    # Parse the HTML content
    soup = BeautifulSoup(response.text, 'html.parser')

    # href=True skips anchors without an href attribute; for those
    # link.get('href') is None and a substring test on it raises TypeError.
    for link in soup.find_all('a', href=True):
        href = link['href']

        if 'en.wikipedia.org/wiki/' not in href:
            continue
        # Exclude links to non-article pages (images, categories, portals)
        if re.search(r'/wiki/(File|Category|Portal):', href):
            continue

        # Google wraps results in a redirect ("/url?q=<target>&sa=..."):
        # the real URL starts at "http" and ends before the next "&".
        start = href.find('http')
        if start == -1:
            # No absolute URL inside this href; nothing usable to return.
            continue
        end = href.find('&', start)
        # find() returns -1 when there is no "&"; slicing with -1 would
        # silently drop the last character of the URL.
        return href[start:end] if end != -1 else href[start:]

    # Return None if no main Wikipedia page link is found
    return None

# Number of team rows that will be searched
print(len(df_scrape.home_team_name))

# One search string per team row (kept for inspection)
search_queries = list("Football club soccer " +df_scrape.home_team_name + " , " +df_scrape.home_team_country_name+ " logo")
zib = zip(df_scrape.home_team_name, df_scrape.home_team_country_name)

# Teams with a Wikipedia link go to linklist, the rest to nonelist;
# both hold (team_name, link) tuples.
linklist = []
nonelist = []

# Look up the Wikipedia page for every (team, country) pair
for team_name, country_name in zib:
    search = "Football club soccer " + team_name + " , " + country_name + " logo"
    wiki_link = get_wikipedia_main_page_link(search)
    entry = (team_name, wiki_link)
    if wiki_link is not None:
        linklist.append(entry)
    else:
        nonelist.append(entry)
    print(f"Query: {search}, Wikipedia Main Page Link: {wiki_link}")
    # Short pause between requests to avoid overwhelming Google
    time.sleep(.3)

linklist[:10], len(nonelist), nonelist[:10]

In [None]:
import os

# (team_name, link) tuples collected by the search loop above; only teams for
# which a link was found are in this list.
teams_links = linklist

# Function to extract image URLs from Wikipedia pages
def get_wikipedia_image_url(page_url):
    """
    Return the URL of the first image inside a page's infobox table.

    Args:
        page_url: URL of the page to fetch.

    Returns:
        An absolute image URL, or None when the page has no
        ``<table class="infobox">``, the infobox has no ``<img>``, or the
        ``<img>`` has no ``src``.

    Raises:
        requests.RequestException: If the page request fails or does not
            answer within the timeout.
    """
    from urllib.parse import urljoin

    response = requests.get(page_url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    infobox = soup.find('table', {'class': 'infobox'})
    if infobox is None:
        return None

    image = infobox.find('img')
    if image is None:
        return None

    src = image.get('src')
    if not src:
        return None

    # Protocol-relative sources ("//host/path") get an explicit https scheme.
    if src.startswith('//'):
        return 'https:' + src
    # Anything else (already absolute, or relative to the page) is resolved
    # against the page URL instead of being blindly prefixed with "https:".
    return urljoin(page_url, src)

# Function to download image from a URL
def download_image(image_url, file_name):
    """
    Download ``image_url`` and write the response body to ``file_name``.

    A browser-like User-Agent header is sent with the request.

    Args:
        image_url: URL of the image to fetch.
        file_name: Path of the file to write (opened in binary mode).

    Returns:
        True if the server answered with status 200 and the file was written,
        False on any other status code or if the request itself failed
        (connection error, timeout, invalid URL, ...).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
    try:
        # The timeout keeps one unresponsive host from stalling the whole run.
        response = requests.get(image_url, headers=headers, timeout=30)
    except requests.RequestException:
        # Report network failures through the boolean status like any other
        # unsuccessful download instead of aborting the caller's loop.
        return False

    if response.status_code != 200:
        return False

    with open(file_name, 'wb') as file:
        file.write(response.content)
    return True


# Directory to save images
image_dir = 'data/logos'
os.makedirs(image_dir, exist_ok=True)

# One record per team describing where its logo came from and whether the
# download succeeded.
structured_data = []

for team, link in teams_links:
    data = {'team_name': team, 'original_link': link, 'image_url': None, 'download_status': False, 'file_path': None}
    if 'wikipedia.org/wiki' in link:
        # Wikipedia article: take the first image of its infobox.
        image_url = get_wikipedia_image_url(link)
    elif 'imgurl=' in link:
        # Image-search style link: the picture URL is carried, URL-encoded,
        # in the "imgurl" query parameter.
        image_url = link.split('imgurl=')[1].split('&')[0]
        image_url = requests.utils.unquote(image_url)
    else:
        # Neither form is recognised; record the team without an image
        # instead of failing with IndexError on the split above.
        image_url = None

    data['image_url'] = image_url
    if image_url:
        # Extension = text after the last "." with any query string removed.
        file_extension = image_url.split('.')[-1].split('?')[0]
        # Spaces become underscores; "/" is replaced as well so a team name
        # can never point into a (non-existent) sub-directory.
        sanitized_team_name = team.replace(' ', '_').replace('/', '_')
        file_name = os.path.join(image_dir, sanitized_team_name + '.' + file_extension)
        data['download_status'] = download_image(image_url, file_name)
        data['file_path'] = file_name if data['download_status'] else None
    structured_data.append(data)

import json

# File to save the structured data
output_file = 'team_logos_data.json'

# Save structured data to a JSON file
with open(output_file, 'w') as file:
    json.dump(structured_data, file, indent=4)

# Report how many team records were written. The previous message formatted
# the tuple (len(output_file),), i.e. the length of the file *name*.
print(f"{len(structured_data)} records saved to {output_file}")