### This code does the following:

#### 1) Access SportsReference website to pull college team names
#### 2) Convert these team names to usable addresses to access the logos on the local drive (.png files)
#### 3) Open all logo files, resize the images to speed up processing, and use computer vision to determine the color of each pixel (ignoring the full spectrum of "white" colors, which would otherwise be the dominant color in all images)
#### 4) Determine what the dominant color in each image is for future use in color coding

In [7]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import numpy as np
import re
import os
import cv2
import csv

#establish variables for list/DB objects
name = [] #placeholder; rebound inside the parsing loop below
School_list = [] #school names exactly as they appear on the Sports Reference page
Add_list = [] #list of web address versions of school names
Src_DB = [] #placeholder; later replaced by a dataframe of school names and their web address names
Rmv_Char = ['.', '(', ')', '&'] #characters to remove from school names
Clr_List = [] #list of colors

#establish variable for HTML address
sref = 'https://www.sports-reference.com/cbb/seasons/2017-school-stats.html'

#Parse School Names from Basic Season Stats page
r = requests.get(sref) #access html address
r.raise_for_status() #stop on an HTTP error instead of parsing an error page
soup = BeautifulSoup(r.content, "lxml") #access content of html address
t = soup.find("table", id = "basic_school_stats") #find appropriate table on page
if t is None:
    #soup.find returns None when nothing matches; fail with a readable message
    raise ValueError('table "basic_school_stats" not found at ' + sref)

for i in t.find_all("tr"): #find all table rows
    for a in i.find_all("a"): #every link in the row is treated as a school name
        school = a.get_text()
        if not isinstance(school, str):
            #Python 2: get_text() returns unicode; keep the UTF-8 byte-string form used so far
            school = school.encode('utf-8')
        School_list.append(school)
        #build the address name: drop punctuation, hyphenate spaces, lower-case
        name = school.replace("'", "")
        for ch in Rmv_Char:
            name = name.replace(ch, "")
        name = name.replace(" ", "-")
        Add_list.append(name.lower())

#exception handling to re-write school names that do not match off with the address for the .png file
#maps the address name derived from the school name -> the name the logo file actually uses
_ADDR_OVERRIDES = {
    'siu-edwardsville': 'southern-illinois-edwardsville',
    'texas-rio-grande-valley': 'texas-pan-american',
    'uc-davis': 'california-davis',
    'uc-irvine': 'california-irvine',
    'uc-riverside': 'california-riverside',
    'uc-santa-barbara': 'california-santa-barbara',
    'university-of-california': 'california',
    'vmi': 'virginia-military-institute',
    'william--mary': 'william-mary',
}
#rewrite in place and cover EVERY entry; the previous range(0, len(Add_list)-1) never
#looked at the last element of the list
Add_list[:] = [_ADDR_OVERRIDES.get(slug, slug) for slug in Add_list]
    
#one row per school: the name as scraped and the matching logo address name
Src_DB = pd.DataFrame({'SCHOOL_(SREF)': School_list, 'Src': Add_list})

In [8]:
#full local path of each logo = logo folder + address name + file extension
q = 'C:\\Users\\E&M\\Pictures\\Team_Logos\\' #address for team logos on local drive
f = '.png'
addr_lst = [q + slug + f for slug in Add_list] #second list of address names (local)

#keep the path next to each school so it can be used as a join key later
Src_DB['Address'] = addr_lst

In [9]:
#reduce size of img to speed up processing; pull BGR value of each non-white pixel
clr_list = [] #str() of the BGR triple of every kept pixel
a_list = [] #logo path of every kept pixel (parallel to clr_list)
for a in addr_lst:
    img = cv2.imread(a, cv2.IMREAD_COLOR)
    if img is None:
        #cv2.imread returns None (no exception) when the file is missing or unreadable
        raise IOError('could not read logo image: ' + a)
    #cv2.resize actually rescales the picture. The former img.resize(75,75,3) is numpy's
    #in-place ndarray.resize, which only truncates (or zero-fills) the raw pixel buffer.
    #Nearest-neighbour keeps original pixel values instead of blending new colors.
    img = cv2.resize(img, (75, 75), interpolation=cv2.INTER_NEAREST)
    #ignore the full "white" spectrum: pixels whose B, G and R are all >= 170
    kept = img[~(img >= 170).all(axis=2)]
    a_list.extend([a] * len(kept))
    clr_list.extend(str(pxl) for pxl in kept)

In [10]:
#Seattle and Texas Pan American aren't pulling into the DF for some reason; doing these two separately until determining issue
for logo in ('seattle', 'texas-pan-american'):
    #backslashes are doubled on purpose: un-escaped, the "\t" in "\texas-pan-american.png"
    #is a TAB character (so the stored address never matched the real path) and "\U" is a
    #unicode escape on Python 3
    path = 'C:\\Users\\E&M\\Pictures\\Team_Logos\\' + logo + '.png'
    img = cv2.imread(path, cv2.IMREAD_COLOR)
    if img is None:
        #cv2.imread returns None (no exception) when the file is missing or unreadable
        raise IOError('could not read logo image: ' + path)
    #scan the full-size image; keep every pixel that is not in the "white" spectrum
    #(B, G and R all >= 170). Masking the array avoids the old img[x,y] lookup, which
    #swapped row/column and mis-indexed non-square images.
    kept = img[~(img >= 170).all(axis=2)]
    a_list.extend([path] * len(kept))
    clr_list.extend(str(pxl) for pxl in kept)

In [11]:
#dataframe to store list of colors and their corresponding team
#(one row per collected pixel: the logo path it came from and its BGR value as text)
clr_DF = pd.DataFrame({'Address': a_list, 'Color_(BGR)': clr_list})

In [12]:
#determine dominant color for each team by grouping
def _most_frequent(values):
    """Return the value that occurs most often in a pandas Series."""
    #value_counts() sorts by count, highest first, so the first index label is the mode
    return values.value_counts().index[0]

#collapse to one row per logo path, then turn the 'Address' index back into a column
clr_DF = clr_DF.groupby(['Address']).agg(_most_frequent).reset_index()

In [14]:
#create DF with all necessary color data and save to local drive
#left join keeps every school, even one with no color row
Full_Clr_DB = Src_DB.merge(clr_DF, how='left', on='Address')
#NOTE(review): name_dict is not defined in the cells shown here - presumably a
#Sports Reference -> ESPN school-name mapping created elsewhere; confirm before running
espn_names = Full_Clr_DB['SCHOOL_(SREF)'].map(name_dict)
Full_Clr_DB['SCHOOL_(ESPN)'] = espn_names
Full_Clr_DB.to_csv('Full_Clr_DB.csv')