# WebScraper

## DISCLAIMER

This file is deprecated; its functionality now lives in `util.py`.

In [2]:
import requests
import re
import typing
import datetime
from bs4 import BeautifulSoup
from dataclasses import dataclass
from enum import Enum, auto

In [201]:
# Base URL of the CBS Sports college-basketball gametracker; the page kind
# (recap / boxscore / playbyplay) and the game id are appended to it.
HOME = "https://www.cbssports.com/college-basketball/gametracker"

@dataclass
class Team:
    """
    A team, described only by its name.
    """
    name: str

class Page:
    """
    Lazily fetched web page with a cached BeautifulSoup parse.

    Nothing is downloaded when the object is created. The HTTP request is
    issued on the first access to ``page`` and the HTML is parsed on the
    first access to ``soup``; both results are cached on the instance.
    """

    def __init__(self, url: str, timeout: float = 30.0):
        """
        Args:
            url: Address requested on the first access to ``page``.
            timeout: Seconds to wait for the server before ``requests``
                raises a timeout error instead of blocking indefinitely.
        """
        self._url: str = url
        self._timeout: float = timeout
        self._page = None
        self._soup = None

    @property
    def page(self):
        """
        Response for the page URL, fetched once and reused afterwards.
        """
        if self._page is None:
            # requests applies no timeout unless one is passed explicitly,
            # so a stalled server would otherwise hang this call forever.
            self._page = requests.get(self._url, timeout=self._timeout)
        return self._page

    @property
    def soup(self):
        """
        BeautifulSoup tree of the response body, parsed once and reused.
        """
        if self._soup is None:
            self._soup = BeautifulSoup(self.page.content, "html.parser")
        return self._soup

class GamePage(Page):
    """
    Wrapper for a page describing a past game played.

    A game id has the form ``NCAAB_<8 digits>_<AWAY>@<HOME>``. The inherited
    ``page``/``soup`` properties serve the recap page; the box score and
    play-by-play pages are fetched lazily through ``box_score`` and
    ``plays`` and cached on the instance.
    """
    class Category(Enum):
        RECAP = auto()
        BOX = auto()
        PLAYS = auto()

    # URL path segment for each page category.
    _URL_SEGMENTS = {
        Category.RECAP: "recap",
        Category.BOX: "boxscore",
        Category.PLAYS: "playbyplay",
    }

    # Seconds to wait for the server when fetching a category page;
    # requests would otherwise block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 30.0

    def __init__(self, gid: str):
        """
        Args:
            gid: Game id, e.g. ``NCAAB_20221107_RADFRD@UNC``.
        """
        assert(re.match(r"NCAAB_\d{8}_[A-Z]+@[A-Z]+", gid))
        self._gid: str = gid
        self._box = None
        self._plays = None
        self._home = None
        self._away = None
        super().__init__(f"{HOME}/recap/{self._gid}/")

    def __repr__(self):
        return f"GamePage(gid={self._gid})"

    def _get_url(self, category: Category = Category.PLAYS):
        """
        Build the URL of this game's page for ``category``.

        Returns None when ``category`` is not a known Category member.
        """
        dest = self._URL_SEGMENTS.get(category)
        if dest is None:
            return None
        return f"{HOME}/{dest}/{self._gid}"

    def _get_soup(self, category: Category = Category.PLAYS):
        """
        Download and parse this game's page for ``category``.

        Returns None when no URL exists for ``category``.
        """
        url = self._get_url(category=category)
        if url is None:
            return None
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.content, "html.parser")

    @property
    def box_score(self):
        """
        Parsed box score page, fetched on first access.
        """
        if self._box is None:
            self._box = self._get_soup(category=GamePage.Category.BOX)
        return self._box

    @property
    def plays(self):
        """
        Parsed play-by-play page, fetched on first access.
        """
        if self._plays is None:
            self._plays = self._get_soup(category=GamePage.Category.PLAYS)
        return self._plays

    @property
    def home(self):
        """
        Team code that follows the '@' in the game id.
        """
        if self._home is None:
            self._home = re.match(r"NCAAB_\d+_[A-Z]+@([A-Z]+)", self._gid).group(1)
        return self._home

    @property
    def away(self):
        """
        Team code that precedes the '@' in the game id.
        """
        if self._away is None:
            self._away = re.match(r"NCAAB_\d+_([A-Z]+)@[A-Z]+", self._gid).group(1)
        return self._away

"""TODO: date, venue"""
class Game:
    """
    A game identified by its gametracker id, with parsed play-by-play data.

    The play-by-play page is only downloaded and parsed on the first access
    to ``plays``.
    """
    # Groups, in order: clock time, optional points after a '+', description,
    # optional running score.
    PLAY_REGEX = r"\s*(\d+:\d+)\s+(?:\+(\d))?\s+((?:[A-Za-z0-9.'()]+ ?)+)\s+(\d+-\d+)?\s*"

    class Play:
        """
        One row of the play-by-play table.
        """
        def __init__(self, raw: list):
            """
            Args:
                raw: Five values in the order team, time, pts, desc, score.
            """
            self.team, self.time, self.pts, self.desc, self.score = raw

        def __repr__(self):
            return f"Play(team={self.team}, time={self.time}, score={self.score})"

    def __init__(self, gid: str):
        """
        Args:
            gid: Game id of the form ``NCAAB_<8 digits>_<AWAY>@<HOME>``.
        """
        assert(re.match(r"NCAAB_\d{8}_[A-Z]+@[A-Z]+", gid))
        self._gid = gid
        self._page = GamePage(gid)
        self._plays = None

    def __repr__(self):
        return f"Game(gid={self._gid})"

    @property
    def plays(self):
        """
        Parsed plays, one list per play-by-play table; computed once.
        """
        if self._plays is None:
            self._plays = self._parse_plays()
        return self._plays

    @property
    def home(self):
        """
        Home team code, taken from the underlying page.
        """
        return self._page.home

    @property
    def away(self):
        """
        Away team code, taken from the underlying page.
        """
        return self._page.away

    @property
    def date(self):
        """
        Date encoded as YYYYMMDD in the game id.

        Raises:
            ValueError: If the id carries no 8-digit date after ``NCAAB_``
                or the digits do not form a valid calendar date.
        """
        found = re.match(r"NCAAB_(\d{4})(\d{2})(\d{2})_", self.gid)
        if found is None:
            raise ValueError(f"no date in game id: {self.gid!r}")
        # groups() yields exactly three strings; datetime.date needs ints.
        year, month, day = (int(part) for part in found.groups())
        return datetime.date(year, month, day)

    @property
    def gid(self):
        """
        The game id this object was created with.
        """
        return self._gid

    def _parse_period(self, table, last_score: str):
        """
        Parse the rows of one play-by-play table.

        Args:
            table: Element whose ``tr`` rows are matched against PLAY_REGEX.
            last_score: Score assigned to rows that show none of their own.

        Returns:
            Tuple of (plays parsed from the table, latest score seen).
        """
        period = []
        for row in table.find_all('tr'):
            # A row counts as a home-team play when the home code appears
            # anywhere in its markup.
            # NOTE(review): a home code that is a substring of the away code
            # would be misattributed - confirm against real pages.
            is_home = self.home in str(row)
            found = re.match(Game.PLAY_REGEX, row.text)
            if not found:
                continue
            team = self.home if is_home else self.away
            play = Game.Play([team, *found.groups()])
            # pts stays the matched digit string when present, else int 0.
            play.pts = play.pts if play.pts is not None else 0
            if play.score is None:
                play.score = last_score
            else:
                last_score = play.score
            period.append(play)
        return period, last_score

    def _parse_plays(self):
        """
        Parse both play-by-play tables of the page.

        Returns:
            Two-element list: plays from the first ``TableBase`` div, then
            plays from the second (presumably the two halves - confirm).
            Rows without a score inherit the most recent one, starting at
            "0-0" and carried over from the first table into the second.
        """
        first, second = self._page.plays.find_all('div', {'class' : 'TableBase'})
        plays = []
        last_score = "0-0"
        for table in (first, second):
            period, last_score = self._parse_period(table, last_score)
            plays.append(period)
        return plays



In [202]:
# Schedule page for UNC; reading .soup triggers the download and the parse.
unc = Page("https://www.cbssports.com/college-basketball/teams/UNC/north-carolina-tar-heels/schedule/")
soup = unc.soup

In [203]:
# Collect one Game per recap link found in the schedule's game cells.
games = []

for cell in soup.find_all('div', {'class' : "CellGame"}):
    # href=True limits the search to anchors that actually carry an href,
    # so the attribute lookup below cannot raise KeyError.
    for link in cell.find_all('a', href=True):
        recap = re.match(r"/college-basketball/gametracker/recap/(.+)/", link['href'])
        if recap is None:
            continue
        # The captured path segment is the game id.
        games.append(Game(recap.group(1)))

In [204]:
# Show the parsed plays of the first collected game; the first access to
# .plays fetches and parses that game's play-by-play page.
games[0].plays

[[Play(team=UNC, time=20:00, score=0-0),
  Play(team=UNC, time=19:48, score=0-2),
  Play(team=RADFRD, time=19:36, score=3-2),
  Play(team=UNC, time=19:23, score=3-5),
  Play(team=RADFRD, time=19:05, score=3-5),
  Play(team=RADFRD, time=19:03, score=3-5),
  Play(team=RADFRD, time=18:54, score=6-5),
  Play(team=UNC, time=18:47, score=6-8),
  Play(team=RADFRD, time=18:16, score=6-8),
  Play(team=RADFRD, time=17:58, score=6-8),
  Play(team=UNC, time=17:53, score=6-8),
  Play(team=UNC, time=17:51, score=6-8),
  Play(team=UNC, time=17:47, score=6-8),
  Play(team=UNC, time=17:45, score=6-8),
  Play(team=UNC, time=17:44, score=6-10),
  Play(team=RADFRD, time=17:26, score=8-10),
  Play(team=UNC, time=17:13, score=8-13),
  Play(team=RADFRD, time=16:50, score=8-13),
  Play(team=UNC, time=16:48, score=8-13),
  Play(team=UNC, time=16:40, score=8-15),
  Play(team=RADFRD, time=16:22, score=10-15),
  Play(team=UNC, time=15:58, score=10-17),
  Play(team=RADFRD, time=15:58, score=10-17),
  Play(team=UNC