## Database thoughts

The database should make these queries easy: <br>
(1) List of players competing in a game <br>
(2) List of games a player has played in <br>
(3) List of games within a date range <br>
(4) All stats for each game <br>
(5) Sum of a particular player's stats across their games within a given date range <br>

Tables <br>
(1) GAME_OVERALL (created under the table name `GAME_STATS` by the code below)<br>
(2) SKATER_GAME<br>
(3) GOALIE_GAME<br>
(4) PLAYER<br>
(5) TEAM

## Database building functions

In [77]:
import sys
import pandas as pd
import sqlalchemy
import glob
from IPython.core import display as ICD
import datetime

### Constants ###
# Location of the scraped CSV files.
DIR_GAMES = "./data/Games/"
FILE_GAMES = "{}2015Games.csv".format(DIR_GAMES)

# Pieces used to assemble per-game player-stats file names.
PLAYER_STATS_FILE = "PlayerStats"
HOME = "Home"
AWAY = "Away"

# SQLite database file and the names of its tables.
DB_NAME = "test3.db"
TABLE_GAMEOVERALL = "GAME_STATS"
TABLE_PLAYERGAME = "SKATER_GAME"
TABLE_GOALIEGAME = "GOALIE_GAME"
TABLE_PLAYER = "PLAYER"
TABLE_TEAM = "TEAM"

def get_playerstats_files(gamename):
    """Return the paths of the two player-stats CSV files for a game.

    Args:
        gamename: Game identifier used as the file-name prefix.

    Returns:
        A ``(home_path, away_path)`` tuple. Each path has the form
        ``DIR_GAMES + gamename + '_' + PLAYER_STATS_FILE + <HOME|AWAY> + '.csv'``.
    """
    # The original referenced an undefined GAMESDIR; the games directory
    # constant in this file is DIR_GAMES.
    prefix = DIR_GAMES + gamename + '_' + PLAYER_STATS_FILE
    team1 = prefix + HOME + '.csv'
    team2 = prefix + AWAY + '.csv'
    return team1, team2

def gamefile_to_gamename(gamefile):
    """Derive a game name from a game file path.

    The path is converted to a string, then every occurrence of ``.csv``
    and of the ``./Games\\`` directory prefix is removed.
    """
    name = str(gamefile)
    for fragment in ('.csv', "./Games\\"):
        name = name.replace(fragment, '')
    return name

def append_columns(game, team1, team2):
    """Add 'GameName' (and a 'Result') column to game/team tables.

    NOTE(review): this function looks unfinished. None of its parameters
    (game, team1, team2) are used, and the names it does use -- d_game,
    d_team1, d_team2, gamename and TEAM1 -- are not defined in the visible
    code. Unless they exist as globals elsewhere, calling it raises
    NameError. Nothing is returned.
    """
    # presumably d_game/d_team1/d_team2 were meant to be the parameters
    # game/team1/team2 -- TODO confirm before wiring this up
    d_game['GameName'] = gamename
    d_team1['GameName'] = gamename
    d_team2['GameName'] = gamename
    # NOTE(review): TEAM1 is not among the constants visible in this file
    d_team1['Result'] = TEAM1
    
def get_gamename(game):
    """Build the game name ``2015_<key>`` from an indexable game record.

    ``game[0]`` is used as the key; for a ``DataFrame.iterrows()`` item
    that is the row's index label.
    """
    season = str(2015)
    key = str(game[0])
    return "_".join((season, key))

def build_database(db_name, csv_games):
    """Load the games CSV and (re)create the games table in the database.

    Steps:
      1. Read ``csv_games`` into a DataFrame.
      2. Add ``DateTimestamp``: the second CSV column parsed as
         ``%Y-%m-%d`` and converted to an integer POSIX timestamp. The
         parsed datetime is naive, so ``timestamp()`` interprets it in the
         machine's local time zone.
      3. Add ``GameName`` via ``get_gamename`` (``2015_<row index>``).
      4. Rename the ``Unnamed: 6`` column to ``OT``, drop ``Notes``, and
         turn ``OT`` into a boolean that is True where the cell is "OT".
      5. Write the result to ``TABLE_GAMEOVERALL``, replacing any
         existing table.

    Args:
        db_name: SQLite database file passed to ``get_engine``.
        csv_games: Path of the games CSV to load.

    Raises:
        ValueError: if a value in the date column does not match
            ``%Y-%m-%d``.
        KeyError: if the ``Notes`` column is missing.
    """
    engine = get_engine(db_name)
    d_games = pd.read_csv(csv_games)

    # Select the date column by position explicitly; integer keys on a
    # string-labelled Series (the old ``game[1][1]``) are deprecated.
    date_strings = d_games.iloc[:, 1]
    d_games['DateTimestamp'] = [
        int(datetime.datetime.strptime(date_str, '%Y-%m-%d').timestamp())
        for date_str in date_strings
    ]
    d_games['GameName'] = [get_gamename(game) for game in d_games.iterrows()]

    # index=str keeps the original behaviour of storing the row index as text.
    d_games = d_games.rename(index=str, columns={"Unnamed: 6": "OT"})
    # Keyword form: the positional axis argument (``drop('Notes', 1)``)
    # is no longer accepted by pandas 2.x.
    d_games = d_games.drop(columns='Notes')
    d_games['OT'] = d_games['OT'] == "OT"
    d_games.to_sql(TABLE_GAMEOVERALL, engine, if_exists='replace')

## Database lookup functions

In [55]:
def get_engine(db_name):
    """Create a SQLAlchemy engine for the SQLite database file *db_name*."""
    sqlite_url = 'sqlite:///' + db_name
    return sqlalchemy.create_engine(sqlite_url)

def view_database(db_name, table):
    """Read the whole *table* from database *db_name* and display it."""
    contents = pd.read_sql_table(table, get_engine(db_name))
    ICD.display(contents)

def games_in_daterange(date1, date2, db_name=None):
    """Return the games whose DateTimestamp lies in ``[date1, date2]``.

    Args:
        date1: Start of the range (inclusive); any object with a
            ``timestamp()`` method, e.g. ``datetime.datetime``.
        date2: End of the range (inclusive); same type as ``date1``.
        db_name: Database file to query. Defaults to ``DB_NAME``.

    Returns:
        A DataFrame with every column of ``TABLE_GAMEOVERALL`` for the
        matching rows.
    """
    start = int(date1.timestamp())
    end = int(date2.timestamp())
    # The original used an undefined DATABASE_LOC and built its own engine;
    # go through get_engine with the database constant defined in this file.
    engine = get_engine(DB_NAME if db_name is None else db_name)
    # Both bounds were passed through int() above, so formatting them into
    # the statement cannot inject SQL.
    sql_query = (
        'SELECT * from {table} WHERE DateTimestamp >= {start}'
        ' AND DateTimestamp <= {end}'
    ).format(table=TABLE_GAMEOVERALL, start=start, end=end)
    return pd.read_sql_query(sql_query, engine)

## Web scraping functions

In [118]:
import urllib.request
import pandas as pd
from bs4 import BeautifulSoup
import csv

# Page passed to build_games_csv; presumably the 2015-16 NHL season schedule
# (the site labels it NHL_2016) -- TODO confirm.
URL_Games_20152016 = "http://www.hockey-reference.com/leagues/NHL_2016_games.html"

def build_games_csv(url):
    """Scrape the first table on the page at *url* into the games CSV.

    The header row comes from the first ``<tr>`` of the page's first
    ``<thead>``; the data rows come from every ``<tr>`` of the first
    ``<tbody>``. A leading ``URL`` column is added, holding the absolute
    form of the link found in each row's first ``<th>`` cell. The result
    is written to ``FILE_GAMES``, overwriting any existing file.

    Args:
        url: Address of the page to scrape.

    Raises:
        AttributeError / TypeError: if the page has no ``<thead>`` or
            ``<tbody>``, or a body row has no ``<th>`` containing a link.
    """
    soup = BeautifulSoup(urllib.request.urlopen(url).read(), "lxml")

    tablehead = soup.find('thead')
    tablebody = soup.find('tbody')

    headers = [header.text for header in tablehead.find('tr').find_all('th')]
    headers.insert(0, "URL")
    rows = []

    for row in tablebody.find_all('tr'):
        cells = [cell.text for cell in row.find_all(['th', 'td'])]
        # Use a separate name so the ``url`` parameter is not overwritten.
        game_url = 'http://www.hockey-reference.com' + row.find('th').find('a')['href']
        cells.insert(0, game_url)
        # Pad short rows so every record has one value per header (the
        # dict-based writer used previously filled missing fields with '').
        cells.extend([''] * (len(headers) - len(cells)))
        rows.append(cells)

    # Fixes relative to the previous version:
    #  * the target was an undefined GAMESFILE; the constant is FILE_GAMES;
    #  * newline='' is what the csv module requires to avoid blank lines
    #    between rows on Windows;
    #  * explicit UTF-8 matches the default encoding of pandas.read_csv;
    #  * rows are written positionally -- building a dict keyed by header
    #    text made columns that share a label overwrite each other.
    with open(FILE_GAMES, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(headers)
        writer.writerows(rows)


    
def get_playergame_csvname(gamename, home, skaters):
    """Return the CSV file name for one team's player table of a game.

    NOTE(review): unfinished -- this function does not parse yet. The
    ``home`` branch returns ``game``, which is not defined in the visible
    code (presumably ``gamename`` was meant), the final expression ends on
    a dangling ``+``, and ``skaters`` is never used. The intended naming
    scheme needs to be decided before this can be completed.
    """
    if(home):
        # NOTE(review): `game` is undefined here -- TODO confirm intended value
        return game
    # NOTE(review): incomplete expression (SyntaxError); the suffix is missing
    return gamename + "_" + 
            
def build_playergame_csv(gamename, url):
    """Download a box-score page and print four of its tables.

    The tables at positions 2-5 of the page are printed in order; the
    original variable names labelled them away skaters, away goalies,
    home skaters and home goalies. ``gamename`` is accepted but not used
    yet, and no CSV is written so far.
    """
    page = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(page, "lxml")
    # Alternative selector that was tried earlier:
    #     soup.find_all('div', {"class":"table_wrapper"})
    all_tables = soup.find_all('table')
    # Look all four up before printing so a missing table raises
    # IndexError before any output is produced.
    selected = [all_tables[position] for position in (2, 3, 4, 5)]
    for table in selected:
        print(table)

## Temporary

In [119]:
# Earlier pipeline steps, kept commented out; uncomment to re-run them.
# NOTE(review): no get_games_csv is defined in the visible code --
# build_games_csv looks like the intended function; confirm before enabling.
# get_games_csv(URL_Games_20152016)
# build_database(DB_NAME, FILE_GAMES)
# view_database(DB_NAME, TABLE_GAMEOVERALL)

# Try the box-score scraper on a single game page.
build_playergame_csv("Game1", "http://www.hockey-reference.com/boxscores/201510070CGY.html")

<table class="sortable stats_table" data-cols-to-freeze="2" id="CGY_goalies"><caption>Goalies Table</caption>
<colgroup><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col><col></col></colgroup>
<thead>
<tr class="over_header">
<th aria-label="" class=" over_header " colspan="3" data-stat=""></th>
<th aria-label="" class=" over_header center" colspan="5" data-stat="header_goalies">Goalie Stats</th><th></th><th></th>
</tr>
<tr>
<th aria-label="Rank" class="ranker tooltip sort_default_asc show_partial_when_sorting center" data-stat="ranker" data-tip="Rank" scope="col">Rk</th>
<th aria-label="Player" class=" tooltip sort_default_asc left" data-stat="player" scope="col">Player</th>
<th aria-label="Decision" class=" tooltip sort_default_asc center" data-stat="decision" data-tip="Decision" scope="col">DEC</th>
<th aria-label="Goals Against" class=" tooltip center" data-over-header="Goalie Stats" data-stat="goals_against" data-tip="Goals Against"