<h1>Data Scraping and Simulation</h1>

Notes:
1. It may be better not to simulate DDs as their own category. Ideally the covariances account for that, but there will probably be a lot of noise.

In [None]:
import pickle
from bs4 import BeautifulSoup
import numpy as np
from scipy.stats import multivariate_normal
from operator import itemgetter

In [None]:
# Load the precomputed inputs for the simulation.
# - player_list_dict: dict with (at least) 'player_list' and 'attr_header' entries
# - means / covs: per-player statistics, looked up by player ID further down
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that were produced by a trusted source.
# Each file is opened in a `with` block so the handle is closed right after loading.
with open("data/player_list_dict.pickle", "rb") as pickle_file:
    player_list_dict = pickle.load(pickle_file)
with open("data/means_dict.pickle", "rb") as pickle_file:
    means = pickle.load(pickle_file)
with open("data/covs_dict.pickle", "rb") as pickle_file:
    covs = pickle.load(pickle_file)

In [None]:
# Read the saved ESPN rosters page into one string with all newline
# characters removed.
rosters_html = ""
with open('data/espn_rosters.htm', 'r') as roster_file:
    rosters_html = ''.join(roster_file.read().split('\n'))

In [None]:
# Build the player-name -> NBA stats player ID lookup in this cell so that it
# does not rely on a cell further down the notebook having been run first.
# (Previously a missing `mapping` raised NameError, which the bare `except`
# silently swallowed, leaving every roster_id list empty on a fresh kernel.)
mapping = {}
for player_record in player_list_dict['player_list']:
    # player_record[1] is used as the name key, player_record[0] as the ID.
    mapping[player_record[1].upper()] = player_record[0]

# Parse every team table on the rosters page into a dict with the keys
# 'name', 'roster' (player names) and 'roster_id' (NBA stats player IDs).
teams_list = []
soup = BeautifulSoup(rosters_html, 'html.parser')
table_list = soup.find_all("table", {"class": "playerTableTable tableBody"})
for table in table_list:
    team_data = {}
    # The header row of each table holds a link whose text is the team name.
    team_row = table.find("tr", {"class": "playerTableBgRowHead tableHead playertableSectionHeader"})
    team_name = team_row.find("a").contents[0]
    team_data['name'] = team_name
    table_player_list = table.find_all("td", {"class": "playertablePlayerName"})
    team_roster = []
    team_roster_ids = []
    for player in table_player_list:
        player_name = player.find("a").contents[0]
        team_roster.append(player_name)
        try:
            team_roster_ids.append(mapping[player_name.upper()])
        except KeyError:
            # For now, ignore players that have not played all season:
            # they have no entry in the lookup, so just report the name.
            print(player_name)
    team_data['roster'] = team_roster
    team_data['roster_id'] = team_roster_ids
    teams_list.append(team_data)

teams_list

In [None]:
# Create mapping from player name to NBA stats player ID.
# Keys are upper-cased names so lookups can be done case-insensitively.
mapping = {}
player_list = player_list_dict['player_list']
# Show the attribute header and, below, the first record for a quick sanity check.
print(player_list_dict['attr_header'])
mapping.update((entry[1].upper(), entry[0]) for entry in player_list)
print(player_list[0])


In [None]:
def get_cumulative_dist(team):
    """
    Combine the per-player distributions of a team into a single one.

    Every ID in ``team['roster_id']`` is looked up in the module-level
    ``means`` and ``covs`` tables and the entries are added together.
    Summing the covariances treats all players as independent.

    Returns a ``(mean, covariance)`` tuple: a length-12 vector and a
    12x12 matrix.
    """
    mean_acc, cov_acc = np.zeros(12), np.zeros((12, 12))

    for pid in team['roster_id']:
        mean_acc += means[pid]
        cov_acc += covs[pid]

    return mean_acc, cov_acc

def _category_result(value1, value2):
    """Score one category: +1 if value1 is larger, 0 if equal, -1 otherwise."""
    if value1 > value2:
        return 1
    if value1 == value2:
        return 0
    return -1


def _shooting_pct(made, attempted):
    """Return made / attempted, or 0.0 when there are no attempts (avoids 0/0 -> NaN)."""
    if attempted == 0:
        return 0.0
    return float(made) / attempted


def get_outcome_prob(team1, team2, num_samples=1000):
    """
    Estimate by simulation how a team1 vs. team2 matchup turns out.

    Each team's combined stat line is modelled as a multivariate normal
    (see get_cumulative_dist). For every simulated matchup the teams are
    compared category by category; team1 wins the matchup when it takes more
    categories than it loses, and ties when those counts are equal.

    Parameters:
        team1, team2: team dictionaries accepted by get_cumulative_dist.
        num_samples: number of simulated matchups (must be at least 1).

    Returns:
        (win_fraction, tie_fraction): the fraction of simulated matchups that
        team1 won and tied, respectively. An expected win value is
        ``win_fraction + .5 * tie_fraction``.

    Raises:
        ValueError: if num_samples is smaller than 1.

    No seed is set here. scipy draws from NumPy's global random state when no
    random_state is given, so call np.random.seed(...) beforehand if the
    result needs to be reproducible.
    """
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    mean1, cov1 = get_cumulative_dist(team1)
    mean2, cov2 = get_cumulative_dist(team2)

    # allow_singular lets degenerate (e.g. rank-deficient) covariances through.
    rv1 = multivariate_normal(mean1, cov1, allow_singular=True)
    rv2 = multivariate_normal(mean2, cov2, allow_singular=True)

    # Column indices into a sampled stat vector; these do not change per sample.
    normal_stats = (0, 3, 6, 7, 8, 9, 11)  # more is better
    inverse_stats = (10,)  # more is worse
    # Columns 1/2 and 4/5 are combined into field-goal and free-throw
    # percentages (presumably made/attempted -- verify against the stats layout).
    fg_made, fg_att = 1, 2
    ft_made, ft_att = 4, 5

    total_won = 0
    total_tied = 0
    for _ in range(num_samples):
        sample1 = rv1.rvs()
        sample2 = rv2.rvs()

        # Net category score for team1: categories won minus categories lost.
        net = 0
        for stat_index in normal_stats:
            net += _category_result(sample1[stat_index], sample2[stat_index])
        for stat_index in inverse_stats:
            # Arguments swapped: the smaller value wins this category.
            net += _category_result(sample2[stat_index], sample1[stat_index])

        net += _category_result(
            _shooting_pct(sample1[fg_made], sample1[fg_att]),
            _shooting_pct(sample2[fg_made], sample2[fg_att]))
        net += _category_result(
            _shooting_pct(sample1[ft_made], sample1[ft_att]),
            _shooting_pct(sample2[ft_made], sample2[ft_att]))

        if net > 0:
            total_won += 1
        elif net == 0:
            total_tied += 1

    return float(total_won) / num_samples, float(total_tied) / num_samples

def get_record(team, other_teams):
    """
    Return the expected record of `team` against every team in `other_teams`.

    Each matchup contributes its win probability plus half its tie
    probability (wins are +1, ties are +.5), as estimated by get_outcome_prob.
    """
    expected_points = 0.
    for opponent in other_teams:
        p_win, p_tie = get_outcome_prob(team, opponent)
        expected_points = expected_points + (p_win + .5 * p_tie)
    return expected_points

def get_standings(teams_list):
    """
    Rank all teams by their expected record against the rest of the league.

    Returns a list of (team name, expected record) tuples, best record first.
    """
    results = []
    for idx, team in enumerate(teams_list):
        # Everyone except the team currently being scored.
        opponents = [other for pos, other in enumerate(teams_list) if pos != idx]
        expected = get_record(team, opponents)
        results.append((team['name'], expected))
    results.sort(key=lambda entry: entry[1], reverse=True)
    return results
    
# Simulate every pairing and show the teams ordered by expected record.
print(get_standings(teams_list))

In [None]:
# Collect the team names in the same order as teams_list and display them.
names_list = []
for team in teams_list:
    names_list.append(team['name'])
names_list

<h1>Trade Simulator</h1>