In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Planning

### What data do we have?
- Everyone's picks and weights for n weeks
- The winners of every game

### League average pick
- Calculate a league average pick: an "expected estimate" of what the rest of the league will pick
    - Expected that the league average pick is closer to 0 than the actual score
    - Always positive -> just in the direction of the most picked team
    - Ranges from 0 - 16 
    - Closer to 0: league is pretty split
    - Closer to 16: league favoring one team
    
### League average Score
- Calculate a league average score: The actual average score for each game in the league
    - Ranges from 0 - 16
    - This is just the average number of points that the league gained
    - 

In [24]:
def html_to_df(week):
    filepath = f"data/week{week}/picks{week}.html"
    # na, correct, incorrect
    # week, game, correct points, incorrect points
    data = []

    with open(filepath, "r") as file:
        contents = file.read()
        picks_length = 16 # need to change for other weeks
        soup = BeautifulSoup(contents)

        rows = soup.find_all("tr", attrs={"class": "bg2"})
        person = 0
        for row in rows:
            picks = row.find_all("td", attrs={"class": ["na", "correct", "incorrect"]})
            if len(picks) != picks_length:
                continue
            i = 0
            for pick in picks:
                pick_class = pick["class"][0]
                if pick_class == "na":
                    #data.append([week, i, person, 0, 0])
                    i += 1
                    continue

                pick_text = pick.text
                pts = pick_text[pick_text.index("(")+1 : pick_text.index(")")]
                
                if pick_class == "correct":
                    data.append([week, i, person, int(pts), 0])
                else:
                    data.append([week, i, person, 0, int(pts)])

                i += 1
            person += 1
    
    df = pd.DataFrame(data, columns=["week", "game", "person", "correct_pts", "incorrect_pts"])

    return df

def get_scores(week):
    filepath = f"data/week{week}/scores{week}.html"
    data =[]
    with open(filepath, "r") as file:
        contents = file.read()
        picks_length = 16
        soup = BeautifulSoup(contents)
        row = soup.find("tr")
        only_games = row.findChildren(recursive=False)
        print(len(only_games))
        games = only_games[1:1+picks_length]
        game_id = 0
        for game in games:
            score_entries = game.find_all("td")
            away_team = score_entries[0].text
            away_score = int(score_entries[1].text)
            home_team = score_entries[2].text
            home_score = int(score_entries[3].text)
            data.append([week, game_id, away_team, away_score, home_team, home_score])
            game_id += 1
    
    df = pd.DataFrame(data, columns=["week", "game_id", "away_team", "away_score", "home_team", "home_score"])

    return df

def get_statistics(df):
    
    df["average_pick"] = round((df["correct_pts"] - df["incorrect_pts"]) / df["count"], 2)
    df["average_score"] = round(df["correct_pts"] / df["count"], 2)
    

def raw_to_df(week):
    raw_data = html_to_df(week)
    grouped = raw_data[["game", "correct_pts", "incorrect_pts"]].groupby(["game"])
    res = grouped.sum()
    res["count"] = grouped.game.count()
    get_statistics(res)
    res = res.reset_index()
    res.loc[:, "week"] = week
    res = res.set_index(["week", "game"])
    return res

def send_to_csv(df, path):
    df.to_csv(path)
            

### Scores getting

In [25]:
scores1 = get_scores(1)
scores2 = get_scores(2)
scores3 = get_scores(3)
scores4 = get_scores(4)
final_scores = pd.concat([scores1, scores2, scores3, scores4])

scores1.head()

20


Unnamed: 0,week,game_id,away_team,away_score,home_team,home_score
0,1,0,BUF,31,LAR,10
1,1,1,BAL,24,NYJ,9
2,1,2,CLE,26,CAR,24
3,1,3,IND,20,HOU,20
4,1,4,JAC,22,WAS,28


In [6]:
week1 = raw_to_df(1)
week2 = raw_to_df(2)
week3 = raw_to_df(3)
week4 = raw_to_df(4)
final_df = pd.concat([week1, week2, week3, week4])
final_df.tail(10)



Unnamed: 0_level_0,Unnamed: 1_level_0,correct_pts,incorrect_pts,count,average_pick,average_score
week,game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4,6,317,17,30,10.0,10.57
4,7,28,255,30,-7.57,0.93
4,8,29,263,30,-7.8,0.97
4,9,49,111,30,-2.07,1.63
4,10,236,35,30,6.7,7.87
4,11,138,34,31,3.35,4.45
4,12,102,43,31,1.9,3.29
4,13,335,18,31,10.23,10.81
4,14,120,24,31,3.1,3.87
4,15,10,143,31,-4.29,0.32


In [7]:
corr = final_df["average_pick"].corr(final_df["average_score"])
print("Correlation: ", corr)

Correlation:  0.9450708448288927


In [18]:
week1_raw = html_to_df(1)
week2_raw = html_to_df(2)
week3_raw = html_to_df(3)
week4_raw = html_to_df(4)
raw_df = pd.concat([week1_raw, week2_raw, week3_raw, week4_raw])
raw_df["correct"] = raw_df["correct_pts"] > 0




In [None]:
send_to_csv(raw_df, "data/raw_data.csv")