In [3]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import json

In [4]:
# Read both JSON sources and join them. No key is passed to merge, so pandas
# joins on whichever column names the two frames have in common.
games = pd.read_json("../data/games_2021.json")
teams = pd.read_json('../data/big_10_teams_2021.json')
df = pd.merge(teams, games)

# (value of 'homeAway', column for the team name, prefix for that side's stats)
SIDES = (
    ('home', 'home_team', 'home_'),
    ('away', 'away_team', 'away_'),
)

# Flatten each row's nested 'teams' list into flat home_*/away_* columns.
for row_label, game in df.iterrows():
    for entry in game['teams']:
        for side, team_col, stat_prefix in SIDES:
            if entry['homeAway'] != side:
                continue
            # Team name for this side of the matchup
            df.loc[row_label, team_col] = entry['school']
            # One column per stat category, e.g. home_totalYards
            for stat in entry['stats']:
                df.loc[row_label, stat_prefix + stat['category']] = stat['stat']

# Move the home and away team columns to positions 1 and 2,
# directly after the first column.
df.insert(1, 'home_team', df.pop('home_team'))
df.insert(2, 'away_team', df.pop('away_team'))

# The nested 'teams' column has been flattened above, so drop it.
df = df.drop(columns='teams')

# Preview the first row, leaving out the fields that are empty for it.
df.iloc[0].dropna()

id                      401331447
home_team                    Iowa
away_team                Michigan
season                       2021
week                           14
                          ...    
home_netPassingYards          175
home_totalYards               279
home_fourthDownEff            0-1
home_thirdDownEff            5-19
home_firstDowns                15
Name: 0, Length: 93, dtype: object

In [5]:
# Row filters:
#   * both teams play in the Big Ten,
#   * both scores are present,
#   * home_interceptionYards is present (presumably used as a proxy for
#     "box-score stats were recorded for this game" -- TODO confirm).
is_big_ten_game = (df.home_conference == 'Big Ten') & (df.away_conference == 'Big Ten')
has_score = df["home_points"].notna() & df["away_points"].notna()
has_home_interception_yards = df["home_interceptionYards"].notna()

# Take an explicit copy of the filtered rows: assigning new columns to a
# boolean-indexed slice triggers pandas' SettingWithCopyWarning, both for
# home_win below and for columns added to df afterwards.
df = df[is_big_ten_game & has_score & has_home_interception_yards].copy()

# Target label: True if home_points is greater than away_points,
# False otherwise (a tie would also be False).
df["home_win"] = df["home_points"] > df["away_points"]

In [6]:
# Parse the advanced-stats JSON file. The context manager closes the file
# handle as soon as parsing is done instead of leaving it open for the
# lifetime of the kernel.
with open("../data/2021_stats_advanced.json") as f:
    advanced_stats = json.load(f)

In [7]:
# Lookup tables keyed by team name, one per offensive stat taken from
# advanced_stats.
team_to_plays = {}
team_to_line_yards = {}
team_to_points_per_opportunity = {}

# The loop variable is deliberately not called `dict`: rebinding that name
# would shadow the builtin for every cell that runs afterwards.
for team_stats in advanced_stats:
    team_name = team_stats["team"]
    offense = team_stats["offense"]
    team_to_plays[team_name] = offense["plays"]
    team_to_line_yards[team_name] = offense["lineYards"]
    team_to_points_per_opportunity[team_name] = offense["pointsPerOpportunity"]

In [34]:
# (new feature column, column holding the team name, per-team lookup table)
# Listed in the order the columns should be appended to df.
feature_sources = [
    ('home_plays', 'home_team', team_to_plays),
    ('away_plays', 'away_team', team_to_plays),
    ('home_line_yards', 'home_team', team_to_line_yards),
    ('away_line_yards', 'away_team', team_to_line_yards),
    ('home_points_per_opportunity', 'home_team', team_to_points_per_opportunity),
    ('away_points_per_opportunity', 'away_team', team_to_points_per_opportunity),
]

# Attach each team's value from the lookup tables to every game row.
# Teams missing from a lookup table end up as NaN.
for feature_col, team_col, lookup in feature_sources:
    df[feature_col] = df[team_col].map(lookup)

df

Unnamed: 0,id,home_team,away_team,season,week,season_type,start_date,start_time_tbd,neutral_site,conference_game,...,away_kickReturnYards,away_kickReturnTDs,away_kickReturns,home_win,home_plays,away_plays,home_line_yards,away_line_yards,home_points_per_opportunity,away_points_per_opportunity
0,401331447,Iowa,Michigan,2021,14,regular,2021-12-05T01:00:00.000Z,False,True,True,...,,,,False,895,948,2.633333,3.211636,3.0,4.153846
10,401282731,Indiana,Michigan State,2021,7,regular,2021-10-16T16:00:00.000Z,False,False,True,...,32.0,0.0,3.0,False,817,855,2.71275,2.952164,3.442308,3.777778
14,401282818,Penn State,Rutgers,2021,12,regular,2021-11-20T17:00:00.000Z,False,False,True,...,,,,True,964,887,2.700708,3.120502,3.454545,3.409091
16,401282721,Iowa,Illinois,2021,12,regular,2021-11-20T19:00:00.000Z,False,False,True,...,0.0,0.0,3.0,True,895,853,2.633333,3.354694,3.0,3.15873
19,401282803,Wisconsin,Nebraska,2021,12,regular,2021-11-20T20:30:00.000Z,False,False,True,...,40.0,0.0,3.0,True,851,825,3.212754,3.137255,3.071429,3.972603
21,401282717,Penn State,Illinois,2021,8,regular,2021-10-23T16:00:00.000Z,False,False,True,...,,,,False,964,853,2.700708,3.354694,3.454545,3.15873
22,401282824,Rutgers,Wisconsin,2021,10,regular,2021-11-06T19:30:00.000Z,False,False,True,...,0.0,0.0,1.0,False,887,851,3.120502,3.212754,3.409091,3.071429
25,401282787,Purdue,Michigan State,2021,10,regular,2021-11-06T19:30:00.000Z,False,False,True,...,21.0,0.0,4.0,True,947,855,2.793372,2.952164,3.925,3.777778
26,401282788,Rutgers,Michigan State,2021,6,regular,2021-10-09T16:00:00.000Z,False,False,True,...,10.0,0.0,1.0,False,887,855,3.120502,2.952164,3.409091,3.777778
27,401282779,Michigan,Northwestern,2021,8,regular,2021-10-23T16:00:00.000Z,False,False,True,...,46.0,0.0,3.0,True,948,802,3.211636,2.861667,4.153846,2.965517


In [33]:
# Feature columns fed to the model, in the order the classifier is trained on.
columns = ["home_plays", "away_plays", "home_line_yards", "away_line_yards", "home_points_per_opportunity", "away_points_per_opportunity"]

X = df[columns]
y = df["home_win"]

# Fixed seed: without it the train/test split is re-drawn on every run, so
# the reported test score is not reproducible.
RANDOM_STATE = 42

# Hold out 25% of the games for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=RANDOM_STATE
)

# Linear-kernel support vector classifier predicting home_win.
clf = SVC(kernel='linear', C=1.0)
clf.fit(X_train, y_train)

# Mean accuracy on the held-out games.
score_test = clf.score(X_test, y_test)
score_test

0.42857142857142855

In [25]:
# Matchup to predict.
home = "Rutgers"
away = "Michigan"

# Feature values for the matchup, in the same order as `columns`
# (home/away plays, home/away line yards, home/away points per opportunity).
todays_game = [team_to_plays[home], team_to_plays[away],
               team_to_line_yards[home], team_to_line_yards[away],
               team_to_points_per_opportunity[home], team_to_points_per_opportunity[away]]

# clf was fitted on a DataFrame with named columns; passing a bare nested list
# makes scikit-learn warn about missing feature names and gives no protection
# against a column-order mix-up. Wrap the row in a DataFrame labelled with the
# training columns instead.
todays_features = pd.DataFrame([todays_game], columns=columns)

# True means the model predicts a home win.
clf.predict(todays_features)



array([ True])