## Importing base dataset with unique identifiers and machine learning targets 

In [1]:
import pandas as pd
# Read the base games table and discard its 'ID' column in one step,
# then print the schema summary (columns, non-null counts, dtypes).
df = pd.read_csv('baseGames.csv').drop(columns=['ID'])
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16979 entries, 0 to 16978
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GAME_DATE_EST   16979 non-null  object 
 1   GAME_ID         16979 non-null  int64  
 2   SEASON          16979 non-null  int64  
 3   HOME_TEAM_ID    16979 non-null  int64  
 4   HOME_W%         16979 non-null  float64
 5   AWAY_TEAM_ID    16979 non-null  int64  
 6   AWAY_W%         16979 non-null  float64
 7   HOME_TEAM_WINS  16979 non-null  int64  
 8   MARGIN          16979 non-null  float64
dtypes: float64(3), int64(5), object(1)
memory usage: 1.2+ MB


## Importing Vegas dataset with sportsbooks' lines
- Keeping game IDs, team IDs, Average Line Spread, and Spread (only Average Line Spread is retained after the join below)
- Average Line Spread is the average of five sportsbooks' lines (Pinnacle, Bovada, BetOnline, Heritage, and 5Dimes)

In [2]:
# One file per vegasNN.txt, listed in the order they are stacked (18 first, 12 last).
vegas_files = [
    'vegasData/vegas18.txt',
    'vegasData/vegas17.txt',
    'vegasData/vegas16.txt',
    'vegasData/vegas15.txt',
    'vegasData/vegas14.txt',
    'vegasData/vegas13.txt',
    'vegasData/vegas12.txt',
]
vegas_frames = [pd.read_csv(path) for path in vegas_files]

# Stack the files as-is: each frame keeps its own row labels, so index values
# repeat across files. Then narrow to the join keys and the two spread columns.
vegas = pd.concat(vegas_frames)[['TeamId', 'GameId', 'Average_Line_Spread', 'Spread']]
vegas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17208 entries, 0 to 2457
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   TeamId               17208 non-null  int64  
 1   GameId               17208 non-null  int64  
 2   Average_Line_Spread  17204 non-null  float64
 3   Spread               17208 non-null  float64
dtypes: float64(2), int64(2)
memory usage: 672.2 KB


## Joining datasets to add gambling lines to base dataset

In [3]:
# Attach the sportsbook line to every game by matching the game's
# (GAME_ID, HOME_TEAM_ID) against vegas' (GameId, TeamId).
# Left join: games with no matching vegas row are kept with a missing SPREAD.
# NOTE(review): this assumes vegas has at most one row per (GameId, TeamId);
# duplicate keys would repeat game rows in the result — confirm against the data.
df = (
    df.merge(
        vegas,
        how='left',
        left_on=['GAME_ID', 'HOME_TEAM_ID'],
        right_on=['GameId', 'TeamId'],
    )
    # The vegas-side keys duplicate the base keys, and the raw 'Spread' column is not kept.
    .drop(columns=['GameId', 'TeamId', 'Spread'])
    .rename(columns={'Average_Line_Spread': 'SPREAD'})
)

## Finalize and export base dataset + gambling point spreads

In [4]:
# Final column layout: SPREAD is slotted in right after AWAY_W%,
# ahead of HOME_TEAM_WINS and MARGIN.
column_order = [
    'GAME_DATE_EST',
    'GAME_ID',
    'SEASON',
    'HOME_TEAM_ID',
    'HOME_W%',
    'AWAY_TEAM_ID',
    'AWAY_W%',
    'SPREAD',
    'HOME_TEAM_WINS',
    'MARGIN',
]
df = df.loc[:, column_order]

# Write the combined table to disk. With pandas' defaults the row index is
# written as an extra leading column.
# NOTE(review): pass index=False if downstream readers do not expect that column — confirm.
df.to_csv('baseGamesAndOdds.csv')