In [35]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup as soup
from urllib.request import Request, urlopen
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import re
import time
import asyncio
from io import StringIO

In [36]:
START_YEAR = 2007
END_YEAR=2025
LOOKBACK_PERIOD = range(START_YEAR, END_YEAR+1)
LOOKBACK_PERIOD=list(LOOKBACK_PERIOD)
if 2020 in LOOKBACK_PERIOD:
    LOOKBACK_PERIOD.remove(2020)
print(LOOKBACK_PERIOD)

[2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2021, 2022, 2023, 2024, 2025]


In [37]:
base = "https://www.teamrankings.com/ncaa-basketball/"
features = {"Last 10 Rating":"ranking/last-10-games-by-other",
            "Strength of Schedule":"ranking/schedule-strength-by-other",
#             "Offensive Efficiency":"stat/offensive-efficiency",
#             "Floor %":"stat/floor-percentage", 
            "Shooting %":"stat/shooting-pct",
            "Effective Field Goal %":"stat/effective-field-goal-pct",
            "Three Point %":"stat/three-point-pct",
            "Two Point %":"stat/two-point-pct",
            "Free Throw %":"stat/free-throw-pct",
            "FT Attempted per FG Attempted": "stat/fta-per-fga",
            "Turnovers Per Possession":"stat/turnovers-per-possession",
            "True Shooting %":"stat/true-shooting-percentage",
            "Offensive Rebounding %":"stat/offensive-rebounding-pct",
            "Defensive Rebounding %":"stat/defensive-rebounding-pct",
            "Total Rebounding %":"stat/total-rebounding-percentage",
            "Block %":"stat/block-pct", 
            "Steal %":"stat/steal-pct",
            "Assist/Turnover Ratio":"stat/assist--per--turnover-ratio",
            "Defensive Efficiency":"stat/defensive-efficiency",
            "Effective Possesion Ratio":"stat/effective-possession-ratio",
            "Win %":"stat/win-pct-all-games",
            "Win % Close Games":"stat/win-pct-close-games",
            "Possessions Per Game":"stat/possessions-per-game",
            }


In [38]:
def remove_scores(row):
    x = re.sub("\((\d+\-\d+)\)", "", row)
    x = x.rstrip()
    return x

In [39]:
def normalize(df, range):
    min_max_scaler=preprocessing.MinMaxScaler(feature_range=range)
    df = min_max_scaler.fit_transform(df.values)
    return df

In [40]:
def get_stats(years):
    full_data = pd.DataFrame()

    for i in years:
        print("Entering Year: ", i)
        stats=pd.DataFrame()
        suffix = "?date={}-03-01".format(i)
        for key, value in features.items():
            table=pd.DataFrame()
            url = base+value+suffix
            # print('Heading to url: ', url)
            req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
            webpage = urlopen(req).read()
            html = soup(webpage, "html.parser")
            table = pd.read_html(StringIO(str(html)))
            table = table[0].iloc[:, 1:3]
            table = table.set_axis(["Team", key], axis=1)
            if table[key].dtype!='float64':
                table[key] = table[key].replace('--', np.nan)
                table[key] = table[key].str.rstrip('%').astype(np.float64)
            if key=='Last 10 Rating' or key=='Strength of Schedule':
                table['Team'] = table['Team'].apply(remove_scores)
            if key!='Team' or key!='Year':    
                table[key] = normalize(table[[key]], range=(0, 1))
                # print(table)
                # table[key] = table[key]*table['Strength of Schedule']*table['Last 10 Rating']
                # print(table)
            if key=='Defensive Efficiency':
                table[key]=table[key]*-1
            # table=table.drop(['Last 10 Rating', 'Strength of Schedule'], axis=1)
            if len(stats)==0:
                stats = table
            else:
                # print("entered else statement")
                stats = stats.merge(table, how='inner', on='Team')
        stats['Year'] = i
        full_data = pd.concat([full_data, stats], ignore_index=True)
    return full_data

In [41]:

x = get_stats(LOOKBACK_PERIOD)

# key_list = ['Year', 'Team']
# drop_list = [w for w in x.columns if w not in key_list]
# print(x[drop_list])
# test = x[drop_list].multiply(x['Strength of Schedule'], axis=0).multiply(x['Last 10 Rating'], axis=0)
# test[key_list] = x[key_list]
# test = test.drop(['Strength of Schedule', 'Last 10 Rating'], axis=1)
# first_column = test.pop('Team')
  
# insert column using insert(position,column_name,first_column) function
# print(test)
# test.insert(0, 'Team', first_column)
# print(test)

Entering Year:  2007
Entering Year:  2008
Entering Year:  2009
Entering Year:  2010
Entering Year:  2011
Entering Year:  2012
Entering Year:  2013
Entering Year:  2014
Entering Year:  2015
Entering Year:  2016
Entering Year:  2017
Entering Year:  2018
Entering Year:  2019
Entering Year:  2021
Entering Year:  2022
Entering Year:  2023
Entering Year:  2024
Entering Year:  2025


In [42]:
print(x)

                Team  Last 10 Rating  Strength of Schedule  Shooting %  \
0            Florida        1.000000              0.931298    1.000000   
1     North Carolina        0.941767              1.000000    0.875000   
2            Arizona        0.887550              0.912214    0.778409   
3            Ohio St        0.881526              0.908397    0.727273   
4               Duke        0.875502              0.946565    0.687500   
...              ...             ...                   ...         ...   
6301       Coppin St        0.224000              0.270968    0.068702   
6302     Maryland ES        0.220800              0.167742    0.167939   
6303    Prairie View        0.193600              0.209677    0.450382   
6304   AR-Pine Bluff        0.192000              0.109677    0.511450   
6305  Miss Valley St        0.000000              0.000000    0.000000   

      Effective Field Goal %  Three Point %  Two Point %  Free Throw %  \
0                   1.000000       0.

In [43]:
# norm_features = ["Floor %","Shooting %", "Effective Field Goal %", "Three Point %", "Two Point %", "Free Throw %", "True Shooting %", "Offensive Rebounding %","Defensive Rebounding %","Total Rebounding %", "Block %"]

# for item in norm_features:
#     x[item] = x[item] / 100

In [44]:
# x["Net Efficiency Margin"] = x["Offensive Efficiency"] - x["Defensive Efficiency"]

In [45]:
x.to_csv("../data/{}-{}_MBB_Historical-Data.csv".format(START_YEAR, END_YEAR))

In [46]:
print(x)

                Team  Last 10 Rating  Strength of Schedule  Shooting %  \
0            Florida        1.000000              0.931298    1.000000   
1     North Carolina        0.941767              1.000000    0.875000   
2            Arizona        0.887550              0.912214    0.778409   
3            Ohio St        0.881526              0.908397    0.727273   
4               Duke        0.875502              0.946565    0.687500   
...              ...             ...                   ...         ...   
6301       Coppin St        0.224000              0.270968    0.068702   
6302     Maryland ES        0.220800              0.167742    0.167939   
6303    Prairie View        0.193600              0.209677    0.450382   
6304   AR-Pine Bluff        0.192000              0.109677    0.511450   
6305  Miss Valley St        0.000000              0.000000    0.000000   

      Effective Field Goal %  Three Point %  Two Point %  Free Throw %  \
0                   1.000000       0.

In [47]:
clean_hist = pd.read_csv('data/Bracket_Historical_Data.csv')
x = pd.read_csv('data/2012-2025_MBB_Historical-Data.csv')

unknown_teams=[]
for year in x['Year'].unique():
    temp_clean_hist=pd.DataFrame()
    temp_features_df=pd.DataFrame()
    team_list =[]
    temp_clean_hist = clean_hist[clean_hist['Year']==year]
    temp_features_df = x[x['Year']==year]
    team_list = temp_clean_hist[['Team 1', 'Team 2']].to_numpy().flatten()
    team_list = np.unique(team_list)
    for item in team_list:
        if item not in temp_features_df['Team'].values:
            unknown_teams.append(item)
unknown_teams=np.unique(unknown_teams)
print(unknown_teams)



FileNotFoundError: [Errno 2] No such file or directory: 'data/Bracket_Historical_Data.csv'