In [51]:
import re
from io import StringIO
from urllib.request import Request, urlopen

import pandas as pd
from bs4 import BeautifulSoup as soup
from fuzzywuzzy import process, fuzz
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [52]:
def get_stats(years):
    """Scrape the RealGM advanced team-totals table for each requested season.

    Parameters
    ----------
    years : iterable
        Season identifiers. Each one is substituted into the RealGM URL and
        used as a key of the returned dict.

    Returns
    -------
    dict
        Maps every element of ``years`` to the first table parsed from that
        season's page (a ``pandas.DataFrame``). Empty if ``years`` is empty.

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If ``pd.read_html`` finds no table in a page.
    """
    all_data = {}
    for year in years:
        url = "https://basketball.realgm.com/ncaa/team-stats/{}/Advanced_Stats/Team_Totals/0".format(year)
        # A browser-like User-Agent is sent with every request
        # (presumably the site rejects urllib's default one; verify).
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager closes the HTTP response even if reading fails.
        with urlopen(req) as response:
            webpage = response.read()
        html = soup(webpage, "html.parser")
        # read_html expects a file-like object; passing literal HTML text
        # is deprecated in recent pandas releases.
        tables = pd.read_html(StringIO(str(html)))
        all_data[year] = tables[0]
    return all_data
    

In [None]:
# Seasons to scrape, newest first; this list is passed to get_stats and get_sos.
stat_years = [str(season) for season in (2025, 2024, 2023, 2022)]

In [53]:
# Scrape the RealGM advanced-stats table for every season in stat_years
# (one HTTP request per season); dfs maps each season to its DataFrame.
dfs = get_stats(stat_years)
# Prints the whole dict, i.e. the text repr of every season's table.
print(dfs)

{'2024':        #                       Team    TS%   eFG%  Total S%  ORB%  DRB%  TRB%  \
0      1              McNeese State  0.584  0.558     157.2  32.4  71.2  52.4   
1      2                      UConn  0.600  0.571     160.5  35.7  76.7  57.7   
2      3                    Houston  0.526  0.497     147.5  37.0  72.7  54.1   
3      4               Saint Mary's  0.555  0.534     150.2  38.3  81.1  60.0   
4      5                    Gonzaga  0.596  0.572     159.1  32.8  74.3  55.4   
5      6                     Auburn  0.580  0.541     157.9  31.9  70.9  52.4   
6      7                    Arizona  0.580  0.550     157.9  35.7  78.6  57.7   
7      8                 Iowa State  0.550  0.519     150.9  30.2  71.8  50.9   
8      9              James Madison  0.576  0.549     155.2  30.6  75.7  53.8   
9     10                     Purdue  0.593  0.560     161.8  37.1  76.5  58.4   
10    11             Morehead State  0.574  0.551     153.5  34.4  77.2  57.4   
11    12           

In [54]:
def get_sos(years):
    """Scrape the TeamRankings schedule-strength table for each requested year.

    The page is requested with ``date=<year>-03-14``, so every year's table
    is the snapshot for March 14 of that year.

    Parameters
    ----------
    years : iterable
        Year identifiers. Each one is substituted into the URL and written
        into the ``'Year'`` column of that year's rows.

    Returns
    -------
    pandas.DataFrame
        The first table of every page stacked vertically, with an added
        ``'Year'`` column. Each page's original row index is kept, so index
        labels repeat across years. An empty DataFrame if ``years`` is empty.

    Raises
    ------
    urllib.error.URLError
        If a page cannot be fetched.
    ValueError
        If ``pd.read_html`` finds no table in a page.
    """
    yearly_tables = []
    for year in years:
        url = "https://www.teamrankings.com/ncaa-basketball/ranking/schedule-strength-by-other?date={}-03-14".format(year)
        req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        # Context manager closes the HTTP response even if reading fails.
        with urlopen(req) as response:
            webpage = response.read()
        html = soup(webpage, "html.parser")
        # read_html expects a file-like object; passing literal HTML text
        # is deprecated in recent pandas releases.
        year_table = pd.read_html(StringIO(str(html)))[0]
        year_table['Year'] = year
        yearly_tables.append(year_table)
    # pd.concat raises on an empty list, so keep the empty-input result explicit.
    if not yearly_tables:
        return pd.DataFrame()
    # Concatenate once instead of re-copying the accumulated frame every iteration.
    return pd.concat(yearly_tables)



In [55]:
# Scrape the TeamRankings schedule-strength table for every season in stat_years
# and stack the per-year tables into one frame tagged with a 'Year' column.
sos_df = get_sos(stat_years)
# Global pandas display setting: it applies to every DataFrame printed after this point.
# NOTE(review): the output recorded below reports 1083 rows and is still elided
# with "..", so a limit of 1000 does not show this frame in full.
pd.set_option('display.max_rows', 1000)
print(sos_df)


     Rank                   Team  Rating   Hi  Low  Last  Year
0       1          Purdue (28-3)    12.6    1  189     1  2024
1       2         Houston (28-3)    12.1    2  225     2  2024
2       3        Alabama (21-10)    12.1    1  215     3  2024
3       4       Tennessee (24-7)    11.8    1  230     4  2024
4       5         Kansas (22-10)    11.6    2  213     5  2024
..    ...                    ...     ...  ...  ...   ...   ...
353   354    S Car State (15-16)    -9.2  148  356   354  2022
354   355      Incar Word (7-25)    -9.3  142  355   355  2022
355   356     Ark Pine Bl (7-24)    -9.5   31  356   356  2022
356   357  Hsn Christian (11-18)   -10.3   49  357   357  2022
357   358     Delaware St (2-26)   -10.7  350  358   358  2022

[1083 rows x 7 columns]


In [56]:
def remove_scores(row):
    """Strip parenthesised ``(<digits>-<digits>)`` records from a team label.

    Parameters
    ----------
    row : str
        A single label such as ``"Purdue (28-3)"``. Despite the name this is
        one string value, not a DataFrame row.

    Returns
    -------
    str
        The label with every ``(N-M)`` group removed, together with the
        whitespace in front of it, and with surrounding whitespace stripped,
        e.g. ``"Purdue"``.
    """
    # Raw string avoids invalid-escape warnings; the leading \s* also drops
    # the space that separates the name from the record.
    without_record = re.sub(r"\s*\(\d+-\d+\)", "", row)
    return without_record.strip()


In [57]:
def clean_sos_df(df):
    """Return a trimmed version of a schedule-strength frame.

    The 'Hi', 'Low' and 'Last' columns are dropped and every value of the
    'Team' column is passed through ``remove_scores``. The input frame is
    left untouched; a new DataFrame is returned.
    """
    unwanted_columns = ['Hi', 'Low', 'Last']
    trimmed = df.drop(columns=unwanted_columns)
    return trimmed.assign(Team=trimmed['Team'].apply(remove_scores))
    
    

In [58]:
def scale(df):
    """Return a copy of ``df`` whose 'Rating' column is min-max scaled to [0.5, 1].

    The scaler is fitted on all rows of ``df`` at once, so the minimum and
    maximum are taken over the whole frame that is passed in.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric 'Rating' column.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rescaled 'Rating'; ``df`` itself is not modified.
    """
    # Work on a copy so the caller's frame keeps its unscaled ratings.
    scaled = df.copy()
    min_max_scaler = preprocessing.MinMaxScaler(feature_range=(0.5, 1))
    scaled[['Rating']] = min_max_scaler.fit_transform(scaled[['Rating']].values)
    return scaled
    

In [62]:
# Drop the 'Hi'/'Low'/'Last' columns and run every Team value through remove_scores.
sos_df_cleaned = clean_sos_df(sos_df)
# Min-max rescale the 'Rating' column into the range [0.5, 1].
sos_df_norm = scale(sos_df_cleaned)
print(sos_df_norm)

     Rank            Team    Rating  Year
0       1         Purdue   1.000000  2024
1       2        Houston   0.989316  2024
2       3        Alabama   0.989316  2024
3       4      Tennessee   0.982906  2024
4       5         Kansas   0.978632  2024
..    ...             ...       ...   ...
353   354    S Car State   0.534188  2022
354   355     Incar Word   0.532051  2022
355   356    Ark Pine Bl   0.527778  2022
356   357  Hsn Christian   0.510684  2022
357   358    Delaware St   0.502137  2022

[1083 rows x 4 columns]
