In [110]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as stats

In [111]:
#Create Table of Team Names and Team IDs. The TeamID column will be used to join the other data tables into a single dataframe.
teams = pd.read_csv("mens-march-mania-2022/MTeams.csv")
teams.drop(['FirstD1Season', 'LastD1Season'], axis=1, inplace=True)
teams

Unnamed: 0,TeamID,TeamName
0,1101,Abilene Chr
1,1102,Air Force
2,1103,Akron
3,1104,Alabama
4,1105,Alabama A&M
...,...,...
367,1468,Bellarmine
368,1469,Dixie St
369,1470,Tarleton St
370,1471,UC San Diego


In [112]:
#Import data table that shows the ranking of each team according to various ranking systems at various points in the season.
#Only RankingDayNum = 133, which corresponds to the day before the tournament begins. 
#The imported data includes a column called SystemName, which includes all the ranking systems. This column should be pivoted
#to create a different column for each ranking system
rankings = pd.read_csv("mens-march-mania-2022/MMasseyOrdinals.csv")
rankings = rankings[rankings['RankingDayNum'] == 133]
rankings

Unnamed: 0,Season,RankingDayNum,SystemName,TeamID,OrdinalRank
121021,2003,133,AP,1112,2
121022,2003,133,AP,1163,23
121023,2003,133,AP,1166,15
121024,2003,133,AP,1173,16
121025,2003,133,AP,1181,7
...,...,...,...,...,...
4339985,2021,133,WOB,1467,233
4339986,2021,133,WOB,1468,176
4339987,2021,133,WOB,1469,290
4339988,2021,133,WOB,1470,256


In [113]:
#merge the team table with the rankings table to get the team name with the rankings
df = teams.merge(rankings, how='inner', on='TeamID')
df

Unnamed: 0,TeamID,TeamName,Season,RankingDayNum,SystemName,OrdinalRank
0,1101,Abilene Chr,2014,133,7OT,343
1,1101,Abilene Chr,2014,133,ADE,297
2,1101,Abilene Chr,2014,133,BBT,347
3,1101,Abilene Chr,2014,133,BIH,346
4,1101,Abilene Chr,2014,133,BLS,346
...,...,...,...,...,...,...
309504,1471,UC San Diego,2021,133,TRP,262
309505,1471,UC San Diego,2021,133,WIL,276
309506,1471,UC San Diego,2021,133,WLK,220
309507,1471,UC San Diego,2021,133,WMV,255


In [114]:
#Import data table that shows the seed number of each team of each tournament.
#Remove all leading or trailing letters (which denote conference) from the Seed column and convert to integer
seeds = pd.read_csv("mens-march-mania-2022/MNCAATourneySeeds.csv")
seeds.dropna()
seeds['Seed'] = seeds['Seed'].str[1:]
seeds['Seed'] = seeds['Seed'].str.lstrip('0')
seeds['Seed'] = seeds['Seed'].str.rstrip('abcd')
seeds['Seed'] = seeds['Seed'].astype(int)

In [115]:
#merge df with seeds table
df = seeds.merge(df, how='inner', on=['TeamID', 'Season'])

In [116]:
#Get Regular Season Win-Loss Record
record = pd.read_csv("mens-march-mania-2022/MRegularSeasonCompactResults.csv")
record = record.drop(['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], axis=1)
wins= record.groupby(['Season', 'WTeamID']).count()
wins = wins.rename(columns={'WTeamID':'TeamID', 'LTeamID': 'Wins'})
losses = record.groupby(['Season', 'LTeamID']).count()
losses = losses.rename(columns={'LTeamID':'TeamID', 'WTeamID': 'Losses'})
wins=wins['Wins']
losses=losses['Losses']

In [117]:
#Merge number of wins and number of losses to df
df = pd.merge(df, wins, left_on=['TeamID', 'Season'], right_on=['WTeamID', 'Season'])
df = pd.merge(df, losses, left_on=['TeamID', 'Season'], right_on=['LTeamID', 'Season'])
df

Unnamed: 0,Season,Seed,TeamID,TeamName,RankingDayNum,SystemName,OrdinalRank,Wins,Losses
0,2003,1,1328,Oklahoma,133,AP,3,24,6
1,2003,1,1328,Oklahoma,133,ARG,5,24,6
2,2003,1,1328,Oklahoma,133,BIH,5,24,6
3,2003,1,1328,Oklahoma,133,BOB,3,24,6
4,2003,1,1328,Oklahoma,133,BRZ,7,24,6
...,...,...,...,...,...,...,...,...,...
61434,2021,16,1216,Hartford,133,TRP,194,15,8
61435,2021,16,1216,Hartford,133,WIL,190,15,8
61436,2021,16,1216,Hartford,133,WLK,181,15,8
61437,2021,16,1216,Hartford,133,WMV,167,15,8


In [118]:
#Create Win% Column
df["Win%"]=round(df["Wins"]/(df["Wins"]+df["Losses"])*100,4)

In [119]:
df

Unnamed: 0,Season,Seed,TeamID,TeamName,RankingDayNum,SystemName,OrdinalRank,Wins,Losses,Win%
0,2003,1,1328,Oklahoma,133,AP,3,24,6,80.0000
1,2003,1,1328,Oklahoma,133,ARG,5,24,6,80.0000
2,2003,1,1328,Oklahoma,133,BIH,5,24,6,80.0000
3,2003,1,1328,Oklahoma,133,BOB,3,24,6,80.0000
4,2003,1,1328,Oklahoma,133,BRZ,7,24,6,80.0000
...,...,...,...,...,...,...,...,...,...,...
61434,2021,16,1216,Hartford,133,TRP,194,15,8,65.2174
61435,2021,16,1216,Hartford,133,WIL,190,15,8,65.2174
61436,2021,16,1216,Hartford,133,WLK,181,15,8,65.2174
61437,2021,16,1216,Hartford,133,WMV,167,15,8,65.2174


In [120]:
#Get Tournament Win-Loss Record. The number of wins is the target feature
tourney= pd.read_csv("mens-march-mania-2022/MNCAATourneyCompactResults.csv")
tourney = tourney.drop(['DayNum', 'WScore', 'LScore', 'WLoc', 'NumOT'], axis=1)
t_wins= tourney.groupby(['Season', 'WTeamID']).count()
t_wins = t_wins.rename(columns={'WTeamID':'TeamID', 'LTeamID': 'T_Wins'})
#t_losses = tourney.groupby(['Season', 'LTeamID']).count()
#t_losses = t_losses.rename(columns={'LTeamID':'TeamID', 'WTeamID': 'T_Losses'})
t_wins=t_wins['T_Wins']
#t_losses=t_losses['T_Losses']

In [121]:
#Merge number of tournament wins to df. This will be the target feature
df = pd.merge(df, t_wins, how='left', left_on=['TeamID', 'Season'], right_on=['WTeamID', 'Season'])
#Replace NaN with 0's for teams without a win
df["T_Wins"] = df["T_Wins"].replace(np.nan, 0)
#df = pd.merge(df, t_losses, left_on=['TeamID', 'Season'], right_on=['LTeamID', 'Season'])

In [122]:
######## delete this
#df = df.iloc[0:1000, :]

In [123]:
df

Unnamed: 0,Season,Seed,TeamID,TeamName,RankingDayNum,SystemName,OrdinalRank,Wins,Losses,Win%,T_Wins
0,2003,1,1328,Oklahoma,133,AP,3,24,6,80.0000,3.0
1,2003,1,1328,Oklahoma,133,ARG,5,24,6,80.0000,3.0
2,2003,1,1328,Oklahoma,133,BIH,5,24,6,80.0000,3.0
3,2003,1,1328,Oklahoma,133,BOB,3,24,6,80.0000,3.0
4,2003,1,1328,Oklahoma,133,BRZ,7,24,6,80.0000,3.0
...,...,...,...,...,...,...,...,...,...,...,...
61434,2021,16,1216,Hartford,133,TRP,194,15,8,65.2174,0.0
61435,2021,16,1216,Hartford,133,WIL,190,15,8,65.2174,0.0
61436,2021,16,1216,Hartford,133,WLK,181,15,8,65.2174,0.0
61437,2021,16,1216,Hartford,133,WMV,167,15,8,65.2174,0.0


In [179]:
season_stats= pd.read_csv("mens-march-mania-2022/MRegularSeasonDetailedResults.csv")
season_stats = season_stats.drop(['DayNum', 'WLoc', 'NumOT'], axis=1)
#season_stats
FGM_W = season_stats.groupby(['Season', 'WTeamID'])['WFGM'].sum()
FGM_W

Season  WTeamID
2003    1102       271
        1103       390
        1104       439
        1105       179
        1106       322
                  ... 
2022    1468       254
        1469       197
        1470       200
        1471       199
        1472       149
Name: WFGM, Length: 6883, dtype: int64

In [166]:
#season_stats_w.iloc[:,0:20] # = season_stats_w.drop(index = [3,18])
season_stats_w = season_stats.drop(season_stats.iloc[[0,3,18]], axis=1)

In [167]:
season_stats_w

Unnamed: 0,WTeamID,WScore,LScore,WFGM,WFGA,WFGM3,WFGA3,WFTM,WFTA,WOR,...,LFGA3,LFTM,LFTA,LOR,LDR,LAst,LTO,LStl,LBlk,LPF
0,1104,68,62,27,58,3,14,11,18,14,...,10,16,22,10,22,8,18,9,2,20
1,1272,70,63,26,62,8,20,10,19,15,...,24,9,20,20,25,7,12,8,6,16
2,1266,73,61,24,58,8,18,17,29,17,...,26,14,23,31,22,9,12,2,5,23
3,1296,56,50,18,38,3,9,17,31,6,...,22,8,15,17,20,9,19,4,3,23
4,1400,77,71,30,61,6,14,11,13,17,...,16,17,27,21,15,12,10,7,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
100418,1400,79,76,28,67,3,20,20,23,14,...,13,15,23,5,24,10,15,3,5,21
100419,1411,66,63,24,59,2,20,16,28,12,...,21,15,24,5,23,10,19,13,2,23
100420,1422,68,49,23,56,13,32,9,13,11,...,24,8,11,10,18,5,16,8,2,12
100421,1438,69,68,31,65,2,12,5,9,10,...,17,18,22,11,25,14,14,3,9,11


In [124]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [125]:
X = df.drop(['Season', 'TeamID', 'TeamName', 'RankingDayNum', 'SystemName', 'T_Wins'], axis=1)
y = df['T_Wins']

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.4)

In [127]:
model = LinearRegression().fit(X_train, y_train)
model.score(X_train, y_train)
model.score(X_test, y_test)

0.3222853173328126

In [128]:
#from sklearn import svm

In [129]:
#clf = svm.SVC(kernel = 'linear')

In [130]:
#from sklearn.preprocessing import MinMaxScaler
3scaling = MinMaxScaler(feature_range=(-1,1)).fit(X_train)
#X_train = scaling.transform(X_train)
#X_test = scaling.transform(X_test)

SyntaxError: invalid syntax (<ipython-input-130-ea6b89bd028d>, line 2)

In [None]:
#clf.fit(X_train, y_train)

In [None]:
#y_pred = clf.predict(X_test)

In [None]:
#from sklearn import metrics

In [None]:
#metrics.accuracy_score(y_test, y_pred)

In [None]:
#y_pred