# Project 7

- [Report](https://docs.google.com/document/d/1d6JuigRGQrC9244Y_fzWR2EBKznfnKVnQH_Bf-OEOwI/edit?usp=sharing)
- [Slides](https://docs.google.com/presentation/d/1qbXJJV9wEzjcOUMc-ESLRDamedZsxWrcTiVqrR-AlJ8/edit?usp=sharing)
- [Dataset](https://www.kaggle.com/datasets/jonathanpilafas/2024-march-madness-statistical-analysis)

## Setup

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
import re
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
from sklearn import svm

## Exploring/Cleaning Data

In [None]:
# Load the raw dataset, then discard every column whose name contains 'Rank'.
df = pd.read_csv('DEV _ March Madness.csv')
exclude_columns = list(filter(lambda column_name: 'Rank' in column_name, df.columns))
df = df.drop(exclude_columns, axis=1)

In [None]:
# Keep only the team name, the feature columns and the tournament label.
# .copy() makes this an independent frame, so the column assignment further down
# in this cell does not write into a slice of the loaded data (SettingWithCopyWarning).
df = df[['Team Name', 'Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate', 'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)',
       'Avg Possession Length (Defense)', 'Active Coaching Length', 'Post-Season Tournament']].copy()

def extract_years(years_string):
    """Return the first run of digits found in a value as an int.

    Parameters
    ----------
    years_string : object
        Value to scan. It is converted with ``str()`` first, so values that
        are already numeric (for example when this cell is re-run after the
        column was converted) or missing (NaN) do not raise ``TypeError``.

    Returns
    -------
    int or None
        The first group of consecutive digits as an integer, or ``None``
        when the value contains no digits.
    """
    match = re.search(r'\d+', str(years_string))
    if match is None:
        return None
    return int(match.group())

# Replace the coaching-length column with the integer parsed from each value.
coaching_column = 'Active Coaching Length'
df[coaching_column] = df[coaching_column].apply(extract_years)
df

In [None]:
# Draw the bars first and label afterwards: seaborn may set its own axis labels
# while plotting, so labels applied before the call can be overwritten.
# This matches the order used by the binary version of this chart later on.
ax = sns.barplot(data=df, x="Active Coaching Length", hue='Post-Season Tournament')
ax.set_title('Average Coaching Time of Current Coach')
ax.set_xlabel('Years')

In [None]:
# Row(s) for Utah State; later cells reuse `usu` to highlight the team on each chart.
usu = df.loc[df['Team Name'] == 'Utah State']

# Regression of defensive on offensive eFG% for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.eFG %', y='Def.eFG %', data=df)
ax.set_title('Offensive vs Defensive Field Goal Efficiency Percentage')
ax.set_xlabel('Offensive field goal efficiency percentage')
ax.set_ylabel('Defensive field goal efficiency percentage')
ax.scatter(usu['Off.eFG %'], usu['Def.eFG %'], color='navy', s=150, marker='o')

In [None]:
# Same eFG% comparison, coloured by post-season tournament, with Utah State overlaid in navy.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='Off.eFG %', y='Def.eFG %', hue='Post-Season Tournament', data=df, ax=ax)
ax.set_title('Offensive vs Defensive Field Goal Efficiency Percentage')
ax.set_xlabel('Offensive field goal efficiency percentage')
ax.set_ylabel('Defensive field goal efficiency percentage')
ax.scatter(usu['Off.eFG %'], usu['Def.eFG %'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive turnover percentage for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.TO %', y='Def.TO %', data=df)
ax.set_title('Offensive vs Defensive Turnover Percentage')
ax.set_xlabel('Offensive turnover percentage')
ax.set_ylabel('Defensive turnover percentage')
ax.scatter(usu['Off.TO %'], usu['Def.TO %'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive rebound percentage for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.OR %', y='Def.OR %', data=df)
ax.set_title('Offensive vs Defensive Rebound Percentage')
ax.set_xlabel('Offensive rebound percentage')
ax.set_ylabel('Defensive rebound percentage')
ax.scatter(usu['Off.OR %'], usu['Def.OR %'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive free throw rate for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.FT Rate', y='Def.FT Rate', data=df)
ax.set_title('Offensive vs Defensive Free Throw Rate')
ax.set_xlabel('Offensive free throw rate')
ax.set_ylabel('Defensive free throw rate')
ax.scatter(usu['Off.FT Rate'], usu['Def.FT Rate'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive free throws for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.FT', y='Def.FT', data=df)
ax.set_title('Offensive vs Defensive Free Throws')
ax.set_xlabel('Offensive free throws')
ax.set_ylabel('Defensive free throws')
ax.scatter(usu['Off.FT'], usu['Def.FT'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive 2 pt field goals for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.2PT FG', y='Def.2PT FG', data=df)
ax.set_title('Offensive vs Defensive 2 Pt Field Goals')
ax.set_xlabel('Offensive 2 pt field goals')
ax.set_ylabel('Defensive 2 pt field goals')
ax.scatter(usu['Off.2PT FG'], usu['Def.2PT FG'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive 3 pt field goals for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Off.3PT FG', y='Def.3PT FG', data=df)
ax.set_title('Offensive vs Defensive 3 Pt Field Goals')
ax.set_xlabel('Offensive 3 pt field goals')
ax.set_ylabel('Defensive 3 pt field goals')
ax.scatter(usu['Off.3PT FG'], usu['Def.3PT FG'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive average possession length for all teams, Utah State overlaid in navy.
ax = sns.regplot(x='Avg Possession Length (Offense)', y='Avg Possession Length (Defense)', data=df)
ax.set_title('Offensive vs Defensive Average Possession Length')
ax.set_xlabel('Offensive average possession length')
ax.set_ylabel('Defensive average possession length')
ax.scatter(usu['Avg Possession Length (Offense)'], usu['Avg Possession Length (Defense)'], color='navy', s=150, marker='o')

In [None]:
# Tournament labels noted for this column:
# 'March Madness', 'Not In a Post-Season Tournament', 'NIT', 'CBI', 'CIT'
# Count the teams per tournament label, sorted from fewest to most teams.
byTourneyDF = (
    df.groupby('Post-Season Tournament')['Team Name']
    .count()
    .reset_index()
    .sort_values('Team Name')
)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Post-Season Tournament', y='Team Name', data=byTourneyDF, ax=ax)
ax.set_title('Tournament Team Counts')
ax.set_xlabel('Tournament')
ax.set_ylabel('Teams Qualified')

## Logistic Regression

### All Classes

#### All Attributes

In [None]:
# Full feature set and the multi-class tournament label.
X = df[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the metric arrays displayed below follow this order.
class_counts = df['Post-Season Tournament'].value_counts()
display(class_counts.index)

# compute_class_weight returns one weight per entry of `classes`, in the same order,
# so the labels and weights can be zipped directly into the dict the estimator expects.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

lm = LogisticRegression(class_weight=weights)
lm.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on,
# so the scores below are training-set scores, not held-out scores.
y_pred = lm.predict(X)

p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=class_counts.index)

display('support = {}'.format(s))
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

In [None]:
# Confusion matrix for the `y` / `y_pred` currently in the kernel.
# The sorted label list is the same order confusion_matrix uses by default; passing it
# explicitly lets the heatmap show class names instead of bare integer indices.
class_labels = sorted(y.unique())
mat = confusion_matrix(y, y_pred, labels=class_labels)
# Transposed so rows are predicted labels and columns are true labels, matching the axis titles.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('true label')
plt.ylabel('predicted label')

In [None]:
# Full feature set and the multi-class tournament label.
X = df[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Offensive Attributes

In [None]:
# Offensive features (plus coaching length) and the multi-class tournament label.
X = df[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
        'Off.FT',
       'Off.2PT FG', 'Off.3PT FG',  'Avg Possession Length (Offense)',  'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Defensive Attributes

In [None]:
# Defensive features (plus coaching length) and the multi-class tournament label.
X = df[['Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate',
        'Def.FT', 'Def.2PT FG', 'Def.3PT FG',  'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Coach Attribute

In [None]:
# Coaching length as the only feature, with the multi-class tournament label.
X = df[['Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

### Grouped Into Two Classes

In [None]:
def transform(value):
    """Binarize a tournament label: 1 for 'March Madness', 0 for anything else."""
    return 1 if value == 'March Madness' else 0

# Independent copy of the data whose label column is collapsed to 1 / 0 by transform().
binaryDF = df.copy()
binaryDF['Post-Season Tournament'] = binaryDF['Post-Season Tournament'].apply(transform)

In [None]:
# Coaching-length bars split by the binary label.
ax = sns.barplot(x="Active Coaching Length", hue='Post-Season Tournament', data=binaryDF)
ax.set_title('Average Coaching Time of Current Coach')
ax.set_xlabel('Years')

In [None]:
# Offensive vs defensive eFG% coloured by the binary label, Utah State overlaid in navy.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Off.eFG %', y='Def.eFG %', hue='Post-Season Tournament', data=binaryDF, ax=ax)
ax.set_title('Offensive vs Defensive Field Goal Efficiency Percentage')
ax.set_xlabel('Offensive field goal efficiency percentage')
ax.set_ylabel('Defensive field goal efficiency percentage')
ax.scatter(usu['Off.eFG %'], usu['Def.eFG %'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive 2 pt field goals coloured by the binary label, Utah State overlaid in navy.
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(x='Off.2PT FG', y='Def.2PT FG', hue='Post-Season Tournament', data=binaryDF, ax=ax)
ax.set_title('Offensive vs Defensive 2 Pt Field Goals')
ax.set_xlabel('Offensive 2 pt field goals')
ax.set_ylabel('Defensive 2 pt field goals')
ax.scatter(usu['Off.2PT FG'], usu['Def.2PT FG'], color='navy', s=150, marker='o')

In [None]:
# Offensive vs defensive 3 pt field goals coloured by the binary label, Utah State overlaid in navy.
plt.figure(figsize=[12, 6])
sns.scatterplot(data=binaryDF, x='Off.3PT FG', y='Def.3PT FG', hue='Post-Season Tournament')
plt.title('Offensive vs Defensive 3 Pt Field Goals')
# Axis labels say "field goals", consistent with the title and the other 3 pt chart.
plt.xlabel('Offensive 3 pt field goals')
plt.ylabel('Defensive 3 pt field goals')
plt.scatter(usu['Off.3PT FG'], usu['Def.3PT FG'], color='navy', s=150, marker='o')

#### All Attributes

In [None]:
# Full feature set and the binary label (1 = March Madness, 0 = everything else).
X = binaryDF[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the metric arrays displayed below follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order,
# so the labels and weights can be zipped directly into the dict the estimator expects.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

lm = LogisticRegression(class_weight=weights)
lm.fit(X, y)

# NOTE: predictions are made on the same rows the model was fit on,
# so the scores below are training-set scores, not held-out scores.
y_pred = lm.predict(X)

p, r, f, s = precision_recall_fscore_support(y, y_pred, labels=class_counts.index)
# Support is labelled like the other metrics, matching the multi-class version of this cell.
display('support = {}'.format(s))
display('precision = {}'.format(p))
display('recall = {}'.format(r))
display('f-score = {}'.format(f))

In [None]:
# Confusion matrix for the `y` / `y_pred` currently in the kernel.
mat = confusion_matrix(y, y_pred)
# Transposed so rows are predicted labels and columns are true labels, matching the axis titles.
ax = sns.heatmap(mat.T, annot=True, fmt='d', square=True, cbar=False)
ax.set_title('Confusion Matrix')
ax.set_xlabel('true label')
ax.set_ylabel('predicted label')

In [None]:
# Full feature set and the binary label (1 = March Madness, 0 = everything else).
X = binaryDF[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Offensive Attributes

In [None]:
# Offensive features (plus coaching length) and the binary label.
X = binaryDF[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
        'Off.FT',
       'Off.2PT FG', 'Off.3PT FG',  'Avg Possession Length (Offense)',  'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Defensive Attributes

In [None]:
# Defensive features (plus coaching length) and the binary label.
X = binaryDF[[
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate',
     'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Coach Attribute

In [None]:
# Coaching length as the only feature, with the binary label.
X = binaryDF[['Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    lm = LogisticRegression(class_weight=weights)
    lm.fit(X_train, y_train)

    y_pred = lm.predict(X_test)

    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

## SVM

### All Classes

#### All Attributes

In [None]:
# Full feature set and the multi-class tournament label.
X = df[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-5 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=5, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Offensive Attributes

In [None]:
# Offensive features (plus coaching length) and the multi-class tournament label.
X = df[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
        'Off.FT',
       'Off.2PT FG', 'Off.3PT FG',  'Avg Possession Length (Offense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-5 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=5, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Defensive Attributes

In [None]:
# Defensive features (plus coaching length) and the multi-class tournament label.
X = df[[
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate',
       'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-5 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=5, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Coach Attribute

In [None]:
# Coaching length as the only feature, with the multi-class tournament label.
X = df[['Active Coaching Length']]
y = df['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = df['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=df['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Linear kernel; poly (degree=5) and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='linear', class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

### Classes Grouped Into Two Classes

#### All Attributes

In [None]:
# Full feature set and the binary label (1 = March Madness, 0 = everything else).
X = binaryDF[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate', 'Off.FT',
       'Off.2PT FG', 'Off.3PT FG', 'Def.FT', 'Def.2PT FG', 'Def.3PT FG', 'Avg Possession Length (Offense)', 'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-2 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=2, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Offensive Attributes

In [None]:
# Offensive features (plus coaching length) and the binary label.
X = binaryDF[['Off.eFG %', 'Off.TO %', 'Off.OR %', 'Off.FT Rate',
        'Off.FT',
       'Off.2PT FG', 'Off.3PT FG',  'Avg Possession Length (Offense)',  'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-2 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=2, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Defensive Attributes

In [None]:
# Defensive features (plus coaching length) and the binary label.
X = binaryDF[[
       'Def.eFG %', 'Def.TO %', 'Def.OR %', 'Def.FT Rate',
        'Def.FT', 'Def.2PT FG', 'Def.3PT FG',  'Avg Possession Length (Defense)', 'Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-2 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=2, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))

#### Coach Attribute

In [None]:
# Coaching length as the only feature, with the binary label.
X = binaryDF[['Active Coaching Length']]
y = binaryDF['Post-Season Tournament']

# Label order (most frequent first); the averaged metric arrays follow this order.
class_counts = binaryDF['Post-Season Tournament'].value_counts()

# compute_class_weight returns one weight per entry of `classes`, in the same order.
class_weights = compute_class_weight(class_weight='balanced', classes=class_counts.index.to_numpy(), y=binaryDF['Post-Season Tournament'])
weights = dict(zip(class_counts.index, class_weights))

# Evaluate on 30 different 75/25 splits. Each split gets its own seed so the
# splits differ from each other but are identical on every re-run.
scores = {'p': [], 'r': [], 'f': []}
for seed in range(30):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=seed)

    # Degree-2 polynomial kernel; linear and rbf (gamma=90) kernels were also tried here.
    clf = svm.SVC(kernel='poly', degree=2, class_weight=weights)
    clf.fit(X_train, y_train)

    y_pred = clf.predict(X_test)
    p, r, f, s = precision_recall_fscore_support(y_test, y_pred, labels=class_counts.index)
    scores['p'].append(p)
    scores['r'].append(r)
    scores['f'].append(f)

# Element-wise mean of the per-class arrays over all splits. Using sum() avoids a loop
# variable named `re`, which would overwrite the `re` module imported in Setup.
avgP = sum(scores['p']) / len(scores['p'])
avgR = sum(scores['r']) / len(scores['r'])
avgF = sum(scores['f']) / len(scores['f'])

display('precision = {}'.format(avgP))
display('recall = {}'.format(avgR))
display('f-score = {}'.format(avgF))