# Senior Project 1 Presentation
Ismail Conze
Nick Chowa
Kaylyn Matthews

## Introduction

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score, cross_val_predict

## Preprocessing

### Methods

In [3]:
def pull_team(team, games):
    """Select every game in which `team` appears as the home or the away side.

    Parameters
    ----------
    team : str
        Abbreviation compared against the TEAM_ABBREVIATION_HOME and
        TEAM_ABBREVIATION_AWAY columns.
    games : pandas.DataFrame
        Table with one row per game; must contain both abbreviation columns.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `games`, with their original index labels.

    Also prints the number of matching games.
    """
    played_at_home = games['TEAM_ABBREVIATION_HOME'].eq(team)
    played_away = games['TEAM_ABBREVIATION_AWAY'].eq(team)
    teams_games = games[played_at_home | played_away]
    # Header and count go on separate lines.
    print('Number of games', len(teams_games), sep='\n')
    return teams_games


In [4]:
def find_outliers(x, k=1.5):
    """Find the entries of a numeric Series that lie outside its IQR fences.

    The fences are ``Q1 - k * IQR`` and ``Q3 + k * IQR``, where Q1 and Q3 are
    the 25th and 75th percentiles of the non-missing values.

    Parameters
    ----------
    x : pandas.Series
        Numeric values to screen.
    k : float, optional
        Fence multiplier; the default 1.5 reproduces the original behaviour.

    Returns
    -------
    tuple of (list, list)
        Index labels of the outliers and the corresponding values, in the
        order they appear in `x`. Both lists are empty when `x` holds no
        non-missing values.
    """
    # Build the fences from observed values only: a single NaN would otherwise
    # turn both percentiles into NaN and silently disable the detection.
    observed = x.dropna()
    if observed.empty:
        return [], []

    q1 = np.percentile(observed, 25)
    q3 = np.percentile(observed, 75)
    iqr = q3 - q1
    floor = q1 - k * iqr
    ceiling = q3 + k * iqr

    # NaN compares False against both fences, so missing entries are never flagged.
    is_outlier = (x < floor) | (x > ceiling)
    outlier_indices = list(x.index[is_outlier])
    # Take the values from the same mask so both lists always line up one-to-one.
    outlier_values = list(x[is_outlier])
    return outlier_indices, outlier_values

In [5]:
def remove_outliers(x, exclude=("GAME_ID", "GAME_DATE")):
    """Drop every row that holds an IQR outlier in any screened column.

    A column is screened when it contains no Python ``str`` values and its
    name is not listed in `exclude`. Screening is delegated to
    ``find_outliers``.

    Parameters
    ----------
    x : pandas.DataFrame
        Table to filter.
    exclude : iterable of str, optional
        Column names that are never screened. Defaults to the identifier
        columns GAME_ID and GAME_DATE.

    Returns
    -------
    pandas.DataFrame
        A new frame without the flagged rows; `x` itself is not modified.

    Notes
    -----
    The previous condition ``not c == "GAME_ID" or c == "GAME_DATE"`` parsed
    as ``(not c == "GAME_ID") or (c == "GAME_DATE")``, so a numeric GAME_DATE
    column was screened instead of skipped. A membership test replaces it.

    NOTE(review): a 0/1 label column such as WL_HOME is still screened; if one
    class is rare its rows can all be flagged. Consider passing it in `exclude`.
    """
    skipped = set(exclude)
    flagged = set()  # a row can be flagged by several columns; keep each label once
    for c in x.columns:
        if c in skipped:
            continue
        # Any str in the column marks it as categorical (same rule clean_team uses).
        if x[c].map(type).eq(str).any():
            continue
        flagged.update(find_outliers(x[c])[0])
    return x.drop(index=list(flagged))

In [10]:
def clean_team(x):
    """Fill missing numeric values with the column median.

    Columns containing at least one Python ``str`` are treated as categorical
    and passed through untouched; every other column is treated as numeric and
    run through a median ``SimpleImputer``.

    Parameters
    ----------
    x : pandas.DataFrame
        Table to clean.

    Returns
    -------
    pandas.DataFrame
        The imputed numeric columns followed by the categorical columns,
        on the same index as `x`.
    """
    # One flag per column: does it hold any string value?
    holds_text = [x[c].map(type).eq(str).any() for c in x.columns]
    categorical_columns = [c for c, is_text in zip(x.columns, holds_text) if is_text]
    numeric_columns = [c for c, is_text in zip(x.columns, holds_text) if not is_text]

    numeric_part = x[numeric_columns]
    categorical_part = pd.DataFrame(x[categorical_columns])

    # The imputer returns a bare array, so labels and index are re-attached by hand.
    imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    filled_values = imputer.fit_transform(numeric_part)
    numeric_part = pd.DataFrame(
        filled_values,
        columns=numeric_part.columns,
        index=numeric_part.index,
    )

    return pd.concat([numeric_part, categorical_part], axis=1)

### Process

Read in the dataset

In [2]:
 # Load the game log, then encode the home result as 0/1.
 df = pd.read_csv('games.csv')
 # Only the exact value 'L' becomes 0; anything else (missing values included) becomes 1.
 df['WL_HOME'] = df['WL_HOME'].ne('L').astype('int64')

Here we create a subset of the dataset containing all of the games for the selected team. We then check for any missing values in the dataframe and replace missing numeric values with that team's median for that category. After doing so, we check again for missing values to ensure that there are none, and finally remove the rows that contain outliers.

In [7]:
# Stage 1: all games involving the team abbreviated "MIN", plus a missing-value report.
team_games = pull_team("MIN", df)
print('Missing Values', team_games.isna().sum())

# Stage 2: impute the numeric columns and report again to confirm nothing is missing.
team_games_filled = clean_team(team_games)
print('Missing Values', team_games_filled.isna().sum())

# Stage 3: drop rows flagged as outliers; the cells below read the result from `x`.
x = remove_outliers(team_games_filled)

Number of games
411
Missing Values GAME_ID                     0
GAME_DATE                   0
TEAM_ABBREVIATION_HOME      0
TEAM_ABBREVIATION_HOME.1    0
FGM_HOME                    0
FGA_HOME                    0
FG_PCT_HOME                 0
FG3M_HOME                   0
FG3A_HOME                   0
FG3_PCT_HOME                0
FTM_HOME                    0
FTA_HOME                    0
FT_PCT_HOME                 0
OREB_HOME                   0
DREB_HOME                   0
REB_HOME                    0
AST_HOME                    0
STL_HOME                    0
BLK_HOME                    0
TOV_HOME                    0
PF_HOME                     0
PTS_HOME                    0
PTS_2ND_CHANCE_HOME         1
PTS_PAINT_HOME              1
TEAM_ABBREVIATION_AWAY      0
FGM_AWAY                    0
FGA_AWAY                    0
FG_PCT_AWAY                 0
FG3M_AWAY                   0
FG3A_AWAY                   0
FG3_PCT_AWAY                0
FTM_AWAY                    0
FTA_A

In [8]:
# Everything except the result column and the identifier/team-name columns
# goes into teamIF; the WL_HOME result column alone goes into teamOF.
dropped_columns = [
    'WL_HOME',
    'GAME_ID',
    'GAME_DATE',
    'TEAM_ABBREVIATION_HOME.1',
    'TEAM_ABBREVIATION_HOME',
    'TEAM_ABBREVIATION_AWAY',
]
teamIF = x.drop(columns=dropped_columns)
teamOF = x['WL_HOME']

In [9]:
# Show the feature table first, then the result column.
for table in (teamIF, teamOF):
    print(table)

      FGM_HOME  FGA_HOME  FG_PCT_HOME  FG3M_HOME  FG3A_HOME  FG3_PCT_HOME  \
14        43.0      82.0        0.524        4.0       15.0         0.267   
19        37.0      76.0        0.487        4.0       10.0         0.400   
33        38.0      85.0        0.447        4.0       12.0         0.333   
59        36.0      82.0        0.439        8.0       21.0         0.381   
85        41.0      78.0        0.526        5.0       13.0         0.385   
...        ...       ...          ...        ...        ...           ...   
6096      43.0      94.0        0.457        7.0       39.0         0.179   
6113      43.0      86.0        0.500        8.0       24.0         0.333   
6130      46.0      85.0        0.541       10.0       23.0         0.435   
6140      38.0      91.0        0.418       13.0       42.0         0.310   
6152      39.0      87.0        0.448       10.0       33.0         0.303   

      FTM_HOME  FTA_HOME  FT_PCT_HOME  OREB_HOME  ...  AST_AWAY  STL_AWAY  