In [1]:
# Import Necessary Libraries

import sys
import pandas as pd
from sklearn import tree
from sklearn import neighbors
from sklearn import svm
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn import metrics
import numpy as np
import plotly.express as px
import ipywidgets as widgets

In [2]:
# Read the batting statistics CSV into a DataFrame.

battingData = pd.read_csv('batting.csv')

# The columns at the positions listed below hold percentage strings. Each one has
# its trailing '%' stripped, is cast to float, and is divided by 100 so that it
# is stored as a fraction.

columns = battingData.columns
percentPositions = [10, 11] + list(range(17, 28))
for position in percentPositions:
    columnName = columns[position]
    asFraction = battingData[columnName].str.rstrip('%').astype('float') / 100.0
    battingData[columnName] = asFraction

# To verify the import/conversion, inspect battingData.describe(), battingData
# and battingData.columns in a scratch cell.

In [37]:
# Create Testing and Training Sets
#
# Columns 3..27 of the batting frame are used as the independent variables and
# column 28 as the dependent variable.

independentSet = battingData[columns[3:28]]
independentColumns = independentSet.columns
dependentSet = battingData[columns[28]]

# Split the data set into testing and training sets. test_size is the fraction
# held back for testing (20% here); random_state pins the shuffle so the split
# is the same on every run.

independentTrain, independentTest, dependentTrain, dependentTest = train_test_split(
    independentSet, dependentSet, test_size=0.20, random_state=0)


# Quickly switch between regression algorithms to see which one gives the
# cleanest fit to the data. `method` must be one of the keys of
# regressorFactories below.

method = 'Decision Tree'

# Each factory builds an unfitted estimator; only the selected one is created.
# The decision tree is seeded because scikit-learn permutes features randomly at
# each split, so an unseeded tree can give different results from run to run.
regressorFactories = {
    'Linear Regression': lambda: LinearRegression(),
    'Decision Tree': lambda: tree.DecisionTreeRegressor(max_depth=10, random_state=0),
    'Support Vector': lambda: svm.SVR(kernel='poly'),
    'Nearest Neighbor': lambda: neighbors.KNeighborsRegressor(n_neighbors=3, weights='distance'),
}

# Fail loudly on a typo instead of leaving `regressor` undefined (or, worse,
# silently reusing a stale model left in the kernel by an earlier run).
if method not in regressorFactories:
    raise ValueError(
        "Unknown regression method %r; expected one of %s"
        % (method, sorted(regressorFactories)))

regressor = regressorFactories[method]()
regressor.fit(independentTrain, dependentTrain)

# The linear model exposes one coefficient per input column; show them by name.
if method == 'Linear Regression':
    coeff_Data = pd.DataFrame(regressor.coef_, independentSet.columns, columns=['Coefficient'])
    print(coeff_Data)


In [52]:
# Predict the dependent variable for the held-back test rows and pair each
# prediction with the actual value for comparison.
prediction = regressor.predict(independentTest)
resultFrame = pd.DataFrame(dict(Actual=dependentTest, Predicted=prediction))

# Inspect resultFrame in a scratch cell to eyeball individual rows.

In [45]:
# Scatter plot (plotly) of actual vs. predicted values for the test set, with an
# OLS trendline. The closer the points sit to the line, the smaller the error.

fig = (
    px.scatter(resultFrame, x="Actual", y="Predicted", trendline="ols")
    .update_layout(title_text='Testing Actual vs. Predicted for Testing Data Set', title_x=0.5)
    .update_xaxes(range=[0.2, 0.45])
    .update_yaxes(range=[0.2, 0.45])
)
fig.show()

# Error metrics on the test set; the RMSE is the square root of the MSE.
testMeanAbsoluteError = metrics.mean_absolute_error(dependentTest, prediction)
testMeanSquaredError = metrics.mean_squared_error(dependentTest, prediction)
print('Mean Absolute Error:', testMeanAbsoluteError)
print('Mean Squared Error:', testMeanSquaredError)
print('Root Mean Squared Error:', np.sqrt(testMeanSquaredError))


Mean Absolute Error: 0.03248949318910256
Mean Squared Error: 0.0015818324693034494
Root Mean Squared Error: 0.03977225753340448


In [49]:
# Predict for every row of the data set and keep the player name (column 1 of
# the batting frame) alongside the actual and predicted values.
# NOTE: independentSet includes the rows the regressor was fitted on, so the
# metrics printed below are not a held-out estimate.
predictionAllPlayers = regressor.predict(independentSet)
resultFrameAllPlayers = pd.DataFrame(dict(
    Name=battingData[columns[1]],
    Actual=dependentSet,
    Predicted=predictionAllPlayers,
))

# Scatter plot (plotly) of actual vs. predicted values with an OLS trendline;
# hovering over a point shows the player's name. The closer the points sit to
# the line, the smaller the error in the prediction.

fig = (
    px.scatter(resultFrameAllPlayers, x="Actual", y="Predicted", hover_name="Name", trendline="ols")
    .update_layout(title_text='Testing Actual vs. Predicted for Entire Data Set', title_x=0.5)
    .update_xaxes(range=[0.2, 0.45])
    .update_yaxes(range=[0.2, 0.5])
)
fig.show()

# Error metrics over the entire data set; the RMSE is the square root of the MSE.
allPlayersMeanAbsoluteError = metrics.mean_absolute_error(dependentSet, predictionAllPlayers)
allPlayersMeanSquaredError = metrics.mean_squared_error(dependentSet, predictionAllPlayers)
print('Mean Absolute Error:', allPlayersMeanAbsoluteError)
print('Mean Squared Error:', allPlayersMeanSquaredError)
print('Root Mean Squared Error:', np.sqrt(allPlayersMeanSquaredError))

Mean Absolute Error: 0.009662962740384615
Mean Squared Error: 0.00035940332278697196
Root Mean Squared Error: 0.01895793561511833


In [50]:
# Add a widget to select a player and see predicted vs. actual performance via a dropdown box.
# For the dropdown selection to work, you must run both this cell and the next one.
# Code adapted and modified from:
# https://stackoverflow.com/questions/45754356/jupyter-notebook-widgets-create-dependent-dropdowns


# Sentinel option placed first in the dropdown; selecting it shows every row.

ALL = 'ALL'
def sortedValues(array):
    """Return the sorted unique values of `array`, preceded by the ALL sentinel.

    Parameters
    ----------
    array : pandas Series (or Index)
        Values to list in the dropdown, e.g. the player names.

    Returns
    -------
    list
        [ALL, v1, v2, ...] where v1, v2, ... are the distinct non-missing
        values of `array` in ascending order.
    """
    # Missing values (None/NaN) are dropped first: they cannot be ordered
    # against strings, so sorting them would raise a TypeError.
    unique = sorted(array.dropna().unique().tolist())
    return [ALL] + unique

# Dropdown listing every player name (with ALL first) and the output area that
# shows the rows for the current selection.
dropdownName = widgets.Dropdown(options=sortedValues(resultFrameAllPlayers['Name']))
outputName = widgets.Output()


def dropdownNameEventhandler(change):
    """Refresh outputName with the rows for the newly selected dropdown value.

    change: the change notification passed by ``observe``; only its ``new``
        attribute (the newly selected option) is read.
    """
    outputName.clear_output()
    with outputName:
        chosenName = change.new
        if chosenName != ALL:
            # A specific player was picked: show only that player's rows.
            display(resultFrameAllPlayers[resultFrameAllPlayers['Name'] == chosenName])
        else:
            display(resultFrameAllPlayers)


# Call the handler whenever the dropdown's 'value' changes, then show the dropdown.
dropdownName.observe(dropdownNameEventhandler, names='value')
display(dropdownName)

Dropdown(options=('ALL', 'A.J. Pollock', 'Aaron Judge', 'Adalberto Mondesi', 'Adam Eaton', 'Adam Engel', 'Adam…

In [51]:
# Show the output area that the dropdown handler writes the selected rows into.
display(outputName)

Output()