In [1]:
library(tidyverse)
library(repr)
library(digest)
library(gridExtra)
library(cowplot)
library(dplyr)
library(tidymodels)
library(GGally)
library(splines)

“package ‘tidyverse’ was built under R version 4.1.2”
── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.2 ──
[32m✔[39m [34mggplot2[39m 3.4.0      [32m✔[39m [34mpurrr  [39m 0.3.4 
[32m✔[39m [34mtibble [39m 3.1.8      [32m✔[39m [34mdplyr  [39m 1.0.10
[32m✔[39m [34mtidyr  [39m 1.2.0      [32m✔[39m [34mstringr[39m 1.4.0 
[32m✔[39m [34mreadr  [39m 2.1.3      [32m✔[39m [34mforcats[39m 0.5.1 
“package ‘ggplot2’ was built under R version 4.1.2”
“package ‘tibble’ was built under R version 4.1.2”
“package ‘tidyr’ was built under R version 4.1.2”
“package ‘readr’ was built under R version 4.1.2”
“package ‘dplyr’ was built under R version 4.1.2”
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
“package ‘repr’ was built under R version 4.

## Team Factor Calculations
Data from Baseball Savant and Razzball is used to calculate a "team factor". The team factor is each team's average fantasy points per game from 2022, standardized to an approximately normal distribution. The standardization deliberately departs from the usual Normal(0, 1) form: the mean is centered at 1 and the standard deviation is kept far smaller than 1, so that the factor does not transform the final predictions too drastically. The team factor is only meant to let a player's team context (surrounding players and home park) have a slight influence on the final predictions. Changes to the Blue Jays, Tigers and Mets ballpark dimensions are accounted for with an approximate estimate of how these changes will impact offensive production.

In [2]:
# Load the 2022 player-level batting file and preview its first rows.
data <- read_csv(file = "data/2022data.csv")
data %>% head()

[1m[22mNew names:
[36m•[39m `` -> `...12`
[1mRows: [22m[34m469[39m [1mColumns: [22m[34m12[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (2): last_name, first_name
[32mdbl[39m (9): player_id, year, b_strikeout, b_walk, b_rbi, b_total_bases, r_total...
[33mlgl[39m (1): ...12

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


last_name,first_name,player_id,year,b_strikeout,b_walk,b_rbi,b_total_bases,r_total_stolen_base,b_game,r_run,...12
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>
Pujols,Albert,405395,2022,55,28,68,169,1,109,42,
Cabrera,Miguel,408234,2022,101,28,43,126,1,112,25,
Molina,Yadier,425877,2022,40,5,24,79,2,78,19,
Cano,Robinson,429664,2022,25,4,4,19,0,33,5,
Suzuki,Kurt,435559,2022,29,15,15,41,0,51,10,
Cruz Jr.,Nelson,443558,2022,119,49,64,151,4,124,50,


In [3]:
# Drop the player id and the unnamed column that readr labelled `...12`;
# all other columns are kept.
selected <- select(data, -all_of(c("player_id", "...12")))
selected %>% head()

last_name,first_name,year,b_strikeout,b_walk,b_rbi,b_total_bases,r_total_stolen_base,b_game,r_run
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Pujols,Albert,2022,55,28,68,169,1,109,42
Cabrera,Miguel,2022,101,28,43,126,1,112,25
Molina,Yadier,2022,40,5,24,79,2,78,19
Cano,Robinson,2022,25,4,4,19,0,33,5
Suzuki,Kurt,2022,29,15,15,41,0,51,10
Cruz Jr.,Nelson,2022,119,49,64,151,4,124,50


In [4]:
# Build one row per player keyed as "Last, First", shorten the stat column
# names, and compute fantasy points per game:
#   Fpointsg = (BB + RBI + TB + SB + R - K) / G
# NOTE(review): `names` shadows base::names() as a variable; the name is kept
# because later cells refer to it.
names <- selected %>%
  unite("player", last_name:first_name, sep = ", ", remove = TRUE) %>%
  rename(
    "K" = "b_strikeout",
    "BB" = "b_walk",
    "RBI" = "b_rbi",
    "TB" = "b_total_bases",
    "SB" = "r_total_stolen_base",
    "R" = "r_run",
    "G" = "b_game"
  ) %>%
  mutate(Fpointsg = (BB + RBI + TB + SB + R - K) / G)

head(names)

player,year,K,BB,RBI,TB,SB,G,R,Fpointsg
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"Pujols, Albert",2022,55,28,68,169,1,109,42,2.3211009
"Cabrera, Miguel",2022,101,28,43,126,1,112,25,1.0892857
"Molina, Yadier",2022,40,5,24,79,2,78,19,1.1410256
"Cano, Robinson",2022,25,4,4,19,0,33,5,0.2121212
"Suzuki, Kurt",2022,29,15,15,41,0,51,10,1.0196078
"Cruz Jr., Nelson",2022,119,49,64,151,4,124,50,1.6048387


In [5]:
# Read the Razzball projections file, keeping only the player name and team.
teamdata <- read_csv(file = "data/Razzpreds.csv") %>%
  select(Name, Team)
teamdata %>% head()

[1mRows: [22m[34m743[39m [1mColumns: [22m[34m28[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m  (5): Name, Team, Bats, ESPN, YAHOO
[32mdbl[39m (23): #, G, PA, AB, R, HR, RBI, SB, H, 1B, 2B, 3B, TB, SO, BB, HBP, SF, ...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


Name,Team
<chr>,<chr>
Juan Soto,SD
Jose Altuve,HOU
Mookie Betts,LAD
Trea Turner,PHI
Freddie Freeman,LAD
Bo Bichette,TOR


In [6]:
# Split `Name` at its first space: everything before it becomes First_Name,
# everything after it (which may itself contain spaces) becomes Last_Name.
names1 <- teamdata %>%
  tidyr::extract(
    col = Name,
    into = c("First_Name", "Last_Name"),
    regex = "([^ ]+) (.*)"
  )
names1 %>% head()

First_Name,Last_Name,Team
<chr>,<chr>,<chr>
Juan,Soto,SD
Jose,Altuve,HOU
Mookie,Betts,LAD
Trea,Turner,PHI
Freddie,Freeman,LAD
Bo,Bichette,TOR


In [7]:
# Recombine the name parts as "Last, First" so the key has the same format as
# the `player` column built from the 2022 stats.
real <- names1 %>%
  unite("player", Last_Name:First_Name, sep = ", ", remove = TRUE)
head(real)

player,Team
<chr>,<chr>
"Soto, Juan",SD
"Altuve, Jose",HOU
"Betts, Mookie",LAD
"Turner, Trea",PHI
"Freeman, Freddie",LAD
"Bichette, Bo",TOR


In [8]:
# Save the player-to-team lookup table.
write_csv(x = real, file = "data/playertm.csv")

In [9]:
# Attach each player's team to their 2022 stats. The right join keeps every
# row of `names`; players without a team match get Team = NA and are dropped,
# as are players tagged "FA" (presumably free agents — confirm).
# Missing teams are tested with is.na() explicitly rather than relying on
# filter() discarding the NA produced by comparing against the string "NA".
# The literal "NA" exclusion is retained in case the file is ever read with a
# non-default `na` setting.
full <- right_join(real, names, by = "player") %>%
  filter(!is.na(Team), !Team %in% c("NA", "FA"))
head(full)

player,Team,year,K,BB,RBI,TB,SB,G,R,Fpointsg
<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
"Soto, Juan",SD,2022,96,135,62,237,6,153,93,2.856209
"Altuve, Jose",HOU,2022,87,66,57,281,18,141,103,3.106383
"Betts, Mookie",LAD,2022,104,55,82,305,12,142,117,3.288732
"Turner, Trea",PHI,2022,131,45,100,304,27,160,101,2.7875
"Freeman, Freddie",LAD,2022,102,84,100,313,13,159,117,3.301887
"Bichette, Bo",TOR,2022,155,41,93,306,13,159,91,2.446541


In [10]:
# Per-team mean of player fantasy points per game; the mean is divided by
# five to assist with scaling.
team_avg <- full %>%
  group_by(Team) %>%
  summarise(avg = mean(Fpointsg) / 5)
team_avg %>% head()

Team,avg
<chr>,<dbl>
ARI,0.3016269
ATL,0.3421113
BAL,0.2927034
BOS,0.3238261
CHC,0.2872872
CHW,0.3063281


In [11]:
# Standardize the team averages around 1: subtract the mean, divide by
# sd^(1/100), then add 1. Dividing by sd^(1/100) rather than sd leaves the
# result with mean 1 and standard deviation sd^(99/100).
team_avg <- team_avg %>%
  mutate(standardized = (avg - mean(avg)) / sd(avg)^(1 / 100) + 1)
team_avg

Team,avg,standardized
<chr>,<dbl>,<dbl>
ARI,0.3016269,0.9794853
ATL,0.3421113,1.0212407
BAL,0.2927034,0.9702816
BOS,0.3238261,1.0023814
CHC,0.2872872,0.9646953
CHW,0.3063281,0.9843341
CIN,0.2405591,0.9165001
CLE,0.3730269,1.053127
COL,0.3410039,1.0200986
DET,0.2446115,0.9206798


In [12]:
# Adding impacts from new park dimensions.
# The bumps are keyed by team code instead of row position, so they stay
# attached to the intended club even if the set or order of teams changes.
# NOTE(review): the original used rows 10, 29 and 18. Row 10 is DET in the
# printed table; 29 and 18 are presumed to be TOR and NYM (alphabetical order
# of 30 teams, matching the Blue Jays/Tigers/Mets note) — confirm the codes.
park_teams <- c("DET", "TOR", "NYM")
park_bumps <- c(0.03, 0.03, 0.015)

# Fail loudly rather than silently skipping or misplacing an adjustment.
stopifnot(all(park_teams %in% team_avg$Team))

park_rows <- match(park_teams, team_avg$Team)
team_avg$standardized[park_rows] <- team_avg$standardized[park_rows] + park_bumps
team_avg

Team,avg,standardized
<chr>,<dbl>,<dbl>
ARI,0.3016269,0.9794853
ATL,0.3421113,1.0212407
BAL,0.2927034,0.9702816
BOS,0.3238261,1.0023814
CHC,0.2872872,0.9646953
CHW,0.3063281,0.9843341
CIN,0.2405591,0.9165001
CLE,0.3730269,1.053127
COL,0.3410039,1.0200986
DET,0.2446115,0.9506798


In [13]:
# Save the per-team factors (raw average and standardized value).
write_csv(x = team_avg, file = "data/teamfactor.csv")