# Predicting Baseball Salaries, Part II
## Data Cleaning

Now that I've collected my new data that I want to use to predict baseball salaries, I can get to work combining my dataframes and cleaning things up to make them workable.

I've first got to load in my libraries and read in my two pickled files.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [29]:
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

In [30]:
salaries = pd.read_pickle('salary_info')
hitting_stats = pd.read_pickle('hitting_data_v1')

In [31]:
salaries.head()

Unnamed: 0,name,team,position,salary
0,MaxScherzer,WSH,SP,42142857
1,StephenStrasburg,WSH,SP,36428571
2,MikeTrout,LAA,CF,34083333
3,ZackGreinke,ARI,SP,32421884
4,DavidPrice,BOS,SP,31000000


In [32]:
salaries.name.value_counts()

BrandonWoodruff    1
RichardUrena       1
J.A.Happ           1
RougnedOdor        1
NelsonCruz         1
                  ..
JocPederson        1
SamTuivailala      1
TysonRoss          1
CarlosMartinez     1
JasonKipnis        1
Name: name, Length: 877, dtype: int64

In [33]:
hitting_stats.head()

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing
0,Enny,Romero,Enny Romero,P,KC,4,1,1,1,1,0,0,0,0,0,0,,1.0,1.0,2.0,3.0,0
1,Kolby,Allard,Kolby Allard,P,ATL,3,1,1,1,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0
2,Kyle,Gibson,Kyle Gibson,P,MIN,1,2,2,2,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0
3,Derek,Law,Derek Law,P,SF,7,1,1,1,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0
4,Vidal,Nuno,Vidal Nuno,P,TB,1,2,0,2,0,0,0,1,0,0,0,,1.0,1.0,1.0,2.0,0


In [34]:
hitting_stats.FullName.value_counts()

Javy Guerra         2
Jose Ramirez        2
Jaime Garcia        1
Noah Syndergaard    1
Felix Hernandez     1
                   ..
Justin Anderson     1
Tzu-Wei Lin         1
Josh Harrison       1
Will Harris         1
Franchy Cordero     1
Name: FullName, Length: 1268, dtype: int64

In [35]:
hitting_stats[hitting_stats.FullName.isin(['Javy Guerra', 'Jose Ramirez'])]

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing
38,Jose,Ramirez,Jose Ramirez,3B,CLE,157,578,110,156,38,4,39,105,106,80,34,,0.27,0.387,0.552,0.939,6
670,Javy,Guerra,Javy Guerra,SS,SD,13,16,2,2,0,0,0,1,3,9,0,,0.125,0.263,0.125,0.388,0
961,Javy,Guerra,Javy Guerra,P,MIA,30,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,0.0,0.0,0
1134,Jose,Ramirez,Jose Ramirez,P,ATL,7,0,0,0,0,0,0,0,0,0,0,,0.0,0.0,0.0,0.0,0


In [36]:
hitting_stats['name'] = hitting_stats['FirstName'] + hitting_stats['LastName']
hitting_stats['name'].head()

0     EnnyRomero
1    KolbyAllard
2     KyleGibson
3       DerekLaw
4      VidalNuno
Name: name, dtype: object

In [37]:
df = pd.merge(hitting_stats, salaries, how='left', on='name')
df.head()

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing,name,team,position,salary
0,Enny,Romero,Enny Romero,P,KC,4,1,1,1,1,0,0,0,0,0,0,,1.0,1.0,2.0,3.0,0,EnnyRomero,,,
1,Kolby,Allard,Kolby Allard,P,ATL,3,1,1,1,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0,KolbyAllard,,,
2,Kyle,Gibson,Kyle Gibson,P,MIN,1,2,2,2,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0,KyleGibson,MIN,SP,8125000.0
3,Derek,Law,Derek Law,P,SF,7,1,1,1,0,0,0,0,0,0,0,,1.0,1.0,1.0,2.0,0,DerekLaw,,,
4,Vidal,Nuno,Vidal Nuno,P,TB,1,2,0,2,0,0,0,1,0,0,0,,1.0,1.0,1.0,2.0,0,VidalNuno,,,


In [38]:
df.isnull().sum()

FirstName                   0
LastName                    0
FullName                    0
Position                    0
Team                        0
Games                       0
At_Bats                     0
Runs                        0
Hits                        0
Doubles                     0
Triples                     0
Homeruns                    0
RBIs                        0
Walks                       0
Strikeouts                  0
StolenBases                 0
CaughtStealing           1270
BattingAverage              0
OnBasePercentage            0
SluggingPercentage          0
OnBaseSluggingPercent       0
CaughStealing               0
name                        0
team                      496
position                  496
salary                    496
dtype: int64

In [39]:
hitting_stats[hitting_stats.FullName == 'Max Scherzer']

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing,name
554,Max,Scherzer,Max Scherzer,P,WSH,32,70,8,17,2,0,0,6,1,14,1,,0.243,0.274,0.271,0.545,0,MaxScherzer


In [40]:
salaries[salaries.name == 'MaxScherzer']

Unnamed: 0,name,team,position,salary
0,MaxScherzer,WSH,SP,42142857


In [41]:
df[df['team'].isnull() == True]

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing,name,team,position,salary
0,Enny,Romero,Enny Romero,P,KC,4,1,1,1,1,0,0,0,0,0,0,,1.000,1.000,2.000,3.000,0,EnnyRomero,,,
1,Kolby,Allard,Kolby Allard,P,ATL,3,1,1,1,0,0,0,0,0,0,0,,1.000,1.000,1.000,2.000,0,KolbyAllard,,,
3,Derek,Law,Derek Law,P,SF,7,1,1,1,0,0,0,0,0,0,0,,1.000,1.000,1.000,2.000,0,DerekLaw,,,
4,Vidal,Nuno,Vidal Nuno,P,TB,1,2,0,2,0,0,0,1,0,0,0,,1.000,1.000,1.000,2.000,0,VidalNuno,,,
7,A.J.,Cole,A.J. Cole,P,NYY,6,3,1,1,0,0,1,1,0,1,0,,.333,.333,1.333,1.667,0,A.J.Cole,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1262,Mike,Wright Jr.,Mike Wright Jr.,P,BAL,3,1,0,0,0,0,0,0,0,0,0,,.000,.000,.000,.000,0,MikeWright Jr.,,,
1265,Mike,Zagurski,Mike Zagurski,P,MIL,2,0,0,0,0,0,0,0,0,0,0,,.000,.000,.000,.000,0,MikeZagurski,,,
1266,Daniel,Zamora,Daniel Zamora,P,NYM,15,0,0,0,0,0,0,0,0,0,0,,.000,.000,.000,.000,0,DanielZamora,,,
1267,Rob,Zastryzny,Rob Zastryzny,P,CHC,6,0,0,0,0,0,0,0,0,0,0,,.000,.000,.000,.000,0,RobZastryzny,,,


In [44]:
df[df.isnull() == True]

Unnamed: 0,FirstName,LastName,FullName,Position,Team,Games,At_Bats,Runs,Hits,Doubles,Triples,Homeruns,RBIs,Walks,Strikeouts,StolenBases,CaughtStealing,BattingAverage,OnBasePercentage,SluggingPercentage,OnBaseSluggingPercent,CaughStealing,name,team,position,salary
0,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1265,,,,,,,,,,,,,,,,,,,,,,,,,,
1266,,,,,,,,,,,,,,,,,,,,,,,,,,
1267,,,,,,,,,,,,,,,,,,,,,,,,,,
1268,,,,,,,,,,,,,,,,,,,,,,,,,,
