In [1]:
# Import dependencies

import pandas as pd
import numpy as np
pd.set_option('max_colwidth', 400)

In [2]:
# Extract the Data, separating on the semicolon
hitting_df = pd.read_csv('Resources/Raw-Data/2023 MLB Player Stats - Batting.csv', encoding='iso-8859-1', sep=';')
hitting_df.head()

Unnamed: 0,Rk,Name,Age,Tm,Lg,G,PA,AB,R,H,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,1,CJ Abrams*,22,WSN,NL,89,340,316,47,82,...,0.306,0.434,0.739,105,137,5,8,3,0,0
1,2,José Abreu,36,HOU,AL,95,400,368,33,90,...,0.293,0.353,0.646,79,130,11,3,0,5,1
2,3,Ronald Acuna Jr.,25,ATL,NL,97,446,391,86,129,...,0.408,0.578,0.986,160,226,7,4,0,2,2
3,4,Willy Adames,27,MIL,NL,89,383,336,44,71,...,0.291,0.411,0.702,90,138,9,3,0,5,0
4,5,Riley Adams,27,WSN,NL,23,87,79,4,22,...,0.337,0.506,0.844,133,40,4,1,1,0,0


In [3]:
# Generate a summary of our Hitting dataframe
hitting_df.info

<bound method DataFrame.info of       Rk               Name  Age   Tm  Lg   G   PA   AB   R    H  ...    OBP  \
0      1         CJ Abrams*   22  WSN  NL  89  340  316  47   82  ...  0.306   
1      2         José Abreu   36  HOU  AL  95  400  368  33   90  ...  0.293   
2      3   Ronald Acuna Jr.   25  ATL  NL  97  446  391  86  129  ...  0.408   
3      4       Willy Adames   27  MIL  NL  89  383  336  44   71  ...  0.291   
4      5        Riley Adams   27  WSN  NL  23   87   79   4   22  ...  0.337   
..   ...                ...  ...  ...  ..  ..  ...  ...  ..  ...  ...    ...   
690  691  Masataka Yoshida*   29  BOS  AL  86  370  335  52  107  ...  0.381   
691  692        Alex Young*   29  CIN  NL   2    0    0   0    0  ...  0.000   
692  693       Jared Young*   27  CHC  NL  13   39   35   7    6  ...  0.256   
693  694        Seby Zavala   29  CHW  AL  59  156  143  14   23  ...  0.208   
694  695        Mike Zunino   32  CLE  AL  42  140  124  11   22  ...  0.271   

       

In [4]:
# Grab the data types of each column 
hitting_df.dtypes

Rk        int64
Name     object
Age       int64
Tm       object
Lg       object
G         int64
PA        int64
AB        int64
R         int64
H         int64
2B        int64
3B        int64
HR        int64
RBI       int64
SB        int64
CS        int64
BB        int64
SO        int64
BA      float64
OBP     float64
SLG     float64
OPS     float64
OPS+      int64
TB        int64
GDP       int64
HBP       int64
SH        int64
SF        int64
IBB       int64
dtype: object

In [5]:
# Grab a list of the columns 
hitting_df.columns

Index(['Rk', 'Name', 'Age', 'Tm', 'Lg', 'G', 'PA', 'AB', 'R', 'H', '2B', '3B',
       'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS', 'OPS+',
       'TB', 'GDP', 'HBP', 'SH', 'SF', 'IBB'],
      dtype='object')

In [6]:
# Drop the "Rk" ("Rank") Colummn 
hitting_df.drop(columns=['Rk'], inplace=True)
hitting_df.tail()

Unnamed: 0,Name,Age,Tm,Lg,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
690,Masataka Yoshida*,29,BOS,AL,86,370,335,52,107,22,...,0.381,0.501,0.883,135,168,11,6,0,1,0
691,Alex Young*,29,CIN,NL,2,0,0,0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0
692,Jared Young*,27,CHC,NL,13,39,35,7,6,0,...,0.256,0.371,0.628,67,13,1,1,0,0,0
693,Seby Zavala,29,CHW,AL,59,156,143,14,23,3,...,0.208,0.308,0.515,40,44,3,1,2,2,0
694,Mike Zunino,32,CLE,AL,42,140,124,11,22,7,...,0.271,0.306,0.578,63,38,3,1,0,0,0


In [7]:
# Spell out the "Lg" and "Tm" columns and then preview updated data frame

hitting_df = hitting_df.rename(columns={'Tm': 'Team'})
hitting_df = hitting_df.rename(columns={'Lg': 'League'})
hitting_df.head()

Unnamed: 0,Name,Age,Team,League,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,CJ Abrams*,22,WSN,NL,89,340,316,47,82,17,...,0.306,0.434,0.739,105,137,5,8,3,0,0
1,José Abreu,36,HOU,AL,95,400,368,33,90,16,...,0.293,0.353,0.646,79,130,11,3,0,5,1
2,Ronald Acuna Jr.,25,ATL,NL,97,446,391,86,129,26,...,0.408,0.578,0.986,160,226,7,4,0,2,2
3,Willy Adames,27,MIL,NL,89,383,336,44,71,16,...,0.291,0.411,0.702,90,138,9,3,0,5,0
4,Riley Adams,27,WSN,NL,23,87,79,4,22,5,...,0.337,0.506,0.844,133,40,4,1,1,0,0


In [8]:
# # Clean up the names - Remove odd characters & then preview the first 20 names

hitting_df['Name'] = hitting_df['Name'].str.replace('¬†', ' ')
hitting_df['Name'] = hitting_df['Name'].str.replace('√©', ' ')
hitting_df['Name'] = hitting_df['Name'].str.replace('√∫', ' ')
hitting_df['Name'] = hitting_df['Name'].str.replace('#', ' ')
hitting_df['Name'] = hitting_df['Name'].str.replace('*', ' ')

names = hitting_df['Name']
names

0             CJ Abrams 
1             José Abreu
2       Ronald Acuna Jr.
3           Willy Adames
4            Riley Adams
             ...        
690    Masataka Yoshida 
691          Alex Young 
692         Jared Young 
693          Seby Zavala
694          Mike Zunino
Name: Name, Length: 695, dtype: object

In [9]:
# Extract all the Teams - ensure all 30 teams (and no more) are included
teams = hitting_df['Team'].unique()

# Display all unique values
teams

array(['WSN', 'HOU', 'ATL', 'MIL', 'SEA', 'LAA', 'OAK', 'ARI', 'CHW',
       'SFG', 'TOT', 'COL', 'BOS', 'NYY', 'NYM', 'MIA', 'CHC', 'PIT',
       'TBR', 'STL', 'CLE', 'SDP', 'DET', 'BAL', 'KCR', 'LAD', 'CIN',
       'TOR', 'PHI', 'MIN', 'TEX'], dtype=object)

'TOT' is short for "Total" and is used for players who played on multiple teams in 2023.

In [10]:
# Extract all the Leagues
leagues = hitting_df['League'].unique()

# Display all unique values
leagues

array(['NL', 'AL', 'MLB'], dtype=object)

'MLB' is paired with 'TOT' from above - For players who played on multiple teams, the data will include stats for their team. with each team along with a combined. row to total up. their 2023 stats.

In [11]:
# Search for null values
nulls = hitting_df.isnull()

# Display the DataFrame indicating null values
nulls

Unnamed: 0,Name,Age,Team,League,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
691,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
692,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
693,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Drop all rows containing null values
cleaned_hitting_df = hitting_df.dropna()

# Display the cleaned DataFrame
cleaned_hitting_df

Unnamed: 0,Name,Age,Team,League,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,CJ Abrams,22,WSN,NL,89,340,316,47,82,17,...,0.306,0.434,0.739,105,137,5,8,3,0,0
1,José Abreu,36,HOU,AL,95,400,368,33,90,16,...,0.293,0.353,0.646,79,130,11,3,0,5,1
2,Ronald Acuna Jr.,25,ATL,NL,97,446,391,86,129,26,...,0.408,0.578,0.986,160,226,7,4,0,2,2
3,Willy Adames,27,MIL,NL,89,383,336,44,71,16,...,0.291,0.411,0.702,90,138,9,3,0,5,0
4,Riley Adams,27,WSN,NL,23,87,79,4,22,5,...,0.337,0.506,0.844,133,40,4,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,Masataka Yoshida,29,BOS,AL,86,370,335,52,107,22,...,0.381,0.501,0.883,135,168,11,6,0,1,0
691,Alex Young,29,CIN,NL,2,0,0,0,0,0,...,0.000,0.000,0.000,0,0,0,0,0,0,0
692,Jared Young,27,CHC,NL,13,39,35,7,6,0,...,0.256,0.371,0.628,67,13,1,1,0,0,0
693,Seby Zavala,29,CHW,AL,59,156,143,14,23,3,...,0.208,0.308,0.515,40,44,3,1,2,2,0


In [13]:
# Double check the new dataframe for nulls

null_counts = cleaned_hitting_df.isnull().sum()
null_counts

Name      0
Age       0
Team      0
League    0
G         0
PA        0
AB        0
R         0
H         0
2B        0
3B        0
HR        0
RBI       0
SB        0
CS        0
BB        0
SO        0
BA        0
OBP       0
SLG       0
OPS       0
OPS+      0
TB        0
GDP       0
HBP       0
SH        0
SF        0
IBB       0
dtype: int64

In [14]:
# FINAL CLEANED DATA FRAME 

cleaned_hitting_df

Unnamed: 0,Name,Age,Team,League,G,PA,AB,R,H,2B,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,CJ Abrams,22,WSN,NL,89,340,316,47,82,17,...,0.306,0.434,0.739,105,137,5,8,3,0,0
1,José Abreu,36,HOU,AL,95,400,368,33,90,16,...,0.293,0.353,0.646,79,130,11,3,0,5,1
2,Ronald Acuna Jr.,25,ATL,NL,97,446,391,86,129,26,...,0.408,0.578,0.986,160,226,7,4,0,2,2
3,Willy Adames,27,MIL,NL,89,383,336,44,71,16,...,0.291,0.411,0.702,90,138,9,3,0,5,0
4,Riley Adams,27,WSN,NL,23,87,79,4,22,5,...,0.337,0.506,0.844,133,40,4,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
690,Masataka Yoshida,29,BOS,AL,86,370,335,52,107,22,...,0.381,0.501,0.883,135,168,11,6,0,1,0
691,Alex Young,29,CIN,NL,2,0,0,0,0,0,...,0.000,0.000,0.000,0,0,0,0,0,0,0
692,Jared Young,27,CHC,NL,13,39,35,7,6,0,...,0.256,0.371,0.628,67,13,1,1,0,0,0
693,Seby Zavala,29,CHW,AL,59,156,143,14,23,3,...,0.208,0.308,0.515,40,44,3,1,2,2,0


In [16]:
# Export the new CSV for our Cleaned-Data folder

# Define the file path for the CSV file in the "cleaned-data" folder
output_file_path = 'Resources/Cleaned-Data/cleaned_hitting_data.csv'

# Export the DataFrame to a CSV file
cleaned_hitting_df.to_csv('hitting_data.csv', encoding='utf-8-sig', index=False)