<a href="https://colab.research.google.com/github/matthewbegun/MXN500/blob/main/MXN500_2024_LEC_03_Py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MXN500 Lecture 3 (Python)

In [31]:
# installs, only run once!
!pip install Lahman

Collecting Lahman
  Downloading lahman-0.0.1-py2.py3-none-any.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Lahman
Successfully installed Lahman-0.0.1


In [2]:
# this is some set up code for displaying all results (like in R)
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


## Data Types in Python

Python was originally designed to not care about types as much as other programming languages.
However, the libraries we use for data analysis (`pandas` and `numpy` and anything based on them) do care and so we have to.

In [3]:
# A "string" (what R calls a character variable) is anything entered in quotes.
character_string = "This is a string"
single_character = "a"
character_list = ["a", "b", "b", "c", "a",]

# the closest Python equivalent to R's `str()` is `repr()`, but it is slightly different:
# it only shows the value, so if we want the type we need to ask for it with `type()`

type(character_string)
repr(character_string)

type(single_character)
repr(single_character)

type(character_list)
repr(character_list)

type(character_list[0])
repr(character_list[0])

str

"'This is a string'"

str

"'a'"

list

"['a', 'b', 'b', 'c', 'a']"

str

"'a'"

In [4]:
# Selecting with lists - this is actually a little bit harder in python
# we are going to use a "list comprehension" to iterate over the list and
# take our selections
index_list = [0,1,3,]
[character_list[i] for i in index_list]

# Sorting with lists
index_order = [3,0,2,]
[character_list[i] for i in index_order]

['a', 'b', 'c']

['c', 'a', 'b']

In [5]:
# A categorical variable is how `pandas` treats categories of data.
import pandas as pd # should really go at the top of the notebook...
df = pd.DataFrame({'character_list': character_list})
df["character_list"] # dtype: object means strings, not categories

# Convert the string column into a categorical column, stored alongside the original
df["cats"] = df["character_list"].astype("category")
df["cats"] # dtype: category, with three 'categories'

df["cats"].dtypes # types are for variables or values, dtypes are for data in `pandas`


0    a
1    b
2    b
3    c
4    a
Name: character_list, dtype: object

0    a
1    b
2    b
3    c
4    a
Name: cats, dtype: category
Categories (3, object): ['a', 'b', 'c']

CategoricalDtype(categories=['a', 'b', 'c'], ordered=False)

In [6]:
# To create a data frame with categorical variables, we can:
hml_df = pd.DataFrame({"category": ["High", "Medium", "Low",]}, dtype="category")
hml_df.describe()
hml_df["category"].dtypes


Unnamed: 0,category
count,3
unique,3
top,High
freq,1


CategoricalDtype(categories=['High', 'Low', 'Medium'], ordered=False)

In [7]:
# Categories default to being sorted alphabetically and unordered (no ranking between them)
# To reorder the categories you can:
hml_df["category"].cat.reorder_categories(["Low", "Medium", "High",], ordered=True)

0      High
1    Medium
2       Low
Name: category, dtype: category
Categories (3, object): ['Low' < 'Medium' < 'High']

In [8]:
# Compare the result:
hml_df["category"].cat.categories # Order we don't want.
hml_df["category"].cat.reorder_categories(["Low", "Medium", "High",], ordered=True).cat.categories # Order we want.

Index(['High', 'Low', 'Medium'], dtype='object')

Index(['Low', 'Medium', 'High'], dtype='object')

In [None]:
# In python strings do not default to categories, you have to make it happen


In [9]:
# Look what happens to these "numbers". You need to be careful to make sure your
# data is the correct type.
num_df = pd.DataFrame({'numbers': ["3.14", "-2", "15"]})
num_df.describe()
num_df.dtypes # object means strings!

# Taking the mean fails because the column is not numeric. Catch the error and
# print it, so the demonstration is still visible but "Run All" keeps going.
try:
    num_df["numbers"].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")


Unnamed: 0,numbers
count,3.0
unique,3.0
top,3.14
freq,1.0


numbers    object
dtype: object

TypeError: Could not convert 3.14-215 to numeric

In [10]:
# To make the data numeric we need to change its type
# now_numbers <- as.numeric(num_df$numbers)
# str(now_numbers)
# mean(now_numbers)
now_numbers = pd.to_numeric(num_df['numbers'], errors='coerce')
now_numbers.mean()

5.38

In [11]:
# In Python, a regular list can store objects with different types!
more_numbers = [2, 6, "*8",]
more_numbers

# A dataframe column will hold them too, but only as the catch-all `object` dtype
more_df = pd.DataFrame({'numbers': [2, 6, "*8"]})
more_df.dtypes

# Mixed data types mean an error for mean(). Catch the error and print it, so the
# demonstration is still visible but "Run All" keeps going.
try:
    more_df["numbers"].mean()
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

[2, 6, '*8']

numbers    object
dtype: object

TypeError: unsupported operand type(s) for +: 'int' and 'str'

In [12]:
# change in the data frame
num_df["numeric_numbers"] = pd.to_numeric(num_df['numbers'])
num_df["numeric_numbers"].dtypes
num_df["numeric_numbers"].describe()


dtype('float64')

count     3.000000
mean      5.380000
std       8.718555
min      -2.000000
25%       0.570000
50%       3.140000
75%       9.070000
max      15.000000
Name: numeric_numbers, dtype: float64

## Functions in Python
In Python we can define a function by using the following syntax. Python doesn't use braces `{}` for function definitions; instead it uses indentation.

```python
def name_of_function(argument_1, argument_2):
    function_logic_goes_here
    return return_value
```
The first line ends with a `:` to start the indentation block below.

Make sure all the code for the function is indented the same amount (4 spaces).

In [13]:
# Some basic examples:
def add_one(x):
    """Return `x` increased by one.

    Works for any value that supports `+ 1` (e.g. ints and floats). Inputs that
    do not, such as strings or plain lists, raise a `TypeError`.
    """
    return x + 1

repr(add_one)

# In this case, a function called "add_one" is a function which takes some
# input, x, adds one to it, and returns the result.

'<function add_one at 0x7a39a1ea0310>'

In [14]:
# Test it out
add_one(5)
x = add_one(0)
x
# Adding a number to a whole list doesn't work out of the box in Python, unlike R.
# Catch the error and print it, so the demonstration is still visible but
# "Run All" keeps going.
try:
    add_one([1,2,3,4])
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

6

1

TypeError: can only concatenate list (not "int") to list

In [15]:
# And break it (the error is caught and printed so "Run All" keeps going)
try:
    add_one("a")
except TypeError as err:
    print(f"{type(err).__name__}: {err}")
# In Python the error message is slightly more useful: it's telling us there's a type
# issue (str v int)

TypeError: can only concatenate str (not "int") to str

In [17]:
# Here's another function:
def make_negative(x):
    """Return the negation of `x`, i.e. `-x`.

    Note that this flips the sign rather than forcing a negative result, so a
    negative input comes back positive. Inputs that do not support unary minus
    (such as plain lists) raise a `TypeError`.
    """
    return -x
make_negative(-6)



6

In [18]:
# get a list of numbers
list(range(10))
# try and make them negative - a plain list can't be negated, so this errors again.
# Catch the error and print it, so the demonstration is still visible but
# "Run All" keeps going.
try:
    make_negative(list(range(10)))
except TypeError as err:
    print(f"{type(err).__name__}: {err}")

# to make functions work with arrays of numbers we need custom functions in python
# have a play with numpy arrays to see what I mean

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

TypeError: bad operand type for unary -: 'list'

In [19]:
# Check how they work.
x = 1
add_one(x)
make_negative(x)

# This `x` is different from the `x` in the function. Confusing!
# Better to use meaningful variable names.

2

-1

In [20]:
# Interested in applying them one after each other? Composition of functions:
x_new = make_negative(add_one(x))
x_new


-2

## Pipes in Python (sort of)
In `pandas` we can "pipe" using the default methods of the dataframe. Just add the next method/function until done (see example).

If we want to multiline we need to use `()` around the whole expression otherwise Python will try and check indentation.


In [29]:
# We need a dataframe or series object from pandas to use the pipe function, so our simple example doesn't work
# pandas also has tools for applying functions to columns, or series, from a dataset
num_df["numeric_numbers"].apply(add_one) # apply our `add_one` function to each value of a column
num_df["numeric_numbers"].apply(add_one).apply(make_negative) # chain apply

# multiline version - note this is common in data science but not "Pythonic"
(
  num_df["numeric_numbers"]
    .apply(add_one)
    .apply(make_negative)
)


0     4.14
1    -1.00
2    16.00
Name: numeric_numbers, dtype: float64

0    -4.14
1     1.00
2   -16.00
Name: numeric_numbers, dtype: float64

0    -4.14
1     1.00
2   -16.00
Name: numeric_numbers, dtype: float64

Let's consider some real data and real use cases. We are going to use data
from the `Lahman` package. This package is all about baseball.

In [32]:
# Only install once, to save time
# !pip install Lahman

# the library has a lowercase name for importing
import lahman

# having a look at what is included with the package, since there's no docstring or help :(
dir(lahman)

Unpacking data...


['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_accessors',
 '_fetch_data',
 '_lahman_fnames',
 'allstar_full',
 'appearances',
 'awards_managers',
 'awards_players',
 'awards_share_managers',
 'awards_share_players',
 'batting',
 'batting_post',
 'college_playing',
 'fielding',
 'fielding_of',
 'fielding_ofsplit',
 'fielding_post',
 'hall_of_fame',
 'home_games',
 'managers',
 'managers_half',
 'parks',
 'people',
 'pitching',
 'pitching_post',
 'salaries',
 'schools',
 'series_post',
 'teams',
 'teams_franchises',
 'teams_half']

In [33]:
# get the batting data and inspect it a little
batting = lahman.batting()
batting.shape
batting.head()


(108789, 22)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0


In [34]:
# Let's look at one player, Manny Ramirez
# first we create a boolean series based on our "filter"
# (built once and named, so the full-length comparison isn't recomputed for every use)
is_manny = batting["playerID"] == "ramirma02"
is_manny
# then we filter our dataframe using that boolean series,
# assigning the result so we can reuse it later
manny = batting[is_manny]
manny


0         False
1         False
2         False
3         False
4         False
          ...  
108784    False
108785    False
108786    False
108787    False
108788    False
Name: playerID, Length: 108789, dtype: bool

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
71564,ramirma02,1993,1,CLE,AL,22,53,5,9,1,...,5.0,0.0,0.0,2,8.0,0.0,0.0,0.0,0.0,3.0
72630,ramirma02,1994,1,CLE,AL,91,290,51,78,22,...,60.0,4.0,2.0,42,72.0,4.0,0.0,0.0,4.0,6.0
73824,ramirma02,1995,1,CLE,AL,137,484,85,149,26,...,107.0,6.0,6.0,75,112.0,6.0,5.0,2.0,5.0,13.0
75091,ramirma02,1996,1,CLE,AL,152,550,94,170,45,...,112.0,8.0,5.0,85,104.0,8.0,3.0,0.0,9.0,18.0
76329,ramirma02,1997,1,CLE,AL,150,561,99,184,40,...,88.0,2.0,3.0,79,115.0,5.0,7.0,0.0,4.0,19.0
77613,ramirma02,1998,1,CLE,AL,150,571,108,168,35,...,145.0,5.0,3.0,76,121.0,6.0,6.0,0.0,10.0,18.0
78926,ramirma02,1999,1,CLE,AL,147,522,131,174,34,...,165.0,2.0,4.0,96,131.0,9.0,13.0,0.0,9.0,12.0
80274,ramirma02,2000,1,CLE,AL,118,439,92,154,34,...,122.0,1.0,1.0,86,117.0,9.0,3.0,0.0,4.0,9.0
81635,ramirma02,2001,1,BOS,AL,142,529,93,162,33,...,125.0,0.0,1.0,81,147.0,25.0,8.0,0.0,2.0,9.0
82960,ramirma02,2002,1,BOS,AL,120,436,84,152,31,...,107.0,0.0,0.0,73,85.0,14.0,8.0,0.0,1.0,13.0


In [None]:
# Remember to use a logical (comparison) operator to filter.
x = 1  # Single equals assigns a value.
x == 1 # Double equals asks: are these two things equal?
x == 2

True

False

In [35]:
# What info do we have on Manny?
manny

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
71564,ramirma02,1993,1,CLE,AL,22,53,5,9,1,...,5.0,0.0,0.0,2,8.0,0.0,0.0,0.0,0.0,3.0
72630,ramirma02,1994,1,CLE,AL,91,290,51,78,22,...,60.0,4.0,2.0,42,72.0,4.0,0.0,0.0,4.0,6.0
73824,ramirma02,1995,1,CLE,AL,137,484,85,149,26,...,107.0,6.0,6.0,75,112.0,6.0,5.0,2.0,5.0,13.0
75091,ramirma02,1996,1,CLE,AL,152,550,94,170,45,...,112.0,8.0,5.0,85,104.0,8.0,3.0,0.0,9.0,18.0
76329,ramirma02,1997,1,CLE,AL,150,561,99,184,40,...,88.0,2.0,3.0,79,115.0,5.0,7.0,0.0,4.0,19.0
77613,ramirma02,1998,1,CLE,AL,150,571,108,168,35,...,145.0,5.0,3.0,76,121.0,6.0,6.0,0.0,10.0,18.0
78926,ramirma02,1999,1,CLE,AL,147,522,131,174,34,...,165.0,2.0,4.0,96,131.0,9.0,13.0,0.0,9.0,12.0
80274,ramirma02,2000,1,CLE,AL,118,439,92,154,34,...,122.0,1.0,1.0,86,117.0,9.0,3.0,0.0,4.0,9.0
81635,ramirma02,2001,1,BOS,AL,142,529,93,162,33,...,125.0,0.0,1.0,81,147.0,25.0,8.0,0.0,2.0,9.0
82960,ramirma02,2002,1,BOS,AL,120,436,84,152,31,...,107.0,0.0,0.0,73,85.0,14.0,8.0,0.0,1.0,13.0


In [36]:
# Let's look at some tools to get some summary stats:
# number of unique values
manny["yearID"].nunique()


19

In [37]:
# concatenate strings
# plus defaults to no separator
"a" + "b"

a = "hello"
b = "world"

a + " " + b


'ab'

'hello world'

In [38]:
# sums
manny["G"].sum()

2302

In [39]:
# too much info in this summary, not great formatting
manny.describe()

Unnamed: 0,yearID,stint,G,AB,R,H,2B,3B,HR,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
count,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
mean,2002.666667,1.095238,109.619048,392.571429,73.52381,122.571429,26.047619,0.952381,26.428571,87.190476,1.809524,1.571429,63.285714,86.333333,10.285714,5.190476,0.095238,4.285714,11.571429
std,5.747463,0.300793,48.211488,185.332828,38.542988,59.696375,13.335952,1.023533,15.01523,47.71333,2.249868,1.859339,30.65476,41.392431,7.843104,3.58635,0.436436,3.303678,6.734771
min,1993.0,1.0,5.0,17.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0
25%,1998.0,1.0,91.0,290.0,51.0,78.0,22.0,0.0,17.0,60.0,0.0,0.0,42.0,72.0,5.0,3.0,0.0,1.0,6.0
50%,2003.0,1.0,130.0,449.0,84.0,149.0,30.0,1.0,31.0,102.0,1.0,1.0,75.0,94.0,9.0,6.0,0.0,4.0,12.0
75%,2008.0,1.0,150.0,550.0,99.0,168.0,34.0,2.0,38.0,122.0,2.0,3.0,82.0,117.0,15.0,8.0,0.0,7.0,18.0
max,2011.0,2.0,154.0,571.0,131.0,185.0,45.0,3.0,45.0,165.0,8.0,6.0,100.0,147.0,28.0,13.0,2.0,10.0,22.0


In [118]:
# `lambda` means I'm defining a temporary function inside of another function
# f"{}" is a formatted string, which I can put variables inside of
s = manny.agg(
    span=('yearID', lambda x: f"{x.min()}-{x.max()}"),
    numYears=('yearID', 'nunique'),
    numTeams=('teamID', 'nunique'),
    tH=('H', 'sum'),
    tAB=('AB', 'sum'),
    tHR=('HR', 'sum'),
    tRBI=('RBI', 'sum')
)
s

# We can use unstack to pivot long and create an index
# use dropna() to get rid of the missing entries
s = s.unstack().dropna()
s

# calculating Batting Average (BA) is harder because agg() works on one column at a time,
# so we have to do it manually afterwards
s['BA','BA'] = s['H', 'tH',] / s['AB','tAB',]
s


Unnamed: 0,yearID,teamID,H,AB,HR,RBI
span,1993-2011,,,,,
numYears,19,,,,,
numTeams,,5.0,,,,
tH,,,2574.0,,,
tAB,,,,8244.0,,
tHR,,,,,555.0,
tRBI,,,,,,1831.0


yearID  span        1993-2011
        numYears           19
teamID  numTeams          5.0
H       tH             2574.0
AB      tAB            8244.0
HR      tHR             555.0
RBI     tRBI           1831.0
dtype: object

yearID  span        1993-2011
        numYears           19
teamID  numTeams          5.0
H       tH             2574.0
AB      tAB            8244.0
HR      tHR             555.0
RBI     tRBI           1831.0
BA      BA           0.312227
dtype: object

In [119]:
# Might be interesting to break the stats down by team:
s = manny.groupby('teamID').agg(
    span=('yearID', lambda x: f"{x.min()}-{x.max()}"),
    numYears=('yearID', 'nunique'),
    numTeams=('teamID', 'nunique'),
    tH=('H', 'sum'),
    tAB=('AB', 'sum'),
    tHR=('HR', 'sum'),
    tRBI=('RBI', 'sum')
)
# this actually makes BA much easier to get
s['BA'] = s['tH'] / s['tAB']
s

Unnamed: 0_level_0,span,numYears,numTeams,tH,tAB,tHR,tRBI,BA
teamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
BOS,2001-2008,8,1,1232,3953,274,868.0,0.311662
CHA,2010-2010,1,1,18,69,1,2.0,0.26087
CLE,1993-2000,8,1,1086,3470,236,804.0,0.312968
LAN,2008-2010,3,1,237,735,44,156.0,0.322449
TBA,2011-2011,1,1,1,17,0,1.0,0.058824


In [121]:
# Same breakdown by team, this time sorted by span so the teams appear in chronological order:
s = manny.groupby('teamID').agg(
    span=('yearID', lambda x: f"{x.min()}-{x.max()}"),
    numYears=('yearID', 'nunique'),
    numTeams=('teamID', 'nunique'),
    tH=('H', 'sum'),
    tAB=('AB', 'sum'),
    tHR=('HR', 'sum'),
    tRBI=('RBI', 'sum')
).sort_values('span')
# still have to add BA
s['BA'] = s['tH'] / s['tAB']
s

Unnamed: 0_level_0,span,numYears,numTeams,tH,tAB,tHR,tRBI,BA
teamID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
CLE,1993-2000,8,1,1086,3470,236,804.0,0.312968
BOS,2001-2008,8,1,1232,3953,274,868.0,0.311662
LAN,2008-2010,3,1,237,735,44,156.0,0.322449
CHA,2010-2010,1,1,18,69,1,2.0,0.26087
TBA,2011-2011,1,1,1,17,0,1.0,0.058824


## Joins in Python

In Python we use `merge` to do all our joins, and specify the type we want.


In [136]:
# Let's start with a toy example:
df1 = pd.DataFrame({"key": [1, 2, 3], "first_letters": ["a", "b", "c"]})
df1
df2 = pd.DataFrame({"key": [1,2, None, 4], "last_letters": ["w", "x", "y", "z"]})
df2


Unnamed: 0,key,first_letters
0,1,a
1,2,b
2,3,c


Unnamed: 0,key,last_letters
0,1.0,w
1,2.0,x
2,,y
3,4.0,z


Lots of different ways to combine this data!

In [137]:
# Rows must exist in both data sets,
# Bi-directional:
pd.merge(df1, df2, on="key")
# pandas default join is "inner"
# if the key is obvious, pandas can work it out
pd.merge(df1, df2)


Unnamed: 0,key,first_letters,last_letters
0,1,a,w
1,2,b,x


Unnamed: 0,key,first_letters,last_letters
0,1,a,w
1,2,b,x


In [138]:
# Combine everything with outer:
pd.merge(df1, df2, how='outer')


Unnamed: 0,key,first_letters,last_letters
0,1.0,a,w
1,2.0,b,x
2,3.0,c,
3,,,y
4,4.0,,z


In [140]:
# Rows must exist in the first data set.
# Directional joins - order matters:
pd.merge(df1, df2, how='left') # add df2 into df1
pd.merge(df2, df1, how='left') # add df1 into df2


Unnamed: 0,key,first_letters,last_letters
0,1,a,w
1,2,b,x
2,3,c,


Unnamed: 0,key,last_letters,first_letters
0,1.0,w,a
1,2.0,x,b
2,,y,
3,4.0,z,


In [141]:
# Now let's combine some real data sets
# We can use the people and batting data from the Lahman package
batting = lahman.batting()
batting.shape
people = lahman.people()
people.shape

(108789, 22)

(20093, 24)

In [142]:
# first let's understand what we will join
# Manny - just get his data
manny = batting[batting["playerID"]=="ramirma02"]
manny.shape

(21, 22)

In [146]:
# combine using common entries in both, naming the shared key explicitly so the
# join can never silently pick up another same-named column
inner_join_batting_people = pd.merge(manny, people, on="playerID")
# what have we accomplished? people data added to the end of each row for manny
inner_join_batting_people.head()

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,ramirma02,1993,1,CLE,AL,22,53,5,9,1,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
1,ramirma02,1994,1,CLE,AL,91,290,51,78,22,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
2,ramirma02,1995,1,CLE,AL,137,484,85,149,26,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
3,ramirma02,1996,1,CLE,AL,152,550,94,170,45,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
4,ramirma02,1997,1,CLE,AL,150,561,99,184,40,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02


In [150]:
# What if we used a left join instead? (key named explicitly, as for any real join)
left_join_batting_people = pd.merge(manny, people, how='left', on="playerID")
left_join_batting_people.shape
left_join_batting_people.head()
# same thing as the inner join, because the rows in Manny's batting data were the constraint.

(21, 45)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,ramirma02,1993,1,CLE,AL,22,53,5,9,1,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
1,ramirma02,1994,1,CLE,AL,91,290,51,78,22,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
2,ramirma02,1995,1,CLE,AL,137,484,85,149,26,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
3,ramirma02,1996,1,CLE,AL,152,550,94,170,45,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02
4,ramirma02,1997,1,CLE,AL,150,561,99,184,40,...,Ramirez,Manuel Aristides,225.0,72.0,R,R,1993-09-02,2011-04-06,ramim002,ramirma02


In [151]:
# What if we change the direction and keep every row of `people` (a right join)?
right_join_batting_people = pd.merge(manny, people, how='right', on="playerID")
right_join_batting_people.shape
right_join_batting_people.head()


# We don't just get the information about Manny.
# Can see we have no stats for the other players.
# Join direction is important!
# Have to think about what you need before you join your data.

(20113, 45)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,nameLast,nameGiven,weight,height,bats,throws,debut,finalGame,retroID,bbrefID
0,aardsda01,,,,,,,,,,...,Aardsma,David Allan,215.0,75.0,R,R,2004-04-06,2015-08-23,aardd001,aardsda01
1,aaronha01,,,,,,,,,,...,Aaron,Henry Louis,180.0,72.0,R,R,1954-04-13,1976-10-03,aaroh101,aaronha01
2,aaronto01,,,,,,,,,,...,Aaron,Tommie Lee,190.0,75.0,R,R,1962-04-10,1971-09-26,aarot101,aaronto01
3,aasedo01,,,,,,,,,,...,Aase,Donald William,190.0,75.0,R,R,1977-07-26,1990-10-03,aased001,aasedo01
4,abadan01,,,,,,,,,,...,Abad,Fausto Andres,184.0,73.0,L,L,2001-09-10,2006-04-13,abada001,abadan01
