# CS544 Foundations of Analytics
# Professor Suresh Kalathur
# Final Project - Mike Zhong

In [1]:
import pandas as pd

In [2]:
# Notebook authorship metadata
__author__ = "Mike Zhong"

## Part 3) 
## The merged data frame is missing an important column: the player's position. Since each fantasy team has roster requirements, it is crucial that we know each player's position. To get it, I roughly scraped a separate site and cleaned up the data into two columns (not shown): the player's name and position. We can now join the two dataframes on the player's name.

In [3]:
# Load the merged weekly player stats and preview the first rows
df = pd.read_csv("./data/merged.csv")
df.head()

Unnamed: 0,Player,Team,Opponent,Location,Pts*,Att,Cmp,Yds,TD,Int,...,Yds.1,TD.1,2Pt.1,Rec,Yds.2,TD.2,2Pt.2,FL,TD.3,Week
0,Andrew Luck,IND,@DET,away,43,47,31,385,4,0,...,21,0,0,0,0,0,0,0,0,1
1,Drew Brees,NO,@OAK,away,39,42,28,423,4,0,...,5,0,0,0,0,0,0,1,0,1
2,AJ Green,CIN,NYJ,home,36,0,0,0,0,0,...,0,0,0,12,180,1,0,0,0,1
3,DeAngelo Williams,PIT,WAS,home,34,0,0,0,0,0,...,143,2,0,6,28,0,0,0,0,1
4,Jameis Winston,TB,ATL,home,33,32,23,281,4,1,...,3,0,0,0,0,0,0,0,0,1


In [4]:
# Load the scraped player-name -> position lookup table and preview the first rows
player_pos = pd.read_csv("./data/player_pos.csv")
player_pos.head()

Unnamed: 0,Player,Position
0,David Johnson,RB
1,Ezekiel Elliott,RB
2,LeSean McCoy,RB
3,DeMarco Murray,RB
4,Le'Veon Bell,RB


In [5]:
# Left join on 'Player': every row of df is kept, and Position is NaN
# wherever player_pos has no row with a matching name.
new_df = pd.merge(df, player_pos, on='Player', how='left')
new_df.head(20)

Unnamed: 0,Player,Team,Opponent,Location,Pts*,Att,Cmp,Yds,TD,Int,...,TD.1,2Pt.1,Rec,Yds.2,TD.2,2Pt.2,FL,TD.3,Week,Position
0,Andrew Luck,IND,@DET,away,43,47,31,385,4,0,...,0,0,0,0,0,0,0,0,1,QB
1,Drew Brees,NO,@OAK,away,39,42,28,423,4,0,...,0,0,0,0,0,0,1,0,1,QB
2,AJ Green,CIN,NYJ,home,36,0,0,0,0,0,...,0,0,12,180,1,0,0,0,1,WR
3,DeAngelo Williams,PIT,WAS,home,34,0,0,0,0,0,...,2,0,6,28,0,0,0,0,1,RB
4,Jameis Winston,TB,ATL,home,33,32,23,281,4,1,...,0,0,0,0,0,0,0,0,1,QB
5,Brandin Cooks,NO,@OAK,away,33,0,0,0,0,0,...,0,0,6,143,2,0,0,0,1,WR
6,Spencer Ware,KC,@SD,away,32,0,0,0,0,0,...,1,0,7,129,0,0,0,0,1,RB
7,Willie Snead,NO,@OAK,away,32,0,0,0,0,0,...,0,0,9,172,1,0,0,0,1,WR
8,Antonio Brown,PIT,WAS,home,32,0,0,0,0,0,...,0,0,8,126,2,0,0,0,1,WR
9,Matthew Stafford,DET,IND,home,31,39,31,340,3,0,...,0,0,0,0,0,0,0,0,1,QB


### Looks pretty good by initial inspection, but we want to make sure there are no missing values in the new Position column.

In [6]:
# count the rows whose Position was not filled in by the join
new_df['Position'].isna().sum()

23

In [7]:
# 23 positions are still missing; build a boolean mask of those rows
# and display them to see which players are affected
null_vec = new_df['Position'].isna()
new_df.loc[null_vec]

Unnamed: 0,Player,Team,Opponent,Location,Pts*,Att,Cmp,Yds,TD,Int,...,TD.1,2Pt.1,Rec,Yds.2,TD.2,2Pt.2,FL,TD.3,Week,Position
163,Odell Beckham Jr,NYG,@NO,away,16,0,0,0,0,0,...,0,0,8,86,0,0,0,0,2,
248,Odell Beckham Jr,NYG,@WAS,away,19,0,0,0,0,0,...,0,0,7,121,0,0,0,0,3,
365,Philly Brown,CAR,ATL,home,15,0,0,0,0,0,...,0,0,5,48,1,0,0,0,4,
462,Odell Beckham Jr,NYG,GB,home,16,0,0,0,0,0,...,0,0,5,56,1,0,0,0,5,
505,Odell Beckham Jr,NYG,@BAL,away,41,0,0,0,0,0,...,0,0,8,222,2,0,1,0,6,
775,Rob Kelley,WAS,CIN,home,14,0,0,0,0,0,...,1,0,0,0,0,0,0,0,8,
843,Odell Beckham Jr,NYG,@PHI,away,20,0,0,0,0,0,...,0,0,4,46,2,0,0,0,9,
929,Odell Beckham Jr,NYG,@CIN,away,25,0,0,0,0,0,...,0,0,10,97,1,0,0,0,10,
1000,Rob Kelley,WAS,@MIN,away,10,0,0,0,0,0,...,0,0,1,-2,0,0,0,0,10,
1016,Rob Kelley,WAS,@GB,away,31,0,0,0,0,0,...,3,0,0,0,0,0,0,0,11,


### Upon closer inspection, it is obvious why they are missing: differences in naming schemes, the use of nicknames, and the fact that my original script only took first and last names, excluding suffixes like Jr or III. Of course, with some football knowledge, we can enter these players into our player position dataframe and fix it.

In [8]:
# Hand-entered (name, position) pairs for the players the scraped list missed.
# One pair per row; the columns keep the default labels 0 and 1.
temp_df = pd.DataFrame([
    ('Odell Beckham Jr', 'WR'),
    ('Rob Kelley', 'RB'),
    ('Philly Brown', 'WR'),
    ('Ted Ginn Jr', 'WR'),
    ('Robert Griffin III', 'QB'),
])

temp_df

Unnamed: 0,0,1
0,Odell Beckham Jr,WR
1,Rob Kelley,RB
2,Philly Brown,WR
3,Ted Ginn Jr,WR
4,Robert Griffin III,QB


In [9]:
# label the two columns to match player_pos so the frames can be combined
temp_df.columns = ['Player', 'Position']
temp_df

Unnamed: 0,Player,Position
0,Odell Beckham Jr,WR
1,Rob Kelley,RB
2,Philly Brown,WR
3,Ted Ginn Jr,WR
4,Robert Griffin III,QB


In [10]:
# Stack the manually entered players under the scraped ones.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent. ignore_index=True renumbers the combined rows
# 0..n-1, exactly as append(..., ignore_index=True) did.
player_pos = pd.concat([player_pos, temp_df], ignore_index=True)

In [11]:
# sanity check: the manually added player should now appear in the lookup table
player_pos.loc[player_pos['Player'].eq('Odell Beckham Jr')]

Unnamed: 0,Player,Position
584,Odell Beckham Jr,WR


### Now we can repeat the steps from before and join the main dataframe with the new player position data frame

In [12]:
# Redo the left join on 'Player', now against the player_pos table that
# includes the manually added rows.
new_df = pd.merge(df, player_pos, on='Player', how='left')
new_df.head(20)

Unnamed: 0,Player,Team,Opponent,Location,Pts*,Att,Cmp,Yds,TD,Int,...,TD.1,2Pt.1,Rec,Yds.2,TD.2,2Pt.2,FL,TD.3,Week,Position
0,Andrew Luck,IND,@DET,away,43,47,31,385,4,0,...,0,0,0,0,0,0,0,0,1,QB
1,Drew Brees,NO,@OAK,away,39,42,28,423,4,0,...,0,0,0,0,0,0,1,0,1,QB
2,AJ Green,CIN,NYJ,home,36,0,0,0,0,0,...,0,0,12,180,1,0,0,0,1,WR
3,DeAngelo Williams,PIT,WAS,home,34,0,0,0,0,0,...,2,0,6,28,0,0,0,0,1,RB
4,Jameis Winston,TB,ATL,home,33,32,23,281,4,1,...,0,0,0,0,0,0,0,0,1,QB
5,Brandin Cooks,NO,@OAK,away,33,0,0,0,0,0,...,0,0,6,143,2,0,0,0,1,WR
6,Spencer Ware,KC,@SD,away,32,0,0,0,0,0,...,1,0,7,129,0,0,0,0,1,RB
7,Willie Snead,NO,@OAK,away,32,0,0,0,0,0,...,0,0,9,172,1,0,0,0,1,WR
8,Antonio Brown,PIT,WAS,home,32,0,0,0,0,0,...,0,0,8,126,2,0,0,0,1,WR
9,Matthew Stafford,DET,IND,home,31,39,31,340,3,0,...,0,0,0,0,0,0,0,0,1,QB


In [13]:
# check to see if any of the values are missing:
# evaluates to True if at least one cell in any column of new_df is null
new_df.isnull().values.any()

False

### Great! We have no missing values, but the column names aren't very descriptive. Let's rename some of the columns.

In [15]:
# inspect the current column labels
new_df.columns

Index(['Player', 'Team', 'Opponent', 'Location', 'Pts*', 'Att', 'Cmp', 'Yds',
       'TD', 'Int', '2Pt', 'Att.1', 'Yds.1', 'TD.1', '2Pt.1', 'Rec', 'Yds.2',
       'TD.2', '2Pt.2', 'FL', 'TD.3', 'Week', 'Position'],
      dtype='object')

### By inspection, we can see that Att.1 and all .1 columns correspond to rushing numbers. Similarly, .2 corresponds to receiving numbers and .3 is for miscellaneous stats. We can now rename them to something more friendly. NOTE: FL stands for fumbles lost; each fumble lost results in -2 points.

In [16]:
# Rename by label instead of assigning a positional list of all 23 names:
# a positional list silently mislabels columns if their order ever changes
# and must match the column count exactly. Columns not listed here
# (Player, Team, Att, Cmp, Yds, TD, Int, 2Pt, Rec, FL, Week, ...) keep their names.
# Suffix convention: .1 -> rushing, .2 -> receiving, .3 -> miscellaneous.
new_df = new_df.rename(columns={
    'Pts*': 'Pts',
    'Att.1': 'Rush Att',
    'Yds.1': 'Rush Yds',
    'TD.1': 'Rush TD',
    '2Pt.1': 'Rush 2Pt',
    'Yds.2': 'Rec Yds',
    'TD.2': 'Rec TD',
    '2Pt.2': 'Rec 2Pt',
    'TD.3': 'Misc TD',
})

In [18]:
# save dataframe for use in further analysis
# new_df.to_csv("./data/complete_data.csv", index=False)