# Week 3 -- Data Cleaning

In [57]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import warnings
warnings.simplefilter('ignore')

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.float_format', lambda x: '%.2f' % x)

## Functions

In [49]:
def check_names(df1, df2):
    df_players = list(df1['Player'])    
    for i in range(len(df2['Player'])):
        name = df2['Player'][i]
        split_name = name.split(' ')
        if (split_name[0] + ' ' + split_name[1]) in df_players:
            df2['Player'][i] = split_name[0] + ' ' + split_name[1]
        else:
            df2['Player'][i] = name

In [50]:
def name_update(dateframe):
    dataframe['Player'] = dataframe['Player'].map(lambda x: x.strip())

In [58]:
def update_int_dtype(dataframe, column_list):
    for column in column_list:
        dataframe[column] = dataframe[column].map(lambda x: int(x))

In [59]:
def update_float_dtype(dataframe, column_list):
    for column in column_list:
        for i in range(len(dataframe[column])):
            item = dataframe[column][i]
            if item == '':
                dataframe[column][i] = '0.0'
            else:
                dataframe[column][i] = item    

In [60]:
def fill_blanks(dataframe, column_list):
    for column in column_list:
        for i in range(len(dataframe[column])):
            item = dataframe[column][i]
            if item == '':
                dataframe[column][i] = 0
            else:
                dataframe[column][i] = item

In [88]:
df = pd.read_pickle('player_stats')
fantasy_pts = pd.read_pickle('fantasy_weeks')
defense = pd.read_pickle('defense_data')

In [89]:
fantasy_pts.shape

(558, 5)

In [90]:
lastwk = pd.read_pickle('fantweeks_1_2')
lastwk

Unnamed: 0,Player,Team,Position,Ttl_Fant,Week_1,Week_2
0,Russell Wilson,SEA,QB,66.2,31.8,34.4
1,Josh Allen,BUF,QB,62.7,28.2,34.5
2,Cam Newton,NE,QB,61.3,25.7,35.6
3,Kyler Murray,ARI,QB,60.4,27.3,33.1
4,Dak Prescott,DAL,QB,57.4,17.6,39.8
5,Aaron Jones,GB,RB,55.2,13.6,41.6
6,Matt Ryan,ATL,QB,53.4,24.9,28.5
7,Aaron Rodgers,GB,QB,49.0,30.8,18.2
8,Alvin Kamara,NO,RB,48.1,18.7,29.4
9,Patrick Mahomes II,KC,QB,47.9,20.4,27.5


In [91]:
fantasy_pts = pd.merge(fantasy_pts, lastwk, 'left', on='Player')
fantasy_pts.head()

Unnamed: 0,Player,Team_x,Position_x,TTL,Week_3,Team_y,Position_y,Ttl_Fant,Week_1,Week_2
0,Russell Wilson,SEA,QB,103.0,36.8,SEA,QB,66.2,31.8,34.4
1,Josh Allen,BUF,QB,94.9,32.2,BUF,QB,62.7,28.2,34.5
2,Patrick Mahomes II,KC,QB,87.9,40.0,KC,QB,47.9,20.4,27.5
3,Dak Prescott,DAL,QB,86.9,29.5,DAL,QB,57.4,17.6,39.8
4,Kyler Murray,ARI,QB,85.1,24.7,ARI,QB,60.4,27.3,33.1


In [92]:
fantasy_pts.drop(columns=['Team_y', 'Position_y', 'Ttl_Fant'], inplace=True)
fantasy_pts.rename(columns = {'Team_x': 'Team', 'Position_x': 'Position'}, inplace = True)
fantasy_pts.columns

Index(['Player', 'Team', 'Position', 'TTL', 'Week_3', 'Week_1', 'Week_2'], dtype='object')

In [93]:
fantasy_pts = fantasy_pts[['Player', 'Team', 'Position', 'TTL', 'Week_1', 'Week_2', 'Week_3']]
fantasy_pts.head()

Unnamed: 0,Player,Team,Position,TTL,Week_1,Week_2,Week_3
0,Russell Wilson,SEA,QB,103.0,31.8,34.4,36.8
1,Josh Allen,BUF,QB,94.9,28.2,34.5,32.2
2,Patrick Mahomes II,KC,QB,87.9,20.4,27.5,40.0
3,Dak Prescott,DAL,QB,86.9,17.6,39.8,29.5
4,Kyler Murray,ARI,QB,85.1,27.3,33.1,24.7


In [94]:
print(fantasy_pts.shape)

(558, 7)


In [95]:
fantasy_pts.to_pickle('fantweeks_1_3')

In [96]:
print(df.shape)
df.head()

(461, 28)


Unnamed: 0,Player,Team,Position,Age,Games,GamesStarted,CompletedPasses,PassesAttempted,PassingYds,PassingTDs,Interceptions,RushingAttempts,RushingYds,RushingYdspAtt,RushingTDs,Targeted,Receptions,ReceivingYds,YdspReception,ReceivingTDs,Fumbles,LostFumbles,TtlTDs,TwoPTConversions,TwoPTConversionPasses,FDFantasyPts,PositionRank,OverallRank
0,Alvin Kamara,NOR,RB,25,3,2,0,0,0,0,0,31,153,4.94,3,31,27,285,10.56,3,0,0,6,,,93.3,1,1
1,Russell Wilson,SEA,QB,32,3,3,79,103,925,14,1,14,90,6.43,0,0,0,0,,0,1,0,0,,1.0,103.0,1,2
2,Aaron Jones,GNB,RB,26,3,3,0,0,0,0,0,50,303,6.06,4,18,10,95,9.5,1,1,0,5,,,74.8,2,3
3,Calvin Ridley,ATL,WR,26,3,3,0,0,0,0,0,2,6,3.0,0,35,21,349,16.62,4,0,0,4,,,70.0,1,4
4,Josh Allen,BUF,QB,24,3,3,81,114,1038,10,1,22,84,3.82,2,0,0,0,,0,3,3,2,,,94.9,2,5


In [97]:
print(fantasy_pts.shape)
fantasy_pts.head()

(558, 7)


Unnamed: 0,Player,Team,Position,TTL,Week_1,Week_2,Week_3
0,Russell Wilson,SEA,QB,103.0,31.8,34.4,36.8
1,Josh Allen,BUF,QB,94.9,28.2,34.5,32.2
2,Patrick Mahomes II,KC,QB,87.9,20.4,27.5,40.0
3,Dak Prescott,DAL,QB,86.9,17.6,39.8,29.5
4,Kyler Murray,ARI,QB,85.1,27.3,33.1,24.7


In [116]:
check_names(df, fantasy_pts)

In [117]:
name_update(df)
name_update(fantasy_pts)

In [118]:
fantasy_pts

Unnamed: 0,Player,Team,Position,TTL,Week_1,Week_2,Week_3
0,Russell Wilson,SEA,QB,103.0,31.8,34.4,36.8
1,Josh Allen,BUF,QB,94.9,28.2,34.5,32.2
2,Patrick Mahomes,KC,QB,87.9,20.4,27.5,40.0
3,Dak Prescott,DAL,QB,86.9,17.6,39.8,29.5
4,Kyler Murray,ARI,QB,85.1,27.3,33.1,24.7
5,Alvin Kamara,NO,RB,79.8,18.7,29.4,31.7
6,Aaron Rodgers,GB,QB,73.5,30.8,18.2,24.5
7,Cam Newton,NE,QB,73.5,25.7,35.6,12.2
8,Aaron Jones,GB,RB,69.8,13.6,41.6,14.6
9,Matt Ryan,ATL,QB,65.8,24.9,28.5,12.4


In [119]:
dataframe = pd.merge(df, fantasy_pts, 'left', on='Player')

In [120]:
dataframe.drop(columns = ['Team_y', 'Position_y'], inplace = True)
dataframe.rename(columns = {'Team_x': 'Team', 'Position_x': 'Position'}, inplace = True)

In [121]:
print(dataframe.shape)
dataframe.head(10)

(461, 32)


Unnamed: 0,Player,Team,Position,Age,Games,GamesStarted,CompletedPasses,PassesAttempted,PassingYds,PassingTDs,Interceptions,RushingAttempts,RushingYds,RushingYdspAtt,RushingTDs,Targeted,Receptions,ReceivingYds,YdspReception,ReceivingTDs,Fumbles,LostFumbles,TtlTDs,TwoPTConversions,TwoPTConversionPasses,FDFantasyPts,PositionRank,OverallRank,TTL,Week_1,Week_2,Week_3
0,Alvin Kamara,NOR,RB,25,3,2,0,0,0,0,0,31,153,4.94,3,31,27,285,10.56,3,0,0,6,,,93.3,1,1,79.8,18.7,29.4,31.7
1,Russell Wilson,SEA,QB,32,3,3,79,103,925,14,1,14,90,6.43,0,0,0,0,,0,1,0,0,,1.0,103.0,1,2,103.0,31.8,34.4,36.8
2,Aaron Jones,GNB,RB,26,3,3,0,0,0,0,0,50,303,6.06,4,18,10,95,9.5,1,1,0,5,,,74.8,2,3,69.8,13.6,41.6,14.6
3,Calvin Ridley,ATL,WR,26,3,3,0,0,0,0,0,2,6,3.0,0,35,21,349,16.62,4,0,0,4,,,70.0,1,4,59.5,24.9,22.9,11.7
4,Josh Allen,BUF,QB,24,3,3,81,114,1038,10,1,22,84,3.82,2,0,0,0,,0,3,3,2,,,94.9,2,5,94.9,28.2,34.5,32.2
5,Dalvin Cook,MIN,RB,25,3,3,0,0,0,0,0,48,294,6.13,4,9,5,24,4.8,0,1,1,4,3.0,,62.3,3,6,59.8,20.8,15.1,23.9
6,Patrick Mahomes,KAN,QB,25,3,3,82,121,898,9,0,10,80,8.0,1,0,0,0,,0,0,0,1,,1.0,87.9,3,7,87.9,20.4,27.5,40.0
7,Tyler Lockett,SEA,WR,28,3,3,0,0,0,0,0,0,0,,0,29,24,259,10.79,4,0,0,4,,,61.9,2,8,49.9,9.2,12.7,28.0
8,Dak Prescott,DAL,QB,27,3,3,96,143,1188,5,2,14,74,5.29,3,0,0,0,,0,2,2,3,,,86.9,4,9,86.9,17.6,39.8,29.5
9,Nick Chubb,CLE,RB,25,3,3,0,0,0,0,0,51,292,5.73,4,3,3,17,5.67,0,1,1,4,,,54.4,4,10,52.9,4.6,25.3,23.0


In [123]:
dataframe.fillna('0.0', inplace = True)

In [128]:
integers = ['Age', 'Games', 'GamesStarted', 'CompletedPasses', 'PassesAttempted', 'PassingYds', 'PassingTDs', 
            'Interceptions', 'RushingAttempts', 'RushingYds', 'RushingTDs', 'Targeted', 'Receptions', 
            'ReceivingYds', 'ReceivingTDs', 'Fumbles', 'LostFumbles', 'TtlTDs']

floats = ['RushingYdspAtt', 'YdspReception', 'FDFantasyPts', 'TTL','Week_1', 'Week_2', 'Week_3']

In [129]:
update_int_dtype(dataframe, integers)

In [130]:
two_pts = ['TwoPTConversions', 'TwoPTConversionPasses']
fill_blanks(dataframe, two_pts)
update_int_dtype(dataframe, two_pts)

In [131]:
update_float_dtype(dataframe, floats)

In [132]:
dataframe.head()

Unnamed: 0,Player,Team,Position,Age,Games,GamesStarted,CompletedPasses,PassesAttempted,PassingYds,PassingTDs,Interceptions,RushingAttempts,RushingYds,RushingYdspAtt,RushingTDs,Targeted,Receptions,ReceivingYds,YdspReception,ReceivingTDs,Fumbles,LostFumbles,TtlTDs,TwoPTConversions,TwoPTConversionPasses,FDFantasyPts,PositionRank,OverallRank,TTL,Week_1,Week_2,Week_3
0,Alvin Kamara,NOR,RB,25,3,2,0,0,0,0,0,31,153,4.94,3,31,27,285,10.56,3,0,0,6,0,0,93.3,1,1,79.8,18.7,29.4,31.7
1,Russell Wilson,SEA,QB,32,3,3,79,103,925,14,1,14,90,6.43,0,0,0,0,0.0,0,1,0,0,0,1,103.0,1,2,103.0,31.8,34.4,36.8
2,Aaron Jones,GNB,RB,26,3,3,0,0,0,0,0,50,303,6.06,4,18,10,95,9.5,1,1,0,5,0,0,74.8,2,3,69.8,13.6,41.6,14.6
3,Calvin Ridley,ATL,WR,26,3,3,0,0,0,0,0,2,6,3.0,0,35,21,349,16.62,4,0,0,4,0,0,70.0,1,4,59.5,24.9,22.9,11.7
4,Josh Allen,BUF,QB,24,3,3,81,114,1038,10,1,22,84,3.82,2,0,0,0,0.0,0,3,3,2,0,0,94.9,2,5,94.9,28.2,34.5,32.2


In [133]:
defense.head()

Unnamed: 0,Team,Ttl_Pts_Allowed,Ttl_Offense_Plays_Allowed,Yds_p_Play,Ttl_Yds,Rushing_Att,Rushing_Yds,Rushing_Yds_p_Att,Rushing_TDs,Passing_Att,Passing_Yds_p_Att,Completions,Yds_p_Completion,Passing_Yds,Passing_TDs,RZ_Att,RZ_TD,RZ_Percent,Ttl_Turnovers,Interceptions,Fumbles,Sacks
0,Baltimore Ravens,22,125,4.9,610,44,189,4.3,0,75,6.2,46,10.1,464,2,3,2,66.7%,5,2,3,6
1,Kansas City Chiefs,40,137,6.1,839,66,301,4.6,3,65,8.7,42,13.4,564,2,6,5,83.3%,2,2,0,6
2,Indianapolis Colts,45,154,4.4,676,70,280,4.0,1,75,6.1,47,9.7,454,4,6,4,66.7%,6,6,0,9
3,San Francisco 49ers,46,189,4.8,912,80,350,4.4,2,104,5.7,64,9.2,588,2,4,2,50%,4,2,2,5
4,Los Angeles Chargers,57,188,5.4,1011,71,328,4.6,1,111,6.6,72,10.1,730,3,9,2,22.2%,2,1,1,6


In [154]:
def_fantasy = fantasy_pts[fantasy_pts['Position'] == 'DST']
def_fantasy.rename(columns={'Player': 'TEAM'}, inplace = True)

In [155]:
defense.rename(columns={'Team': 'TEAM'}, inplace=True)

In [156]:
defense_df = pd.merge(defense, def_fantasy, 'left', on='TEAM')

In [157]:
defense_df.drop(columns=['Team', 'Position'], inplace=True)

In [158]:
defense_df.head()

Unnamed: 0,TEAM,Ttl_Pts_Allowed,Ttl_Offense_Plays_Allowed,Yds_p_Play,Ttl_Yds,Rushing_Att,Rushing_Yds,Rushing_Yds_p_Att,Rushing_TDs,Passing_Att,Passing_Yds_p_Att,Completions,Yds_p_Completion,Passing_Yds,Passing_TDs,RZ_Att,RZ_TD,RZ_Percent,Ttl_Turnovers,Interceptions,Fumbles,Sacks,TTL,Week_1,Week_2,Week_3
0,Baltimore Ravens,22,125,4.9,610,44,189,4.3,0,75,6.2,46,10.1,464,2,3,2,66.7%,5,2,3,6,31.0,15.0,15.0,1.0
1,Kansas City Chiefs,40,137,6.1,839,66,301,4.6,3,65,8.7,42,13.4,564,2,6,5,83.3%,2,2,0,6,19.0,7.0,5.0,7.0
2,Indianapolis Colts,45,154,4.4,676,70,280,4.0,1,75,6.1,47,9.7,454,4,6,4,66.7%,6,6,0,9,45.0,4.0,15.0,26.0
3,San Francisco 49ers,46,189,4.8,912,80,350,4.4,2,104,5.7,64,9.2,588,2,4,2,50%,4,2,2,5,21.0,4.0,5.0,12.0
4,Los Angeles Chargers,57,188,5.4,1011,71,328,4.6,1,111,6.6,72,10.1,730,3,9,2,22.2%,2,1,1,6,14.0,11.0,1.0,2.0


In [159]:
nocomma = defense_df['Ttl_Yds'][3]
nocomma.split(',')

['912']

In [165]:
comma = defense_df['Ttl_Yds'][4]
len(comma.split(','))

2

In [161]:
one, two = comma.split(',')
one + two

'1011'

In [162]:
for column in def_int:
    for i in range(len(defense_df[column]:
        if len(i.split(',')) > 1:
            one, two = i.split(',')
            i = one + two

In [163]:
def_int = ['Ttl_Pts_Allowed', 'Ttl_Offense_Plays_Allowed', 'Ttl_Yds', 'Rushing_Att', 'Rushing_Yds', 'Rushing_TDs', 
           'Passing_Att', 'Completions', 'Passing_Yds', 'Passing_TDs', 'RZ_Att', 'RZ_TD', 'Ttl_Turnovers', 
           'Interceptions', 'Fumbles', 'Sacks']

def_floats = ['Yds_p_Play', 'Rushing_Yds_p_Att', 'Passing_Yds_p_Att', 'Yds_p_Completion', 'TTL', 'Week_1', 
              'Week_2', 'Week_3']

In [164]:
update_int_dtype(defense_df, def_int)

ValueError: invalid literal for int() with base 10: '1,011'

In [300]:
for column in def_floats:
    for i in range(len(defense_df[column])):
        item = defense_df[column][i]
        if len(item) < 3:
            defense_df[column][i] = item + '.0'
        else:
            defense_df[column][i] = item

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [301]:
for column in def_floats:
    for i in range(len(defense_df[column])):
        item = str(defense_df[column][i])
        ones, tenths = item.split('.')
        ones = int(ones)
        tenths = int(tenths) * .1
        defense_df[column][i] = ones + tenths

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys


In [302]:
defense_df['RZ_Percent'] = defense_df['RZ_Percent'].map(lambda x: x.strip('%'))

In [303]:
for column in defense_df['RZ_Percent']:
    for i in range(len(defense_df['RZ_Percent'])):
        item = defense_df['RZ_Percent'][i]
        if len(item) < 3:
            defense_df['RZ_Percent'][i] = item + '.0'
        else:
            defense_df['RZ_Percent'][i] = item

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [304]:
for i in range(len(defense_df['RZ_Percent'])):
    item = str(defense_df['RZ_Percent'][i])
    ones, tenths = item.split('.')
    ones = int(ones)
    tenths = int(tenths) * .1
    defense_df['RZ_Percent'][i] = ones + tenths

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [305]:
dataframe.to_pickle('players')
defense_df.to_pickle('defense')