# Cleaning Defense Dataset

Author: Ermina Mujan

Date: April 19, 2021

This dataset is from Kaggle user mur418. It contains data on NFL team defenses (D/ST): 2019 actual stats and 2020 projections.

### Import packages and data

In [1]:
import pandas as pd # we are using pandas for data manipulation and analysis

import missingno as msno # we are using missingno to visualize the distribution of NaN(Not a Number) values

In [2]:
# Load the raw Kaggle defense stats/projections CSV into a pandas DataFrame.
# The path is relative, so it resolves against the notebook's working directory.

df = pd.read_csv(filepath_or_buffer='../../fixtures/raw_data/defense_stats_and_projections_kaggle.csv')

### Inspect the dataset's shape, head, and column names

In [3]:
# record the dataset's dimensions as a (row count, column count) tuple

shape_info = (len(df.index), len(df.columns)) # same pair that df.shape reports

In [8]:
# Report the dataset size in a sentence: the first placeholder receives the row
# count and the second the column count, unpacked in order from shape_info.
print(
    'This dataset evaluates {} defensive teams based on data from {} columns'.format(*shape_info)
)

This dataset evaluates 32 defensive teams based on data from 17 columns


In [5]:
# preview the first five rows of the defense dataset
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,TEAM NAME,2019 SCK,2019 INT,2019 FR,2019 TD,2019 PA,2019 YA,2019 FPTS,2020 SCK,2020 INT,2020 FR,2020 TD,2020 PA,2020 YA,2020 FPTS,2020 OUTLOOK
0,1,Steelers D/ST,54,20,18,4,285,4866,181.0,44,13,12,3,303,5443,137.08,"Following a rough start to the 2019 season, Pi..."
1,2,Bills D/ST,44,14,9,1,251,4772,131.0,41,13,11,3,306,5352,134.16,Yet another strong performance from Sean McDer...
2,3,49ers D/ST,48,12,15,5,296,4509,164.0,44,13,13,3,321,5489,124.74,After four consecutive seasons finishing 29th ...
3,4,Colts D/ST,41,15,8,3,355,5549,114.0,42,11,10,3,314,5446,122.57,The Colts' D/ST has finished 13th in fantasy p...
4,5,Patriots D/ST,47,25,11,7,195,4414,225.0,41,14,8,2,314,5438,122.18,The Patriots were the top-scoring fantasy D/ST...


In [6]:
# preview the last five rows of the defense dataset
df.tail(n=5)

Unnamed: 0.1,Unnamed: 0,TEAM NAME,2019 SCK,2019 INT,2019 FR,2019 TD,2019 PA,2019 YA,2019 FPTS,2020 SCK,2020 INT,2020 FR,2020 TD,2020 PA,2020 YA,2020 FPTS,2020 OUTLOOK
27,28,Jaguars D/ST,47,10,9,2,383,6007,79.0,39,13,10,3,404,5834,68.07,No outlook available.
28,29,Falcons D/ST,28,12,8,3,375,5693,82.0,35,11,9,3,391,5949,57.26,No outlook available.
29,30,Washington D/ST,46,13,9,1,417,6162,65.0,37,12,8,3,415,5836,57.17,No outlook available.
30,31,Giants D/ST,36,10,6,3,431,6037,57.0,36,12,7,2,405,5979,51.68,No outlook available.
31,32,Panthers D/ST,53,14,7,1,464,5992,72.0,35,11,8,3,430,6044,46.82,No outlook available.


In [7]:
# show every column name as a plain Python list
df.columns.tolist()

['Unnamed: 0',
 'TEAM NAME',
 '2019 SCK',
 '2019 INT',
 '2019 FR',
 '2019 TD',
 '2019 PA',
 '2019 YA',
 '2019 FPTS',
 '2020 SCK',
 '2020 INT',
 '2020 FR',
 '2020 TD',
 '2020 PA',
 '2020 YA',
 '2020 FPTS',
 '2020 OUTLOOK']

### Begin cleaning the dataset by improving readability of column names and dropping unnecessary columns

In [9]:
# Remove the two columns not needed going forward:
# 'Unnamed: 0' and the free-text '2020 OUTLOOK'.

df = df.drop(columns=['Unnamed: 0', '2020 OUTLOOK'])

In [10]:
# report how many columns have been removed from the dataset

new_shape = df.shape[1] # column count after the drop

# shape_info[1] holds the column count captured before the drop
print('{} columns have been removed from the dataset'.format(abs(shape_info[1] - new_shape)))

2 columns have been removed from the dataset


In [11]:
# build cleaned column names from the current ones

df_old_cols = df.columns.tolist() # snapshot of the existing column names

# Strip every space, then apply str.capitalize(): the first character is upper-cased
# and the rest lower-cased (e.g. 'TEAM NAME' -> 'Teamname', '2019 SCK' -> '2019sck').
# NOTE(review): names are therefore not fully lowercase; confirm this is the intended casing.
df_new_cols = [old_name.replace(" ", "").capitalize() for old_name in df_old_cols]

In [12]:
df_new_cols[0] # spot-check the first cleaned name (built from 'TEAM NAME', the first remaining column)

'Teamname'

In [13]:
# replace the dataframe's column labels with the cleaned names
df.columns = df_new_cols

# preview the first five rows to confirm the rename took effect
df.head(n=5)

Unnamed: 0,Teamname,2019sck,2019int,2019fr,2019td,2019pa,2019ya,2019fpts,2020sck,2020int,2020fr,2020td,2020pa,2020ya,2020fpts
0,Steelers D/ST,54,20,18,4,285,4866,181.0,44,13,12,3,303,5443,137.08
1,Bills D/ST,44,14,9,1,251,4772,131.0,41,13,11,3,306,5352,134.16
2,49ers D/ST,48,12,15,5,296,4509,164.0,44,13,13,3,321,5489,124.74
3,Colts D/ST,41,15,8,3,355,5549,114.0,42,11,10,3,314,5446,122.57
4,Patriots D/ST,47,25,11,7,195,4414,225.0,41,14,8,2,314,5438,122.18


In [15]:
# preview the last five rows of the cleaned dataset
df.tail(n=5)

Unnamed: 0,Teamname,2019sck,2019int,2019fr,2019td,2019pa,2019ya,2019fpts,2020sck,2020int,2020fr,2020td,2020pa,2020ya,2020fpts
27,Jaguars D/ST,47,10,9,2,383,6007,79.0,39,13,10,3,404,5834,68.07
28,Falcons D/ST,28,12,8,3,375,5693,82.0,35,11,9,3,391,5949,57.26
29,Washington D/ST,46,13,9,1,417,6162,65.0,37,12,8,3,415,5836,57.17
30,Giants D/ST,36,10,6,3,431,6037,57.0,36,12,7,2,405,5979,51.68
31,Panthers D/ST,53,14,7,1,464,5992,72.0,35,11,8,3,430,6044,46.82


### Export the dataframe to a csv

In [16]:
# Write the cleaned dataframe to the cleaned-data fixtures folder.
# index=False keeps the default row index out of the file; without it the index is
# written as an unnamed first column that reads back in as 'Unnamed: 0' -- the same
# kind of column this notebook dropped from the raw data.
df.to_csv('../../fixtures/cleaned_data/cleaned_defense_stats_and_projections.csv', index=False)