# Data exploration and assessment - OPL

## Section 1: exploring the main data set

In [1]:
# Download the data set zip
!wget https://github.com/sstangl/openpowerlifting-static/raw/gh-pages/openpowerlifting-latest.zip

--2020-01-25 16:12:05--  https://github.com/sstangl/openpowerlifting-static/raw/gh-pages/openpowerlifting-latest.zip
Loaded CA certificate '/etc/ssl/certs/ca-certificates.crt'
Resolving github.com (github.com)... 140.82.118.3
Connecting to github.com (github.com)|140.82.118.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/sstangl/openpowerlifting-static/gh-pages/openpowerlifting-latest.zip [following]
--2020-01-25 16:12:06--  https://raw.githubusercontent.com/sstangl/openpowerlifting-static/gh-pages/openpowerlifting-latest.zip
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.60.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.60.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 69565456 (66M) [application/zip]
Saving to: ‘openpowerlifting-latest.zip’


2020-01-25 16:12:18 (5.88 MB/s) - ‘openpowerlifting-latest.zip’ saved [69565456/695654

In [4]:
# unzip the csv
!unzip openpowerlifting-latest.zip

Archive:  openpowerlifting-latest.zip
   creating: openpowerlifting-2020-01-03/
  inflating: openpowerlifting-2020-01-03/LICENSE.txt  
  inflating: openpowerlifting-2020-01-03/openpowerlifting-2020-01-03.csv  
  inflating: openpowerlifting-2020-01-03/README.txt  


In [58]:
# import pandas and numpy
import pandas as pd
import numpy as np

In [59]:
# Running list of findings, one plain string per finding. Later cells append to
# it as the exploration progresses, and a later cell prints it as a bullet list.
# NOTE(review): re-running an appending cell without re-running this one adds a duplicate entry.

notes = []

In [60]:
# Read the extracted CSV into the main data frame.
# low_memory=False: parse the file in one pass instead of in chunks, so a column
# does not end up with mixed inferred types.
df_opl = pd.read_csv(
    'openpowerlifting-2020-01-03/openpowerlifting-2020-01-03.csv',
    low_memory=False,
)

In [4]:
# Record the number of rows (formatted with thousands separators) in notes

notes.append(f"Row count: {len(df_opl):,}")

# Display the notes collected so far
notes

['Row count: 1,731,478']

In [5]:
# List the names of the columns present in the data
df_opl.columns

Index(['Name', 'Sex', 'Event', 'Equipment', 'Age', 'AgeClass',
       'BirthYearClass', 'Division', 'BodyweightKg', 'WeightClassKg',
       'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg', 'Best3SquatKg',
       'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg', 'Best3BenchKg',
       'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg', 'Deadlift4Kg',
       'Best3DeadliftKg', 'TotalKg', 'Place', 'Wilks', 'McCulloch',
       'Glossbrenner', 'IPFPoints', 'Tested', 'Country', 'Federation', 'Date',
       'MeetCountry', 'MeetState', 'MeetName'],
      dtype='object')

In [6]:
# Skim the first five rows to get an idea of the values held in each column.
# The data set is too wide to fit on the page, so it is viewed in two halves;
# col_count is the column position at which the frame is split.
col_count = len(df_opl.columns) // 2

df_opl.head().iloc[:, :col_count]

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,Squat1Kg,Squat2Kg,Squat3Kg,Squat4Kg,Best3SquatKg,Bench1Kg,Bench2Kg,Bench3Kg,Bench4Kg
0,Abbie Murphy,F,SBD,Wraps,29.0,24-34,,F-OR,59.8,60,80.0,92.5,105.0,,105.0,45.0,50.0,55.0,
1,Abbie Tuong,F,SBD,Wraps,29.0,24-34,,F-OR,58.5,60,100.0,110.0,120.0,,120.0,55.0,62.5,67.5,
2,Ainslee Hooper,F,B,Raw,40.0,40-44,40-49,F-OR,55.4,56,,,,,,27.5,32.5,-35.0,
3,Amy Moldenhauer,F,SBD,Wraps,23.0,20-23,,F-OR,60.0,60,-105.0,-105.0,105.0,,105.0,67.5,72.5,-75.0,
4,Andrea Rowan,F,SBD,Wraps,45.0,45-49,40-49,F-OR,104.0,110,120.0,130.0,140.0,,140.0,70.0,75.0,80.0,


In [7]:
# Second half of the columns (from position col_count onwards), same first five rows
df_opl.head().iloc[:, col_count:]

Unnamed: 0,Best3BenchKg,Deadlift1Kg,Deadlift2Kg,Deadlift3Kg,Deadlift4Kg,Best3DeadliftKg,TotalKg,Place,Wilks,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date,MeetCountry,MeetState,MeetName
0,55.0,110.0,120.0,130.0,,130.0,290.0,4,324.16,324.16,286.42,511.15,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
1,67.5,130.0,140.0,145.0,,145.0,332.5,2,378.07,378.07,334.16,595.65,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
2,32.5,,,,,,32.5,1,38.56,38.56,34.12,313.97,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
3,72.5,132.5,-140.0,-140.0,,132.5,310.0,3,345.61,345.61,305.37,547.04,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup
4,80.0,150.0,160.0,170.0,,170.0,390.0,3,321.25,338.91,274.56,550.08,,,GPC-AUS,2018-10-27,Australia,VIC,Melbourne Cup


In [8]:
# Which distinct dtypes did pandas infer across all columns?
df_opl.dtypes.unique()

array([dtype('O'), dtype('float64')], dtype=object)

In [129]:
# Examine float columns: list the names of the columns pandas typed as float

df_opl.select_dtypes(include='float').columns

Index(['Age', 'BodyweightKg', 'Squat1Kg', 'Squat2Kg', 'Squat3Kg', 'Squat4Kg',
       'Best3SquatKg', 'Bench1Kg', 'Bench2Kg', 'Bench3Kg', 'Bench4Kg',
       'Best3BenchKg', 'Deadlift1Kg', 'Deadlift2Kg', 'Deadlift3Kg',
       'Deadlift4Kg', 'Best3DeadliftKg', 'TotalKg', 'Wilks', 'McCulloch',
       'Glossbrenner', 'IPFPoints'],
      dtype='object')

In [9]:
# Add these column names to notes as one comma-separated entry; they are expected to be float columns
notes.append(f"Float columns: {','.join(df_opl.select_dtypes(include='float').columns)}")

In [10]:
# Examine object columns: list the names of the columns pandas typed as object
df_opl.select_dtypes(include='object').columns

Index(['Name', 'Sex', 'Event', 'Equipment', 'AgeClass', 'BirthYearClass',
       'Division', 'WeightClassKg', 'Place', 'Tested', 'Country', 'Federation',
       'Date', 'MeetCountry', 'MeetState', 'MeetName'],
      dtype='object')

In [11]:
# Some object-typed columns were expected to contain text, others were expected
# to be numeric. Set the expected-numeric ones aside for closer examination later
# and write the remainder to notes to confirm as text columns in the data model.

expected_numeric = ['AgeClass', 'BirthYearClass','WeightClassKg', 'Place']

text_cols = set(df_opl.select_dtypes(include='object').columns) - set(expected_numeric)

# Sort before joining: iteration order of a set of strings can differ between
# kernel sessions, which would make the recorded note non-reproducible.
notes.append(f"Text columns: {','.join(sorted(text_cols))}")

In [12]:
# Most dtypes make sense, but some columns are typed as object although their
# names suggest numeric values. Take a random sample of those columns to check.
# random_state pins the sample so the displayed rows are the same on every run.

df_opl[expected_numeric].sample(n=15, random_state=42)

Unnamed: 0,AgeClass,BirthYearClass,WeightClassKg,Place
51709,24-34,24-39,90,DQ
775213,18-19,19-23,57,1
1695528,55-59,,67.5,1
1074629,24-34,24-39,90,11
1192620,,,67.3,10
1209539,,,100,3
324799,,,59.8,9
772243,45-49,40-49,120,2
1232308,,,75,18
50288,13-15,14-18,67.5,DQ


In [13]:
notes.append('AgeClass, BirthYearClass are ranges which can be replaced with AgeClassFrom,AgeClassTo and BirthYearClassFrom, BirthYearClassTo to faciliate filtering')
# Take a closer look at Place to find non-numeric values.
# dropna() and str() guard against a missing Place: NaN is a float and has no
# .isdigit(), so it would otherwise raise AttributeError.

[place for place in df_opl['Place'].dropna().unique() if not str(place).isdigit()]

['DQ', 'G', 'NS', 'DD']

In [14]:
# Non-numeric values exist for Place, so the final model will hold two columns:
# place_numeric (integer place, or NaN / NULL) and a text column for the
# non-place codes, spelling out what each is short for, e.g. DQ = Disqualified.

notes.append('Place will be split into 2. place_numeric to hold numeric place. Where the lifter did not place this will be set NULL. We will need a way to record the non-place information such as DQ,DD,NS and G')

In [15]:
# List the distinct WeightClassKg values to spot the non-numeric ones

df_opl['WeightClassKg'].unique()

array(['60', '56', '110', '75', '82.5', '52', '67.5', '90', '110+', '125',
       '100', '140', '140+', '48', '90+', '44', nan, '63', '72', '84',
       '93', '105', '120', '120+', '74', '83', '47', '57', '84+', '66',
       '59', '53', '125+', '43', '+', '90.7', '90.7+', '95', '80', '100+',
       '36', '40', '46', '49', '75+', '82.5+', '52+', '95.2', '61', '82',
       '109', '109+', '145', '145+', '72+', '93+', '60+', '50', '65',
       '80+', '67.5+', '105+', '63+', '35', '155', '39', '155+', '68',
       '118', '30', '70', '58', '64', '85', '94', '77', '69', '62', '107',
       '88', '130', '34', '36.2', '45.3', '136', '136+', '51.7', '55.7',
       '59.8', '67.1', '74.8', '82.1', '89.8', '99.7', '109.7', '124.7',
       '124.7+', '47.5', '58.5', '50.5', '55.5', '101+', '102+', '103+',
       '104+', '54', '85+', '77.5', '103', '113.5', '127', '143', '143+',
       '53+', '56+', '68.5', '69.5', '70.5', '58.9', '72.5', '72.5+',
       '41', '45', '55', '67', '73', '79', '97', '86',

In [16]:
# Record the WeightClassKg finding: the values mix plain numbers, numbers with a
# trailing '+', a bare '+' and missing values, so the column needs cleansing.
notes.append('WeightClassKg will require some data cleansing and standardisation to turn this data into two fields WeightClassFrom, WeightClassTo to make this data more useful')

In [25]:
# Check dates are valid dates.
# Catch only the parse failure (pandas raises ValueError, or a subclass of it,
# for an unparseable or out-of-range date). A bare except would also swallow
# KeyboardInterrupt and real bugs such as a missing 'Date' column, and report
# them as invalid dates.
try:
    pd.to_datetime(df_opl['Date'])
except ValueError:
    print('Invalid dates exist')

Invalid dates exist


In [55]:
# Locate the invalid dates

## Work on the distinct raw date strings only, in a separate frame, and add a
## parsed column next to them; errors='coerce' turns anything unparseable into NaT
df_dates = (
    df_opl[['Date']]
    .drop_duplicates()
    .rename(columns={'Date': 'raw_date'})
    .assign(parsed_date=lambda frame: pd.to_datetime(frame['raw_date'], errors='coerce'))
)

## rows whose parsed value is NaT are the invalid dates
df_dates[df_dates['parsed_date'].isna()]

Unnamed: 0,raw_date,parsed_date
154849,2018-04-31,NaT


In [56]:
# Record the one date that failed to parse (April has only 30 days) so it can be corrected

notes.append('Date: contains an invalid date 2018-04-31 which will need to be corrected')

In [83]:
# Unique key checks

## First check whether any fully duplicated rows exist: count the rows that
## repeat an earlier row. duplicated() already returns a boolean mask, so sum it
## directly instead of comparing it to True and building a filtered copy of the
## frame just to take its length.

int(df_opl.duplicated().sum())

Unnamed: 0,Name,Sex,Event,Equipment,Age,AgeClass,BirthYearClass,Division,BodyweightKg,WeightClassKg,...,McCulloch,Glossbrenner,IPFPoints,Tested,Country,Federation,Date,MeetCountry,MeetState,MeetName
405,Emily Knauth,F,SBD,Multi-ply,28.0,24-34,,F-OE,80.80,82.5,...,329.88,289.13,461.36,,Australia,GPC-AUS,2015-05-19,Australia,,Nationals
406,John Sheridan,M,SBD,Multi-ply,26.0,24-34,24-39,M-OE,136.50,140,...,493.66,470.07,555.98,,Australia,GPC-AUS,2015-05-19,Australia,,Nationals
407,Josh Vale,M,SBD,Multi-ply,26.0,24-34,24-39,M-OE,81.20,82.5,...,366.95,353.33,470.47,,,GPC-AUS,2015-05-19,Australia,,Nationals
408,Ron Birch,M,SBD,Multi-ply,70.0,70-74,,M-OE,98.70,100,...,503.23,292.26,393.71,,Australia,GPC-AUS,2015-05-19,Australia,,Nationals
409,Shane Atta-Singh,M,SBD,Multi-ply,31.0,24-34,24-39,M-OE,165.00,140+,...,491.11,459.20,523.21,,Australia,GPC-AUS,2015-05-19,Australia,,Nationals
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1720168,Michael Staub,M,SBD,Wraps,,50-54,50-59,Amateur Masters 50-54,88.36,90,...,363.38,308.45,461.15,Yes,,RPS,2015-01-31,USA,KY,Northern KY Winter Warfare
1724179,Lorri Manning,F,BD,Raw,47.5,45-49,40-49,Police Amateur Open,55.25,56,...,233.44,190.90,,Yes,USA,RPS,2018-01-21,USA,MA,2nd Annual Worchester County Strength Bounty
1724365,Jeremy Velazquez,M,SBD,Wraps,,,,Amateur Submasters 33-39,89.90,90,...,,,,Yes,USA,RPS,2014-11-15,USA,NY,InsurreXtion III
1726906,Benjamin Eppley,M,SBD,Wraps,,20-23,,Amateur Juniors 20-23,87.91,90,...,326.85,313.58,470.25,Yes,,RPS,2015-04-18,USA,PA,17th Annual PowerPalooza


In [84]:
# Remove fully duplicated rows, keeping the first occurrence of each.
# Note: this rebinds df_opl, so outputs displayed by earlier cells describe the
# frame as it was before de-duplication.

df_opl = df_opl.drop_duplicates()

In [87]:
# Check duplicates have now been removed: the count of repeated rows should be 0.
# Sum the boolean mask directly rather than comparing it to True and filtering.

int(df_opl.duplicated().sum())

0

In [96]:
# Print the collected notes as a bullet list, one "- " bullet per note.
# Prefixing each note individually means an empty notes list no longer prints a
# stray "- " bullet; output for a non-empty list is unchanged.
print("\n".join(f"- {note}" for note in notes))

- 
