# Data Preparation

In [1]:
# import relevant libraries
import pandas as pd
import numpy as np

# display options: a value of None lifts pandas' limit on shown columns / rows
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Load the cleaned Open 2019 data and keep only the columns whose name does
# not start with "Unnamed" (leftover index columns from an earlier CSV export).
df = (
    pd.read_csv('./data/19_clean.csv')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

In [3]:
# preview the first rows of the loaded frame
df.head()

Unnamed: 0,competitorid,firstname,lastname,gender,countryoforiginname,affiliatename,age,height,weight,overallrank,overallscore,is_scaled,division,rank_1,rank_2,rank_3,rank_4,rank_5,time_2,time_3,time_4,time_5,scaled_1,scaled_2,scaled_3,scaled_4,scaled_5,score_1,score_2,score_3,score_4,score_5,bs_backsquat,bs_cleanandjerk,bs_snatch,bs_deadlift,bs_fightgonebad,bs_maxpull_ups,bs_fran,bs_grace,bs_helen,bs_filthy50,bs_sprint400m,bs_run5k,w1_reps_total,w2_reps_total,w2_reps_t2b,w2_reps_du,w2_reps_sqcl,w2_rounds_completed,w2_tiebreak,w3_reps_total,w3_5ft_reps_ohl,w3_reps_dbbsu,w3_reps_hspu,w3_hspu_status,w3_5ft_reps_hsw,w3_rounds_completed,w3_tiebreak,w4_reps_total,w4_reps_sn,w4_reps_bp,w4_reps_bmu,w4_bmu_status,w4_tiebreak,w5_reps_total,w5_reps_thr,w5_reps_c2bpu,region,w2_full_rounds_completed,w3_full_rounds_completed,BMI
0,86,Justin,Bergh,M,United States,CrossFit HQ,37.0,1.96,102.06,53022,304948,0,Men,15232,58534,59298,85576,86308,480.0,600.0,720.0,1200.0,0,0,0,0,0,13040000,11670764,10980237,10720255,11350000,151.0,120.0,95.0,188.0,393.0,32.0,226.0,174.0,496.0,1287.0,64.0,1325.0,304.0,167.0,50.0,100.0,17.0,1.88,436.0,98.0,40.0,50.0,8.0,1.0,0.0,2.16,363.0,72.0,30.0,36.0,6.0,1.0,465.0,135.0,75.0,60.0,USA,< 8 min,HSPU,26.6
1,88,Cary,Hair,M,United States,CrossFit Santa Cruz,34.0,1.83,86.64,4361,37192,0,Men,5891,5456,7199,13060,5586,720.0,600.0,720.0,882.0,0,0,0,0,0,13240000,12620599,11330328,11170505,12100318,206.0,142.0,120.0,234.0,407.0,54.0,140.0,109.0,448.0,1103.0,54.0,1294.0,324.0,262.0,75.0,150.0,37.0,2.98,601.0,133.0,40.0,50.0,43.0,1.0,0.0,2.86,,117.0,30.0,60.0,27.0,1.0,215.0,210.0,105.0,105.0,USA,< 12 min,HSPU,25.9
2,92,Tim,Chan,M,United States,CrossFit SoCal,49.0,1.68,73.03,117405,564929,0,Men,155079,102548,127946,76177,103179,480.0,600.0,720.0,1200.0,0,0,0,0,0,11710000,11021027,10510000,10770250,11090000,102.0,92.0,70.0,138.0,,35.0,262.0,281.0,645.0,1661.0,,,171.0,102.0,37.0,50.0,15.0,1.14,173.0,51.0,40.0,11.0,0.0,0.0,0.0,1.22,600.0,77.0,30.0,37.0,10.0,1.0,470.0,109.0,60.0,49.0,USA,< 8 min,Box SU,25.9
3,93,Leif,Edmundson,M,United States,CrossFit HQ,37.0,1.83,92.99,38765,237798,0,Men,50223,39468,78855,40425,28827,480.0,600.0,720.0,1200.0,0,0,0,0,0,12670000,11720803,10920171,10970472,12010000,138.0,106.0,83.0,161.0,403.0,45.0,220.0,154.0,493.0,1500.0,62.0,1239.0,267.0,172.0,50.0,100.0,22.0,1.93,397.0,92.0,40.0,50.0,2.0,1.0,0.0,2.04,429.0,97.0,30.0,48.0,19.0,1.0,248.0,201.0,105.0,96.0,USA,< 8 min,HSPU,27.8
4,1617,John,Mclaughlin,M,United States,CrossFit Palm Beach,51.0,1.78,84.82,12050,89855,0,Men,12686,26138,26923,13629,10479,480.0,600.0,720.0,999.0,0,0,0,0,0,13080000,11760858,11130242,11170482,12100201,161.0,115.0,88.0,197.0,,50.0,146.0,154.0,444.0,,,,308.0,176.0,50.0,100.0,26.0,1.98,342.0,113.0,40.0,50.0,23.0,1.0,0.0,2.46,358.0,117.0,30.0,60.0,27.0,1.0,238.0,210.0,105.0,105.0,USA,< 8 min,HSPU,26.8


### Drop features/observations

In [4]:
# Columns carried forward into the prepared datasets, grouped by theme.
cols = [
    # athlete attributes, overall rank and division
    'gender', 'age', 'height', 'weight', 'overallrank', 'division',
    # scaled_* columns of workouts 1-5
    'scaled_1', 'scaled_2', 'scaled_3', 'scaled_4', 'scaled_5',
    # benchmark lifts
    'bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift',
    # results of workouts 19.1 and 19.2
    'w1_reps_total', 'w2_reps_t2b', 'w2_reps_du', 'w2_reps_sqcl',
    # results of workout 19.3
    'w3_5ft_reps_ohl', 'w3_reps_dbbsu', 'w3_reps_hspu', 'w3_hspu_status', 'w3_5ft_reps_hsw',
    # results of workout 19.4
    'w4_reps_sn', 'w4_reps_bp', 'w4_reps_bmu', 'w4_bmu_status',
    # result of workout 19.5
    'w5_reps_total',
]
# Restrict the frame to those columns, then discard rows without an age.
df = df[cols].dropna(subset=['age'])

In [5]:
# (rows, columns) left after the column selection and the age filter
df.shape

(338534, 29)

### Create new features

In [6]:
# Plain-list copies of the columns that are inspected for missing values below.
height, weight = (df[name].to_list() for name in ('height', 'weight'))

# one representative result column per workout
w1_reps, w2_reps, w3_reps, w4_reps, w5_reps = (
    df[name].to_list()
    for name in ('w1_reps_total', 'w2_reps_t2b', 'w3_5ft_reps_ohl',
                 'w4_reps_sn', 'w5_reps_total')
)

# the four benchmark lifts
lift_1, lift_2, lift_3, lift_4 = (
    df[name].to_list()
    for name in ('bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift')
)

In [7]:
# start every indicator list empty (seven distinct list objects)
miss_hw, miss_lift = [], []
parti_1, parti_2, parti_3, parti_4, parti_5 = ([] for _ in range(5))

In [8]:
def _any_nan(*columns):
    """Return a boolean array that is True at every position where at least
    one of the given, equally long, sequences holds NaN."""
    return np.logical_or.reduce(
        [np.isnan(np.asarray(values, dtype=float)) for values in columns]
    )


# Every list is rebuilt from scratch (not appended to), so running this cell
# more than once cannot make the lists longer than the frame they are later
# assigned to.

# miss_hw: 1 when height or weight is missing, else 0
miss_hw = _any_nan(height, weight).astype(int).tolist()

# parti_N: 1 when the inspected result column of workout N holds a value,
# 0 when it is NaN
parti_1 = (~_any_nan(w1_reps)).astype(int).tolist()
parti_2 = (~_any_nan(w2_reps)).astype(int).tolist()
parti_3 = (~_any_nan(w3_reps)).astype(int).tolist()
parti_4 = (~_any_nan(w4_reps)).astype(int).tolist()
parti_5 = (~_any_nan(w5_reps)).astype(int).tolist()

# miss_lift: 1 when at least one of the four benchmark lifts is missing, else 0
miss_lift = _any_nan(lift_1, lift_2, lift_3, lift_4).astype(int).tolist()

In [9]:
# Attach the indicator lists as new columns, in this order.
indicator_columns = {
    'miss_hw': miss_hw,
    'parti_1': parti_1,
    'parti_2': parti_2,
    'parti_3': parti_3,
    'parti_4': parti_4,
    'parti_5': parti_5,
    'miss_lift': miss_lift,
}
for column_name, flag_values in indicator_columns.items():
    df[column_name] = flag_values

### Cleaning

In [10]:
# The results are assigned back to the frame instead of calling
# `df[col].replace(..., inplace=True)`: that form mutates a temporary Series
# (chained assignment) and is not guaranteed to reach `df` on recent pandas
# versions. It also relied on the `np.NaN` alias, which NumPy 2.0 removed.
df = df.assign(
    # set gender to 1/0 ('M' -> 1, 'F' -> 0; any other value is left untouched)
    gender=df['gender'].replace(['M', 'F'], [1, 0]),
    # set hspu_status to 1/0; set NaN to 0 (not participated)
    w3_hspu_status=df['w3_hspu_status'].fillna(0).astype(int),
    # set bmu_status to 1/0; set NaN to 0 (not participated)
    w4_bmu_status=df['w4_bmu_status'].fillna(0).astype(int),
)

### Imputation

#### Workout Results

In [11]:
# Result columns whose missing values are set to 0.
workout_result_cols = [
    # 19.1
    'w1_reps_total',
    # 19.2
    'w2_reps_t2b', 'w2_reps_du', 'w2_reps_sqcl',
    # 19.3
    'w3_5ft_reps_ohl', 'w3_reps_dbbsu', 'w3_reps_hspu', 'w3_5ft_reps_hsw',
    # 19.4
    'w4_reps_sn', 'w4_reps_bp', 'w4_reps_bmu',
    # 19.5
    'w5_reps_total',
]
# One fillna over a {column: 0} mapping, assigned back to `df`. The former
# per-column `df[col].replace([np.NaN], [0], inplace=True)` was a chained
# inplace call (not guaranteed to modify `df` on recent pandas versions) and
# used the `np.NaN` alias that NumPy 2.0 removed. Columns outside the mapping
# are left as they are.
df = df.fillna({column: 0 for column in workout_result_cols})

#### Body Measurements

Missing values of the features height and weight will be imputed in the preprocessing pipeline of the Modelling notebooks.

#### Benchmark Statistics

Missing values of the lifting benchmarks will be imputed in the preprocessing pipeline of the Modelling notebooks.

---
## Create Dataframes for Modelling

### Define base dataframes

In [12]:
# `drop` is called with `columns=` throughout: passing the axis as a second
# positional argument (`drop([...], 1)`) raises a TypeError since pandas 2.0.

# define dataset for regression model: rows of the 'Men' and 'Women' divisions
# only, without the division column itself
df_rank = df[df['division'].isin(['Men', 'Women'])].drop(columns=['division'])

# define dataset for HSPU-classification model: w3_hspu_status stays in the
# frame, the other 19.3 result columns, 'scaled_3', the rank and the BMU
# status are removed
df_hspu = df.drop(columns=[
    'division', 'overallrank', 'w4_bmu_status',
    'w3_5ft_reps_ohl', 'w3_reps_dbbsu', 'w3_reps_hspu', 'w3_5ft_reps_hsw',
    'scaled_3',
])

# define dataset for BMU-classification model: w4_bmu_status stays in the
# frame, the other 19.4 result columns, the rank and the HSPU status are removed
# NOTE(review): 'scaled_3' is dropped from df_hspu but 'scaled_4' is kept
# here - confirm that this asymmetry is intended.
df_bmu = df.drop(columns=[
    'division', 'overallrank', 'w3_hspu_status',
    'w4_reps_sn', 'w4_reps_bp', 'w4_reps_bmu',
])

#### 01 - 19 - Open 19 complete case analysis

In [13]:
# Variant 01: rows with parti_1 ... parti_5 all equal to 1, without body
# measurements, benchmark lifts and the helper indicator columns.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']
unused_cols = (
    ['height', 'weight',
     'bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift',
     'miss_hw']
    + parti_cols
    + ['miss_lift']
)

# The same filter + drop is applied to each of the three base frames.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19, df_hspu_19, df_bmu_19 = (
    frame[frame[parti_cols].eq(1).all(axis=1)].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 02 - 19imp - Open 19 dataset with imputed values (zero)

In [14]:
# Variant 02: all rows (missing workout results were set to 0 earlier),
# without body measurements, benchmark lifts and their missing-value flags.
unused_cols = ['height', 'weight',
               'bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift',
               'miss_hw',
               'miss_lift']

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19imp, df_hspu_19imp, df_bmu_19imp = (
    frame.drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 03 - 19_hw - Open 19 complete case analysis with heights & weights

In [15]:
# Variant 03: rows with parti_1 ... parti_5 all equal to 1; height, weight and
# miss_hw are kept, benchmark lifts and the remaining helper columns removed.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']
unused_cols = (
    ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift']
    + parti_cols
    + ['miss_lift']
)

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19_hw, df_hspu_19_hw, df_bmu_19_hw = (
    frame[frame[parti_cols].eq(1).all(axis=1)].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 04 - 19imp_hw - Open 19 dataset with imputed values (zero) with heights & weights

In [16]:
# Variant 04: all rows; height, weight and miss_hw are kept, the benchmark
# lifts and their missing-value flag are removed.
unused_cols = ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift',
               'miss_lift']

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19imp_hw, df_hspu_19imp_hw, df_bmu_19imp_hw = (
    frame.drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 05 - 19_bs - Open 19 complete case analysis with olympic lifts

In [17]:
# Variant 05: rows with parti_1 ... parti_5 all equal to 1 AND all four
# benchmark lifts present; the lifts are kept, body measurements and helper
# indicator columns are removed.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']
lift_cols = ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift']
unused_cols = ['height', 'weight'] + parti_cols + ['miss_hw', 'miss_lift']

# Both row conditions are combined into a single mask, which selects the same
# rows as applying them one after the other.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19_bs, df_hspu_19_bs, df_bmu_19_bs = (
    frame[
        frame[parti_cols].eq(1).all(axis=1)
        & frame[lift_cols].notna().all(axis=1)
    ].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 06 - 19imp_bs - Open 19 dataset with imputed values (zero) with olympic lifts

In [18]:
# Variant 06: rows with all four benchmark lifts present (no participation
# filter); the lifts and parti_* columns are kept, body measurements and the
# missing-value flags are removed.
lift_cols = ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift']
unused_cols = ['height', 'weight', 'miss_hw', 'miss_lift']

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19imp_bs, df_hspu_19imp_bs, df_bmu_19imp_bs = (
    frame[frame[lift_cols].notna().all(axis=1)].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 07 - 19_hw_bs - Open 19 complete case analysis with heights&weights and olympic lifts

In [19]:
# Variant 07: rows with parti_1 ... parti_5 all equal to 1 AND all four
# benchmark lifts present; height, weight, miss_hw and the lifts are kept.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']
lift_cols = ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift']
unused_cols = parti_cols + ['miss_lift']

# Both row conditions are combined into a single mask, which selects the same
# rows as applying them one after the other.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19_hw_bs, df_hspu_19_hw_bs, df_bmu_19_hw_bs = (
    frame[
        frame[parti_cols].eq(1).all(axis=1)
        & frame[lift_cols].notna().all(axis=1)
    ].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 08 - 19imp_hw_bs - Open 19 dataset with imputed values (zero) with heights&weights and olympic lifts

In [20]:
# Variant 08: rows with all four benchmark lifts present (no participation
# filter); only the miss_lift flag is removed.
lift_cols = ['bs_backsquat', 'bs_cleanandjerk', 'bs_snatch', 'bs_deadlift']

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19imp_hw_bs, df_hspu_19imp_hw_bs, df_bmu_19imp_hw_bs = (
    frame[frame[lift_cols].notna().all(axis=1)].drop(columns=['miss_lift'])
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 09 - 19_bsimp - Open 19 complete case analysis with olympic lifts (to be imputed)

In [21]:
# Variant 09: rows with parti_1 ... parti_5 all equal to 1; the benchmark
# lifts (possibly missing) and miss_lift are kept, body measurements, miss_hw
# and the parti_* columns are removed.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']
unused_cols = ['height', 'weight', 'miss_hw'] + parti_cols

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19_bsimp, df_hspu_19_bsimp, df_bmu_19_bsimp = (
    frame[frame[parti_cols].eq(1).all(axis=1)].drop(columns=unused_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 10 - 19imp_bsimp - Open 19 dataset with imputed values (zero) with olympic lifts (to be imputed)

In [22]:
# Variant 10: all rows; only height, weight and miss_hw are removed, so the
# benchmark lifts (possibly missing) and miss_lift stay in the frame.
# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19imp_bsimp, df_hspu_19imp_bsimp, df_bmu_19imp_bsimp = (
    frame.drop(columns=['height', 'weight', 'miss_hw'])
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 11 - 19_hw_bsimp - Open 19 complete case analysis with heights&weights and olympic lifts (to be imputed)

In [23]:
# Variant 11: rows with parti_1 ... parti_5 all equal to 1; only the parti_*
# columns are removed, everything else (body measurements, benchmark lifts,
# miss_hw, miss_lift) is kept.
parti_cols = ['parti_1', 'parti_2', 'parti_3', 'parti_4', 'parti_5']

# `columns=` replaces the positional axis argument (`drop([...], 1)`), which
# raises a TypeError since pandas 2.0.
df_rank_19_hw_bsimp, df_hspu_19_hw_bsimp, df_bmu_19_hw_bsimp = (
    frame[frame[parti_cols].eq(1).all(axis=1)].drop(columns=parti_cols)
    for frame in (df_rank, df_hspu, df_bmu)
)

#### 12 - 19imp_hw_bsimp - Open 19 dataset with imputed values (zero) with heights&weights and olympic lifts (to be imputed)

In [24]:
# Variant 12 keeps every row and column: the three names below are bound to
# the very same objects as df_rank / df_hspu / df_bmu (no copy is made).
df_rank_19imp_hw_bsimp, df_hspu_19imp_hw_bsimp, df_bmu_19imp_hw_bsimp = (
    df_rank, df_hspu, df_bmu
)

---
## Save Prepared Dataframes

In [25]:
# Output path -> prepared frame, listed in the order the files are written.
prepared_frames = {
    # regression (rank) datasets
    './data/prep_rank_01_19.csv': df_rank_19,
    './data/prep_rank_02_19imp.csv': df_rank_19imp,
    './data/prep_rank_03_19_hw.csv': df_rank_19_hw,
    './data/prep_rank_04_19imp_hw.csv': df_rank_19imp_hw,
    './data/prep_rank_05_19_bs.csv': df_rank_19_bs,
    './data/prep_rank_06_19imp_bs.csv': df_rank_19imp_bs,
    './data/prep_rank_07_19_hw_bs.csv': df_rank_19_hw_bs,
    './data/prep_rank_08_19imp_hw_bs.csv': df_rank_19imp_hw_bs,
    './data/prep_rank_09_19_bsimp.csv': df_rank_19_bsimp,
    './data/prep_rank_10_19imp_bsimp.csv': df_rank_19imp_bsimp,
    './data/prep_rank_11_19_hw_bsimp.csv': df_rank_19_hw_bsimp,
    './data/prep_rank_12_19imp_hw_bsimp.csv': df_rank_19imp_hw_bsimp,
    # HSPU-classification datasets
    './data/prep_hspu_01_19.csv': df_hspu_19,
    './data/prep_hspu_02_19imp.csv': df_hspu_19imp,
    './data/prep_hspu_03_19_hw.csv': df_hspu_19_hw,
    './data/prep_hspu_04_19imp_hw.csv': df_hspu_19imp_hw,
    './data/prep_hspu_05_19_bs.csv': df_hspu_19_bs,
    './data/prep_hspu_06_19imp_bs.csv': df_hspu_19imp_bs,
    './data/prep_hspu_07_19_hw_bs.csv': df_hspu_19_hw_bs,
    './data/prep_hspu_08_19imp_hw_bs.csv': df_hspu_19imp_hw_bs,
    './data/prep_hspu_09_19_bsimp.csv': df_hspu_19_bsimp,
    './data/prep_hspu_10_19imp_bsimp.csv': df_hspu_19imp_bsimp,
    './data/prep_hspu_11_19_hw_bsimp.csv': df_hspu_19_hw_bsimp,
    './data/prep_hspu_12_19imp_hw_bsimp.csv': df_hspu_19imp_hw_bsimp,
    # BMU-classification datasets
    './data/prep_bmu_01_19.csv': df_bmu_19,
    './data/prep_bmu_02_19imp.csv': df_bmu_19imp,
    './data/prep_bmu_03_19_hw.csv': df_bmu_19_hw,
    './data/prep_bmu_04_19imp_hw.csv': df_bmu_19imp_hw,
    './data/prep_bmu_05_19_bs.csv': df_bmu_19_bs,
    './data/prep_bmu_06_19imp_bs.csv': df_bmu_19imp_bs,
    './data/prep_bmu_07_19_hw_bs.csv': df_bmu_19_hw_bs,
    './data/prep_bmu_08_19imp_hw_bs.csv': df_bmu_19imp_hw_bs,
    './data/prep_bmu_09_19_bsimp.csv': df_bmu_19_bsimp,
    './data/prep_bmu_10_19imp_bsimp.csv': df_bmu_19imp_bsimp,
    './data/prep_bmu_11_19_hw_bsimp.csv': df_bmu_19_hw_bsimp,
    './data/prep_bmu_12_19imp_hw_bsimp.csv': df_bmu_19imp_hw_bsimp,
}

# Write each frame with to_csv defaults (the row index is included as the
# first column of the file).
for csv_path, prepared in prepared_frames.items():
    prepared.to_csv(csv_path)