In [28]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

# Load the pre-split feature tables (X_*) and target tables (y_*) from the data folder.
X_train, X_test = pd.read_csv("../data/X_train.csv"), pd.read_csv("../data/X_test.csv")
y_train, y_test = pd.read_csv("../data/y_train.csv"), pd.read_csv("../data/y_test.csv")

# Summary statistics for the numeric training features (last expression is displayed).
X_train.describe()



Unnamed: 0,playerId,Age,BodyweightKg,BestDeadliftKg
count,18900.0,18725.0,18900.0,18900.0
mean,15039.49963,29.6647,85.425557,201.12277
std,8674.67268,11.55708,22.95972,62.17163
min,0.0,7.0,26.13,18.1
25%,7462.75,21.5,67.7,149.8575
50%,15122.5,26.5,82.1,204.12
75%,22540.25,35.0,98.97,247.5
max,29998.0,83.0,201.0,408.23


In [8]:
# Column dtypes and non-null counts: used to spot missing values and columns
# that were not parsed as numeric.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18900 entries, 0 to 18899
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   playerId        18900 non-null  float64
 1   Name            18900 non-null  object 
 2   Sex             18900 non-null  object 
 3   Equipment       18900 non-null  object 
 4   Age             18725 non-null  float64
 5   BodyweightKg    18900 non-null  float64
 6   BestSquatKg     18900 non-null  object 
 7   BestDeadliftKg  18900 non-null  float64
dtypes: float64(4), object(4)
memory usage: 1.2+ MB


In [18]:
# Frequency of each distinct Age value in the training set, to see which ages are rare.
X_train['Age'].value_counts()

Age
23.0    662
24.0    598
26.0    575
22.0    570
21.0    525
       ... 
79.5      1
7.0       1
81.0      1
81.5      1
83.0      1
Name: count, Length: 148, dtype: int64

In [19]:
# Frequency of each Sex category, to check how balanced the two groups are.
X_train['Sex'].value_counts()

Sex
M    12771
F     6129
Name: count, dtype: int64

In [21]:
# BestSquatKg is reported as object dtype by info() and is missing from the
# describe() table, so its values are inspected directly here.
X_train['BestSquatKg'].value_counts()

BestSquatKg
182.5     364
200.0     358
160.0     313
205.0     312
170.0     306
         ... 
-235.0      1
366.0       1
-290.0      1
153.0       1
231.0       1
Name: count, Length: 626, dtype: int64

In [22]:
# Frequency of each distinct BestDeadliftKg value in the training set.
X_train['BestDeadliftKg'].value_counts()

BestDeadliftKg
227.50    514
250.00    390
215.00    357
240.00    342
200.00    333
         ... 
250.02      1
234.96      1
245.50      1
258.50      1
166.00      1
Name: count, Length: 492, dtype: int64

In [24]:
# Frequency of each distinct BodyweightKg value in the training set.
X_train['BodyweightKg'].value_counts()

BodyweightKg
82.500     91
90.000     88
67.500     74
75.000     73
81.600     67
           ..
129.770     1
92.020      1
97.860      1
74.162      1
51.160      1
Name: count, Length: 3991, dtype: int64

In [25]:
# Frequency of each Equipment category in the training set.
X_train['Equipment'].value_counts()

Equipment
Raw           12277
Wraps          3965
Single-ply     2317
Multi-ply       341
Name: count, dtype: int64

In [36]:
# Summary statistics (count, mean, spread, quartiles, min/max) for BodyweightKg alone.
X_train['BodyweightKg'].describe()

count    18900.000000
mean        85.425557
std         22.959720
min         26.130000
25%         67.700000
50%         82.100000
75%         98.970000
max        201.000000
Name: BodyweightKg, dtype: float64

### Data Analysis for cleanup/preparation

Based on the data summaries above, only the 'Age' column has null values (approx. 175 entries) that need to be imputed or removed. The dataset may also need to be cleaned up based on age because some ages appear to be under-represented. The same is true for sex, so I'm planning on using the sample_weight parameter for the 'Sex' and 'Age' features. Lastly, the negative weight entries should be removed from the 'BestSquatKg' column.


Additional/Missing cleanup steps:

* Negative and 0 values in BestBenchKg, BestDeadliftKg, BestSquatKg, and BodyweightKg.
* Extremely High/Low values in the weight features
* Age imputation options: mean/median imputation, imputation based on weight class/performance, or removing the affected rows instead.
* Feature Engineering
    - Wilks/DOTS score (strength relative to bodyweight)
    - Total = Squat + Bench + Deadlift
    - Age group/bins instead of raw age
    - Bodyweight categories (weight classes)

In [None]:
# Cleanup data: based on X_train.info(), only the Age column has null values.
# Rows with a missing Age are dropped from the features, and the same rows are
# dropped from the targets so that X and y stay row-aligned.

# The masks are built from X_* but also applied to y_*; that is only correct
# when both frames share the same index, so fail loudly if they do not.
assert X_train.index.equals(y_train.index), "X_train and y_train are not row-aligned"
assert X_test.index.equals(y_test.index), "X_test and y_test are not row-aligned"

# clean training data
train_null_mask = X_train['Age'].notna()  # True where Age is present
# .copy() makes the cleaned frames independent objects, so later column
# assignments on them do not trigger SettingWithCopyWarning.
X_train_clean = X_train.loc[train_null_mask].copy()
y_train_clean = y_train.loc[train_null_mask].copy()

# clean test data
test_null_mask = X_test['Age'].notna()  # True where Age is present
X_test_clean = X_test.loc[test_null_mask].copy()
y_test_clean = y_test.loc[test_null_mask].copy()

In [13]:
# Name column isn't useful, so remove it from both splits.
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result has to be assigned back or the column is never actually removed.
# errors='ignore' keeps the cell safe to re-run once Name is already gone.
X_train_clean = X_train_clean.drop(columns=['Name'], errors='ignore')
X_test_clean = X_test_clean.drop(columns=['Name'], errors='ignore')

# Display the cleaned test features (same table the cell showed before).
X_test_clean

Unnamed: 0,playerId,Sex,Equipment,Age,BodyweightKg,BestSquatKg,BestDeadliftKg
0,2308.0,M,Wraps,30.0,89.81,179.17,192.78
1,22404.0,M,Raw,27.0,74.95,125.00,165.00
2,23397.0,M,Wraps,30.0,122.61,290.00,317.50
3,25058.0,M,Raw,20.0,88.00,187.50,240.00
4,2664.0,M,Wraps,30.0,122.00,235.01,275.01
...,...,...,...,...,...,...,...
11095,3104.0,M,Wraps,28.0,98.20,270.00,315.00
11096,10239.0,F,Raw,36.0,58.80,92.50,137.50
11097,29044.0,M,Raw,22.5,82.62,250.00,311.00
11098,29947.0,F,Wraps,28.0,74.30,105.00,152.50


In [15]:
LBS_PER_KG = 2.20462  # conversion factor: pounds per kilogram


def convert_kgs_to_lbs(kgs, decimals=2):
    """Convert a weight in kilograms to pounds, rounded to a fixed precision.

    Parameters
    ----------
    kgs : number or array-like supporting multiplication by a float
        Weight(s) in kilograms, e.g. a Python/NumPy scalar or a NumPy array.
    decimals : int, default 2
        Number of decimal places to round the result to. The default keeps
        the original two-decimal behavior.

    Returns
    -------
    The weight(s) in pounds, as returned by ``np.round``.
    """
    return np.round(kgs * LBS_PER_KG, decimals)

198.0