# Frailty Data - Reproducible Workflow

### Import Packages

In [21]:
import os
import pandas as pd
import numpy as np
import statsmodels.api as sm

### Create Directory Structure

In [7]:
# Define Directories
parent_dir = 'Assignment_1-1_Frailty-Reproducible-Workflow'
data_raw_dir = 'data_raw'
data_clean_dir = 'data_clean'
results_dir = 'results'
source_dir = 'src'

# Create Directories
os.mkdir(f'./{parent_dir}')
os.mkdir(f'./{parent_dir}/{data_raw_dir}')
os.mkdir(f'./{parent_dir}/{data_clean_dir}')
os.mkdir(f'./{parent_dir}/{results_dir}')
os.mkdir(f'./{parent_dir}/{source_dir}')

## Data Collection

As the dataset was provided in a Word Document, it was copied into a spreadsheet and modified with two additional, synthetic rows that contained NaNs.

It was saved as "frailty.csv".

At this point, upload or copy frailty.csv into:

In [8]:
print(f'./{parent_dir}/{data_raw_dir}')

./Assignment_1-1_Frailty-Reproducible-Workflow/data_raw


Then, create a README

In [13]:
with open(f'./{parent_dir}/{data_raw_dir}/README.md', "w") as file: # Create a (mostly) empty README
    file.write("Raw Data Metadata")

**The README will need to be manually updated with the appropriate field data.**

### Import the Dataset

In [14]:
# Create the Dataframe
data_raw = pd.read_csv(f'./{parent_dir}/{data_raw_dir}/frailty.csv')
data_raw.head(15)

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112.0,30,30,N
1,71.5,136.0,19,31,N
2,69.4,153.0,45,29,N
3,67.6,,34,27,Y
4,68.2,142.0,22,28,Y
5,67.8,144.0,29,24,Y
6,68.7,123.0,50,26,N
7,69.8,141.0,51,22,Y
8,70.1,136.0,23,20,Y
9,67.9,112.0,17,19,N


## Data Processing/Cleaning

Data preprocessing will be done systematically, here.  Since the only data in need of cleaning where the manually-created NaNs, will just be doing basic imputing.

In [16]:
# Check for NaNs
data_raw.isna().sum()

Height           1
Weight           1
Age              0
Grip strength    0
Frailty          0
dtype: int64

In [17]:
# Create a new dataframe while impute the NaNs with the columns' means
data_clean = data_raw.fillna(data_raw[data_raw.select_dtypes(include=np.number).columns].mean())

# Confirm there are no NaNs in the cleaned dataframe
data_clean.isna().sum()

Height           0
Weight           0
Age              0
Grip strength    0
Frailty          0
dtype: int64

In [55]:
# Print the new cleaned data as a sanity check
data_clean.head(15)

Unnamed: 0,Height,Weight,Age,Grip strength,Frailty
0,65.8,112.0,30,30,N
1,71.5,136.0,19,31,N
2,69.4,153.0,45,29,N
3,67.6,133.181818,34,27,Y
4,68.2,142.0,22,28,Y
5,67.8,144.0,29,24,Y
6,68.7,123.0,50,26,N
7,69.8,141.0,51,22,Y
8,70.1,136.0,23,20,Y
9,67.9,112.0,17,19,N


In [18]:
# Save the new dataframe as a csv
data_clean.to_csv(f'./{parent_dir}/{data_clean_dir}/frailty_clean.csv')

## Data Analysis

Since this is a classification problem (frailty: yes/no), we're doing basic Logistic Regression.  No train/test split, just checking predictor significance.

In [53]:
X = data_clean.drop(['Frailty'],axis=1) # Define predictors
y = data_clean['Frailty'] == 'Y' # Define label, where "Y" = 1/True

lr_model = sm.Logit(y, X) # Define the LR model
results = lr_model.fit() # Train the model
print(results.summary()) # Print the results

Optimization terminated successfully.
         Current function value: 0.325829
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:                Frailty   No. Observations:                   12
Model:                          Logit   Df Residuals:                        8
Method:                           MLE   Df Model:                            3
Date:                Mon, 10 Jun 2024   Pseudo R-squ.:                  0.5299
Time:                        21:46:44   Log-Likelihood:                -3.9099
converged:                       True   LL-Null:                       -8.3178
Covariance Type:            nonrobust   LLR p-value:                   0.03185
                    coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------------
Height           -0.2018      0.163     -1.240      0.215      -0.521       0.117
Weight            0.

In [58]:
with open(f'./{parent_dir}/{results_dir}/results.txt', "w") as file: # Write the results to a text file
    file.write(f'{results.summary()}')

### Cleanup

In [63]:
!zip -r './{parent_dir}.zip' './{parent_dir}' # Zip up the directory to be downloaded

  adding: Assignment_1-1_Frailty-Reproducible-Workflow/ (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/results/ (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/results/results.txt (deflated 68%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/data_raw/ (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/data_raw/frailty.csv (deflated 35%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/data_raw/README.md (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/src/ (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/data_clean/ (stored 0%)
  adding: Assignment_1-1_Frailty-Reproducible-Workflow/data_clean/frailty_clean.csv (deflated 39%)


Finally, don't forget to save this notebook in:

In [59]:
print(f'./{parent_dir}/{source_dir}')

./Assignment_1-1_Frailty-Reproducible-Workflow/src
