Read the Frailty dataset from GitHub for Assignment #1

In [70]:
import pandas as pd
import numpy as np

# Stage 1: Ingest
# Load the raw frailty measurements from the CSV file hosted on GitHub into a
# pandas DataFrame named `df`, then preview the first rows.
raw_data_url = 'https://raw.githubusercontent.com/mosomo82/Frailty_Study/refs/heads/main/raw_data/raw_data.csv'
df = pd.read_csv(raw_data_url)

print("--- Stage 1: Ingest Complete--- ")
print(df.head())

# Visual separator between this stage's output and the next.
stage_separator = "=" * 40
print("\n" + stage_separator + "\n")

--- Stage 1: Ingest Complete--- 
   Height  Weight  Age  Grip strength Frailty
0    65.8     112   30             30       N
1    71.5     136   19             31       N
2    69.4     153   45             29       N
3    68.2     142   22             28       Y
4    67.8     144   29             24       Y




Preprocess the raw dataset

In [71]:
# Stage 2: Process
# This stage involves cleaning, transforming, and enriching the data.
print("--- Stage 2: Process Initiated ---")

# Check for missing values: count the nulls in every column and report them.
missing_values = df.isnull().sum()
if missing_values.any():
  print("Missing values found in the dataset.")
  print(missing_values)
  # The dataset is generally clean and the sample is small, so rows with
  # missing values are deliberately kept rather than dropped. The earlier
  # `df.dropna(inplace=False)` call discarded its result and therefore never
  # removed anything; it is omitted here so the code matches that intent.
else:
  print("No missing values found in the dataset.")

# Format the column names for professionalism, clarity and consistency:
# spaces become underscores, only the first letter stays upper-case, and the
# measurement columns receive a unit suffix.
df.columns = df.columns.str.replace(' ', '_').str.capitalize()
df = df.rename(columns={
  'Height': 'Height_in',
  'Weight': 'Weight_lb',
  'Age': 'Age_yr',
  'Grip_strength': 'Grip_strength_kg',
})

# Output to CSV file as clean data (index=False keeps the row index out of the file)
df.to_csv('clean_data.csv', index=False)



--- Stage 2: Process Initiated ---
No missing values found in the dataset.


a) Unit standardization
      i.	Height_m = Height_in * 0.0254
      ii.	Weight_kg = Weight_lb * 0.45359237


In [72]:
# a. Unit standardization
# Conversion factors used below.
METERS_PER_INCH = 0.0254
KILOGRAMS_PER_POUND = 0.45359237

# i. Convert height from inches to meters and round to 2 decimal places.
height_m = df['Height_in'].mul(METERS_PER_INCH)
df['Height_m'] = height_m.round(decimals=2)

# ii. Convert weight from pounds to kilograms and round to 2 decimal places.
weight_kg = df['Weight_lb'].mul(KILOGRAMS_PER_POUND)
df['Weight_kg'] = weight_kg.round(decimals=2)


b) Feature engineering
      i. BMI = Weight_kg / (Height_m ** 2) (round to 2 decimals).
      ii. AgeGroup (categorical): "<30", "30–45", "46–60", ">60" based on Age_yr


In [73]:
# b. Feature Engineering
# i. Body Mass Index: weight in kg divided by the square of height in m,
#    kept to two decimal places.
df['BMI'] = df['Weight_kg'].div(df['Height_m'].pow(2)).round(2)

# ii. Bucket 'Age_yr' into the categorical 'Age_group' feature.
# With right=True every interval is closed on its right edge, so the edges
# below produce (0, 29], (29, 45], (45, 60] and (60, inf].
labels = ['<30', '30-45', '46-60', '>60']
bins = [0, 29, 45, 60, np.inf]
AgeGroup = pd.cut(df['Age_yr'], bins=bins, labels=labels, right=True)
df['Age_group'] = AgeGroup

c) Categorical → numeric encoding
i. Binary encoding: Frailty_binary (Y→1, N→0, store as int8).
ii. One‑hot encode AgeGroup into columns: AgeGroup_<30, AgeGroup_30–45,
AgeGroup_46–60, AgeGroup_>60

In [74]:
# c. Categorical to numeric encoding
# i. Binary encoding: map Frailty 'Y' -> 1 and 'N' -> 0, stored as int8.
df['Frailty_binary'] = df['Frailty'].map({'Y': 1, 'N': 0}).astype('int8')

# ii. One-hot encode Age_group: the column is replaced by one indicator column
#     per age label, each prefixed with 'Age_group_'.
df = pd.get_dummies(df, columns=['Age_group'], prefix='Age_group')

print("--- Stage 2: Process Complete--- ")
# Display the head of the encoded DataFrame
print("Final Processed DataFrame: ")
print(df.head())
print("\n" + "="*40 + "\n")

# Output the final processed data into CSV file.
# index=False matches the clean_data.csv export and keeps the row index from
# being written as an extra unnamed first column.
df.to_csv('process_data.csv', index=False)

--- Stage 2: Process Complete--- 
Final Processed DataFrame: 
   Height_in  Weight_lb  Age_yr  Grip_strength_kg Frailty  Height_m  \
0       65.8        112      30                30       N      1.67   
1       71.5        136      19                31       N      1.82   
2       69.4        153      45                29       N      1.76   
3       68.2        142      22                28       Y      1.73   
4       67.8        144      29                24       Y      1.72   

   Weight_kg    BMI  Frailty_binary  Age_group_<30  Age_group_30-45  \
0      50.80  18.22               0          False             True   
1      61.69  18.62               0           True            False   
2      69.40  22.40               0          False             True   
3      64.41  21.52               1           True            False   
4      65.32  22.08               1           True            False   

   Age_group_46-60  Age_group_>60  
0            False          False  
1           

d. EDA & Reporting
i. Compute summary table: mean/median/std for numeric columns; save to findings.md
ii. Quantify relation of strength ↔ frailty: compute correlation between Grip_strength_kg
and Frailty_binary, and report it.

In [75]:
# --- Stage 3: Analyze ---
# This stage involves computing statistics and generating reports.
print("--- Stage 3: Anaylize Initiated ---")

# d. EDA & Reporting
# i. Compute summary table (mean/median/std) for all numeric columns.
numeric_cols = df.select_dtypes(include=[np.number]).columns
summary_table = df[numeric_cols].agg(['mean', 'median', 'std']).round(2)

# Convert the summary table to markdown format and save it to a Markdown file
summary_table.to_markdown('findings.md')
print("Summary table created and saved to 'findings.md'")
print(summary_table)
print("-" * 20)

# ii. Quantify the relationship between grip strength and frailty
# Compute the Pearson correlation coefficient (Series.corr's default method).
correlation = df['Grip_strength_kg'].corr(df['Frailty_binary'])
# Series.corr is documented as returning `float`, and a plain Python float has
# no .round() method, so use the built-in round(), which accepts both Python
# and NumPy floats and prints the same text.
print(f"Correlation between Grip Strength and Frailty: {round(correlation, 2)}")
print("--- Stage 3: Analyze Complete ---")

--- Stage 3: Anaylize Initiated ---
Summary table created and saved to 'findings.md'
        Height_in  Weight_lb  Age_yr  Grip_strength_kg  Height_m  Weight_kg  \
mean        68.60     131.90   32.50             26.00      1.74      59.83   
median      68.45     136.00   29.50             27.00      1.74      61.69   
std          1.67      14.23   12.86              4.52      0.04       6.46   

          BMI  Frailty_binary  
mean    19.72            0.40  
median  19.15            0.00  
std      1.79            0.52  
--------------------
Correlation between Grip Strength and Frailty: -0.48
--- Stage 3: Analyze Complete ---
