## CBC: Study, Analysis, Detection
<img src="binary.png" alt="Drawing" width="4000"/>



# 1.1 Load and Explore the Data

In [6]:
#  Environment preparation
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): a `!` command runs in its own temporary subshell, so this `cd`
# does not move the kernel's working directory; `%cd NHANES` is the IPython
# magic that persists across cells -- confirm which one is intended.
!cd NHANES
# Print the working directory reported by a fresh subshell.
!pwd

/bin/bash: line 1: cd: NHANES: No such file or directory
/home/tallawi/CBC _model/NHANES


In [10]:
file_paths = [f'cbc{i}.XPT' for i in range(1, 14)]

# Read each file, keep the parsed frame, and view its information.
all_data = []
for file in file_paths:
    df = pd.read_sas(file, format='xport')  # read the file in XPT (SAS transport) format
    all_data.append(df)  # the list was created but never filled before
    print(f'Info for {file}:')
    # DataFrame.info() prints its own report and returns None, so wrapping it
    # in print() added a stray "None" line after every report.
    df.info()  # show the file's information
    print('========================================')


Info for cbc1.XPT:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8727 entries, 0 to 8726
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   SEQN      8727 non-null   float64
 1   WTPH2YR   8727 non-null   float64
 2   LBXWBCSI  7593 non-null   float64
 3   LBXLYPCT  7582 non-null   float64
 4   LBXMOPCT  7582 non-null   float64
 5   LBXNEPCT  7582 non-null   float64
 6   LBXEOPCT  7582 non-null   float64
 7   LBXBAPCT  7582 non-null   float64
 8   LBDLYMNO  7582 non-null   float64
 9   LBDMONO   7582 non-null   float64
 10  LBDNENO   7582 non-null   float64
 11  LBDEONO   7582 non-null   float64
 12  LBDBANO   7582 non-null   float64
 13  LBXRBCSI  7593 non-null   float64
 14  LBXHGB    7593 non-null   float64
 15  LBXHCT    7593 non-null   float64
 16  LBXMCVSI  7593 non-null   float64
 17  LBXMC     7593 non-null   float64
 18  LBXMCHSI  7593 non-null   float64
 19  LBXRDW    7593 non-null   float64
 20  LBXPLTSI  7

In [13]:
from pathlib import Path

def merge_cbc_files(file_pattern='cbc*.XPT'):
    """
    Merge multiple CBC XPT files into a single DataFrame.

    Every file in the current working directory whose name matches
    ``file_pattern`` is read with ``pd.read_sas`` and tagged with a
    ``source_file`` column holding its file name. A file that contains an
    ``LB2WBCSI`` column has its ``LB2*`` columns renamed to the ``LBX*`` /
    ``LBD*`` names so they line up with the other files. A file that fails to
    load is reported on stdout and skipped.

    Args:
        file_pattern (str): Pattern to match XPT files (default: 'cbc*.XPT')

    Returns:
        pandas.DataFrame: Merged data from all CBC files, sorted by SEQN.
            The index holds the row numbers assigned at concatenation time,
            so it is not in ascending order after the sort.

    Raises:
        ValueError: If no file matched the pattern or none could be read.
    """
    # LB2* -> LBX*/LBD* column names. Built once: it is identical for every file.
    lb2_to_standard = {
        'LB2WBCSI': 'LBXWBCSI',
        'LB2LYPCT': 'LBXLYPCT',
        'LB2MOPCT': 'LBXMOPCT',
        'LB2NEPCT': 'LBXNEPCT',
        'LB2EOPCT': 'LBXEOPCT',
        'LB2BAPCT': 'LBXBAPCT',
        'LB2LYMNO': 'LBDLYMNO',
        'LB2MONO': 'LBDMONO',
        'LB2NENO': 'LBDNENO',
        'LB2EONO': 'LBDEONO',
        'LB2BANO': 'LBDBANO',
        'LB2RBCSI': 'LBXRBCSI',
        'LB2HGB': 'LBXHGB',
        'LB2HCT': 'LBXHCT',
        'LB2MCVSI': 'LBXMCVSI',
        'LB2MCHSI': 'LBXMCHSI',
        'LB2MC': 'LBXMC',
        'LB2RDW': 'LBXRDW',
        'LB2PLTSI': 'LBXPLTSI',
        'LB2MPSI': 'LBXMPSI'
    }

    # Store all dataframes
    dfs = []

    # sorted() orders the names lexicographically (cbc1, cbc10, ..., cbc2, ...).
    matched_files = sorted(Path('.').glob(file_pattern))

    # Process each XPT file
    for file_path in matched_files:
        try:
            # Read the XPT file
            df = pd.read_sas(file_path)

            # Add file source
            df['source_file'] = file_path.name

            # Handle special case for file 3 (rename LB2* columns to LBX*)
            if 'LB2WBCSI' in df.columns:
                df = df.rename(columns=lb2_to_standard)

            dfs.append(df)
            print(f"Processed {file_path.name}: {len(df)} rows")

        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error processing {file_path.name}: {str(e)}")

    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that says what was looked for and where.
    if not dfs:
        raise ValueError(
            f"No CBC files could be loaded for pattern {file_pattern!r} "
            f"({len(matched_files)} matched in {Path('.').resolve()})"
        )

    # Combine all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    # Sort by SEQN
    merged_df = merged_df.sort_values('SEQN')

    return merged_df

def save_and_analyze(df, output_file='merged_cbc_data.csv'):
    """
    Save merged data and print analysis summary.

    Writes ``df`` to ``output_file`` as CSV without the index, then prints the
    row count, the number of distinct SEQN values, and for every column its
    non-null count together with that count's share of all rows.

    Args:
        df (pandas.DataFrame): Merged DataFrame; it is indexed by 'SEQN', so
            that column must be present.
        output_file (str): Output CSV filename

    Returns:
        None
    """
    # Save to CSV
    df.to_csv(output_file, index=False)

    total_rows = len(df)

    # Print summary
    print("\nMerged Data Summary:")
    print(f"Total rows: {total_rows:,}")
    print(f"Total unique subjects (SEQN): {df['SEQN'].nunique():,}")
    print("\nColumns and non-null counts:")
    for col in df.columns:
        non_null = df[col].count()
        # An empty frame has no rows to take a share of: report 0.0% rather
        # than dividing by zero.
        pct_non_null = (non_null / total_rows) * 100 if total_rows else 0.0
        print(f"{col}: {non_null:,} ({pct_non_null:.1f}%)")

# Main execution: merge every CBC file, write the CSV, print the summary.
if __name__ == "__main__":
    try:
        # Merge files
        print("Starting CBC files merge...")
        merged_data = merge_cbc_files()

        # Save and analyze results
        save_and_analyze(merged_data)

        print("\nMerge completed successfully!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Re-raise: swallowing the error left `merged_data` undefined, so later
        # cells failed with an unrelated NameError and no traceback.
        raise

Starting CBC files merge...
Processed cbc1.XPT: 8727 rows
Processed cbc10.XPT: 9440 rows
Processed cbc11.XPT: 8832 rows
Processed cbc12.XPT: 9307 rows
Processed cbc13.XPT: 8366 rows
Processed cbc2.XPT: 13772 rows
Processed cbc3.XPT: 557 rows
Processed cbc4.XPT: 9165 rows
Processed cbc5.XPT: 9422 rows
Processed cbc6.XPT: 8956 rows
Processed cbc7.XPT: 9835 rows
Processed cbc8.XPT: 9929 rows
Processed cbc9.XPT: 9179 rows

Merged Data Summary:
Total rows: 115,487
Total unique subjects (SEQN): 114,930

Columns and non-null counts:
SEQN: 115,487 (100.0%)
WTPH2YR: 8,727 (7.6%)
LBXWBCSI: 103,223 (89.4%)
LBXLYPCT: 103,028 (89.2%)
LBXMOPCT: 103,028 (89.2%)
LBXNEPCT: 103,028 (89.2%)
LBXEOPCT: 103,028 (89.2%)
LBXBAPCT: 103,028 (89.2%)
LBDLYMNO: 103,025 (89.2%)
LBDMONO: 103,025 (89.2%)
LBDNENO: 103,025 (89.2%)
LBDEONO: 103,025 (89.2%)
LBDBANO: 103,025 (89.2%)
LBXRBCSI: 103,226 (89.4%)
LBXHGB: 103,226 (89.4%)
LBXHCT: 103,226 (89.4%)
LBXMCVSI: 103,226 (89.4%)
LBXMC: 103,256 (89.4%)
LBXMCHSI: 103,226 

## Preparing and Cleaning

### Merge All Files

In [19]:
from pathlib import Path

def merge_cbc_files(file_pattern='cbc*.XPT'):
    """
    Merge multiple CBC XPT files into a single DataFrame.

    Every file in the current working directory whose name matches
    ``file_pattern`` is read with ``pd.read_sas`` and tagged with a
    ``source_file`` column holding its file name. A file that contains an
    ``LB2WBCSI`` column has its ``LB2*`` columns renamed to the ``LBX*`` /
    ``LBD*`` names so they line up with the other files. A file that fails to
    load is reported on stdout and skipped.

    Args:
        file_pattern (str): Pattern to match XPT files (default: 'cbc*.XPT')

    Returns:
        pandas.DataFrame: Merged data from all CBC files, sorted by SEQN.
            The index holds the row numbers assigned at concatenation time,
            so it is not in ascending order after the sort.

    Raises:
        ValueError: If no file matched the pattern or none could be read.
    """
    # LB2* -> LBX*/LBD* column names. Built once: it is identical for every file.
    lb2_to_standard = {
        'LB2WBCSI': 'LBXWBCSI',
        'LB2LYPCT': 'LBXLYPCT',
        'LB2MOPCT': 'LBXMOPCT',
        'LB2NEPCT': 'LBXNEPCT',
        'LB2EOPCT': 'LBXEOPCT',
        'LB2BAPCT': 'LBXBAPCT',
        'LB2LYMNO': 'LBDLYMNO',
        'LB2MONO': 'LBDMONO',
        'LB2NENO': 'LBDNENO',
        'LB2EONO': 'LBDEONO',
        'LB2BANO': 'LBDBANO',
        'LB2RBCSI': 'LBXRBCSI',
        'LB2HGB': 'LBXHGB',
        'LB2HCT': 'LBXHCT',
        'LB2MCVSI': 'LBXMCVSI',
        'LB2MCHSI': 'LBXMCHSI',
        'LB2MC': 'LBXMC',
        'LB2RDW': 'LBXRDW',
        'LB2PLTSI': 'LBXPLTSI',
        'LB2MPSI': 'LBXMPSI'
    }

    # Store all dataframes
    dfs = []

    # sorted() orders the names lexicographically (cbc1, cbc10, ..., cbc2, ...).
    matched_files = sorted(Path('.').glob(file_pattern))

    # Process each XPT file
    for file_path in matched_files:
        try:
            # Read the XPT file
            df = pd.read_sas(file_path)

            # Add file source
            df['source_file'] = file_path.name

            # Handle special case for file 3 (rename LB2* columns to LBX*)
            if 'LB2WBCSI' in df.columns:
                df = df.rename(columns=lb2_to_standard)

            dfs.append(df)
            print(f"Processed {file_path.name}: {len(df)} rows")

        except Exception as e:
            # Best effort: report the unreadable file and continue with the rest.
            print(f"Error processing {file_path.name}: {str(e)}")

    # pd.concat([]) fails with an opaque "No objects to concatenate"; raise the
    # same exception type with a message that says what was looked for and where.
    if not dfs:
        raise ValueError(
            f"No CBC files could be loaded for pattern {file_pattern!r} "
            f"({len(matched_files)} matched in {Path('.').resolve()})"
        )

    # Combine all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)

    # Sort by SEQN
    merged_df = merged_df.sort_values('SEQN')

    return merged_df

def save_and_analyze(df, output_file='merged_cbc_data.csv'):
    """
    Save merged data and print analysis summary.

    Writes ``df`` to ``output_file`` as CSV without the index, then prints the
    row count, the number of distinct SEQN values, and for every column its
    non-null count together with that count's share of all rows.

    Args:
        df (pandas.DataFrame): Merged DataFrame; it is indexed by 'SEQN', so
            that column must be present.
        output_file (str): Output CSV filename

    Returns:
        None
    """
    # Save to CSV
    df.to_csv(output_file, index=False)

    total_rows = len(df)

    # Print summary
    print("\nMerged Data Summary:")
    print(f"Total rows: {total_rows:,}")
    print(f"Total unique subjects (SEQN): {df['SEQN'].nunique():,}")
    print("\nColumns and non-null counts:")
    for col in df.columns:
        non_null = df[col].count()
        # An empty frame has no rows to take a share of: report 0.0% rather
        # than dividing by zero.
        pct_non_null = (non_null / total_rows) * 100 if total_rows else 0.0
        print(f"{col}: {non_null:,} ({pct_non_null:.1f}%)")

# Main execution: merge every CBC file, write the CSV, print the summary.
if __name__ == "__main__":
    try:
        # Merge files
        print("Starting CBC files merge...")
        merged_data = merge_cbc_files()

        # Save and analyze results
        save_and_analyze(merged_data)

        print("\nMerge completed successfully!")

    except Exception as e:
        print(f"An error occurred: {str(e)}")
        # Re-raise: swallowing the error left `merged_data` undefined, so later
        # cells failed with an unrelated NameError and no traceback.
        raise

Starting CBC files merge...
Processed cbc1.XPT: 8727 rows
Processed cbc10.XPT: 9440 rows
Processed cbc11.XPT: 8832 rows
Processed cbc12.XPT: 9307 rows
Processed cbc13.XPT: 8366 rows
Processed cbc2.XPT: 13772 rows
Processed cbc3.XPT: 557 rows
Processed cbc4.XPT: 9165 rows
Processed cbc5.XPT: 9422 rows
Processed cbc6.XPT: 8956 rows
Processed cbc7.XPT: 9835 rows
Processed cbc8.XPT: 9929 rows
Processed cbc9.XPT: 9179 rows

Merged Data Summary:
Total rows: 115,487
Total unique subjects (SEQN): 114,930

Columns and non-null counts:
SEQN: 115,487 (100.0%)
WTPH2YR: 8,727 (7.6%)
LBXWBCSI: 103,223 (89.4%)
LBXLYPCT: 103,028 (89.2%)
LBXMOPCT: 103,028 (89.2%)
LBXNEPCT: 103,028 (89.2%)
LBXEOPCT: 103,028 (89.2%)
LBXBAPCT: 103,028 (89.2%)
LBDLYMNO: 103,025 (89.2%)
LBDMONO: 103,025 (89.2%)
LBDNENO: 103,025 (89.2%)
LBDEONO: 103,025 (89.2%)
LBDBANO: 103,025 (89.2%)
LBXRBCSI: 103,226 (89.4%)
LBXHGB: 103,226 (89.4%)
LBXHCT: 103,226 (89.4%)
LBXMCVSI: 103,226 (89.4%)
LBXMC: 103,256 (89.4%)
LBXMCHSI: 103,226 

## Optimize columns

In [24]:
# Discard these five columns by label; every other column of merged_data is kept.
df = merged_data.drop(
    columns=['SEQN', 'WTPH2YR', 'LBXNRBC', 'source_file', 'LB2DAY']
)

In [27]:
# Long source column name -> short display name, listed as (old, new) pairs.
column_mapping = dict([
    ('LBXWBCSI', 'WBC'),    # White Blood Cell Count
    ('LBXLYPCT', 'LY%'),    # Lymphocytes Percentage
    ('LBXMOPCT', 'MO%'),    # Monocytes Percentage
    ('LBXNEPCT', 'NE%'),    # Neutrophils Percentage
    ('LBXEOPCT', 'EO%'),    # Eosinophils Percentage
    ('LBXBAPCT', 'BA%'),    # Basophils Percentage
    ('LBDLYMNO', 'LY#'),    # Lymphocyte Count
    ('LBDMONO', 'MO#'),     # Monocyte Count
    ('LBDNENO', 'NE#'),     # Neutrophil Count
    ('LBDEONO', 'EO#'),     # Eosinophil Count
    ('LBDBANO', 'BA#'),     # Basophil Count
    ('LBXRBCSI', 'RBC'),    # Red Blood Cell Count
    ('LBXHGB', 'HGB'),      # Hemoglobin
    ('LBXHCT', 'HCT'),      # Hematocrit
    ('LBXMCVSI', 'MCV'),    # Mean Corpuscular Volume
    ('LBXMCHSI', 'MCH'),    # Mean Corpuscular Hemoglobin
    ('LBXMC', 'MCHC'),      # Mean Corpuscular Hemoglobin Concentration
    ('LBXRDW', 'RDW'),      # Red Cell Distribution Width
    ('LBXPLTSI', 'PLT'),    # Platelet Count
    ('LBXMPSI', 'MPV'),     # Mean Platelet Volume
])

# Relabel the columns; names absent from the mapping are left untouched.
df = df.rename(column_mapping, axis='columns')

# Print the schema so the new column names can be checked.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115487 entries, 18167 to 8726
Data columns (total 20 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   WBC     103223 non-null  float64
 1   LY%     103028 non-null  float64
 2   MO%     103028 non-null  float64
 3   NE%     103028 non-null  float64
 4   EO%     103028 non-null  float64
 5   BA%     103028 non-null  float64
 6   LY#     103025 non-null  float64
 7   MO#     103025 non-null  float64
 8   NE#     103025 non-null  float64
 9   EO#     103025 non-null  float64
 10  BA#     103025 non-null  float64
 11  RBC     103226 non-null  float64
 12  HGB     103226 non-null  float64
 13  HCT     103226 non-null  float64
 14  MCV     103226 non-null  float64
 15  MCHC    103256 non-null  float64
 16  MCH     103226 non-null  float64
 17  RDW     103226 non-null  float64
 18  PLT     103224 non-null  float64
 19  MPV     103224 non-null  float64
dtypes: float64(20)
memory usage: 18.5 MB


In [30]:
# Write the renamed frame as CSV text without the index column
# (the target name "cbc_dataframe" carries no file extension).
df.to_csv(path_or_buf="cbc_dataframe", index=False)