## CBC [ Study , Analysis , Detection ]
<img src="binary.png" alt="Drawing" width="4000"/>


# 1 . 1  Load and Explore the Data

In [53]:
#  Environment preparation
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
!cd NHANES
!pwd

/home/tallawi/CBC _model


In [29]:

# Adjust the file paths here if needed
file_paths = [f'cbc{i}.XPT' for i in range(1, 14)]

# Read every file, keep the loaded frame, and show its structure
all_data = []
for file in file_paths:
    df = pd.read_sas(file, format='xport')  # read the file in XPT (SAS transport) format
    all_data.append(df)  # the list was declared but never filled before
    print(f'Info for {file}:')
    # DataFrame.info() prints by itself and returns None, so wrapping it in
    # print() only added a stray "None" line after every summary.
    df.info()
    print('========================================')


FileNotFoundError: [Errno 2] No such file or directory: 'cbc1.XPT'

## 1.2  comment  : 
     #       Combined data columns: Index(['SEQN', 'WTPH2YR', 'LBXWBCSI', 'LBXLYPCT', 'LBXMOPCT', 'LBXNEPCT',
      #  'LBXEOPCT', 'LBXBAPCT', 'LBDLYMNO', 'LBDMONO', 'LBDNENO', 'LBDEONO',
      #  'LBDBANO', 'LBXRBCSI', 'LBXHGB', 'LBXHCT', 'LBXMCVSI', 'LBXMC',
      #  'LBXMCHSI', 'LBXRDW', 'LBXPLTSI', 'LBXMPSI', 'LBXNRBC', 'LB2DAY',
      #  'LB2WBCSI', 'LB2LYPCT', 'LB2MOPCT', 'LB2NEPCT', 'LB2EOPCT', 'LB2BAPCT',
      #  'LB2LYMNO', 'LB2MONO', 'LB2NENO', 'LB2EONO', 'LB2BANO', 'LB2RBCSI',
      #  'LB2HGB', 'LB2HCT', 'LB2MCVSI', 'LB2MCHSI', 'LB2MC', 'LB2RDW',
      #  'LB2PLTSI', 'LB2MPSI'],
      # dtype='object')

**We need to handle a special case for file 3: rename its `LB2*` columns to the matching `LBX*` / `LBD*` names.**

## 2 . preparing and cleaning

### 2 . 1  merge all files

In [None]:
from pathlib import Path

# Some files (file 3 in this data set) name their measurements LB2*; this maps
# them onto the LBX*/LBD* names used by the other files so that the columns
# line up when the frames are concatenated.
_LB2_TO_STANDARD_COLUMNS = {
    'LB2WBCSI': 'LBXWBCSI',
    'LB2LYPCT': 'LBXLYPCT',
    'LB2MOPCT': 'LBXMOPCT',
    'LB2NEPCT': 'LBXNEPCT',
    'LB2EOPCT': 'LBXEOPCT',
    'LB2BAPCT': 'LBXBAPCT',
    'LB2LYMNO': 'LBDLYMNO',
    'LB2MONO': 'LBDMONO',
    'LB2NENO': 'LBDNENO',
    'LB2EONO': 'LBDEONO',
    'LB2BANO': 'LBDBANO',
    'LB2RBCSI': 'LBXRBCSI',
    'LB2HGB': 'LBXHGB',
    'LB2HCT': 'LBXHCT',
    'LB2MCVSI': 'LBXMCVSI',
    'LB2MCHSI': 'LBXMCHSI',
    'LB2MC': 'LBXMC',
    'LB2RDW': 'LBXRDW',
    'LB2PLTSI': 'LBXPLTSI',
    'LB2MPSI': 'LBXMPSI',
}


def merge_cbc_files(file_pattern='cbc*.XPT'):
    """
    Merge multiple CBC XPT files into a single DataFrame.

    Every file in the current working directory that matches ``file_pattern``
    is read with ``pd.read_sas``, tagged with its file name in a
    ``source_file`` column, has any LB2* columns renamed to their LBX*/LBD*
    equivalents, and is then concatenated with the others. A file that cannot
    be read is reported on stdout and skipped.

    Args:
        file_pattern (str): Pattern to match XPT files (default: 'cbc*.XPT')

    Returns:
        pandas.DataFrame: Merged data from all CBC files, sorted by SEQN

    Raises:
        ValueError: If no file matched the pattern or none could be read.
    """
    dfs = []

    for file_path in sorted(Path('.').glob(file_pattern)):
        # Only the read is expected to fail; keep the try body minimal so
        # programming errors further down are not silently reported as
        # "bad file".
        try:
            df = pd.read_sas(file_path)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole merge.
            print(f"Error processing {file_path.name}: {str(e)}")
            continue

        # Remember which file each row came from
        df['source_file'] = file_path.name

        # rename() ignores keys that are absent, so this is a no-op for files
        # that already use the standard names.
        df = df.rename(columns=_LB2_TO_STANDARD_COLUMNS)

        dfs.append(df)
        print(f"Processed {file_path.name}: {len(df)} rows")

    if not dfs:
        # pd.concat([]) would raise an opaque "No objects to concatenate";
        # say what was searched for and where instead.
        raise ValueError(
            f"No CBC files could be loaded for pattern {file_pattern!r} "
            f"in {Path.cwd()}"
        )

    merged_df = pd.concat(dfs, ignore_index=True)
    return merged_df.sort_values('SEQN')

def save_and_analyze(df, output_file='merged_cbc_data.csv'):
    """
    Write the merged table to CSV and print a completeness report.

    The report lists the number of rows, the number of distinct SEQN values,
    and, for each column, how many values are non-null (count and percent).

    Args:
        df (pandas.DataFrame): Merged DataFrame; must contain a 'SEQN' column
        output_file (str): Output CSV filename
    """
    # Persist first, without the row index
    df.to_csv(output_file, index=False)

    row_total = len(df)

    # Headline numbers
    print("\nMerged Data Summary:")
    print(f"Total rows: {row_total:,}")
    print(f"Total unique subjects (SEQN): {df['SEQN'].nunique():,}")

    # Per-column completeness
    print("\nColumns and non-null counts:")
    for column_name in df.columns:
        filled = df[column_name].count()
        share = (filled / row_total) * 100
        print(f"{column_name}: {filled:,} ({share:.1f}%)")

# Main execution
if __name__ == "__main__":
    print("Starting CBC files merge...")
    try:
        # Merge files, then save and summarise the result
        merged_data = merge_cbc_files()
        save_and_analyze(merged_data)
    except Exception as e:
        # Report the failure, then re-raise: the following cells need
        # `merged_data`, so carrying on after a failed merge would only
        # surface later as a confusing NameError.
        print(f"An error occurred: {str(e)}")
        raise

    print("\nMerge completed successfully!")

## 2.2 optimize columns

In [None]:
# Keep only the CBC measurements: drop the subject id, the weight column,
# the nucleated-RBC column, the provenance tag and the second-day marker.
df = merged_data.drop(
    columns=['SEQN', 'WTPH2YR', 'LBXNRBC', 'source_file', 'LB2DAY'],
)

In [None]:
# (original column name, short name) pairs used to relabel the columns
_short_name_pairs = (
    ('LBXWBCSI', 'WBC'),    # White Blood Cell Count
    ('LBXLYPCT', 'LY%'),    # Lymphocytes Percentage
    ('LBXMOPCT', 'MO%'),    # Monocytes Percentage
    ('LBXNEPCT', 'NE%'),    # Neutrophils Percentage
    ('LBXEOPCT', 'EO%'),    # Eosinophils Percentage
    ('LBXBAPCT', 'BA%'),    # Basophils Percentage
    ('LBDLYMNO', 'LY#'),    # Lymphocyte Count
    ('LBDMONO', 'MO#'),     # Monocyte Count
    ('LBDNENO', 'NE#'),     # Neutrophil Count
    ('LBDEONO', 'EO#'),     # Eosinophil Count
    ('LBDBANO', 'BA#'),     # Basophil Count
    ('LBXRBCSI', 'RBC'),    # Red Blood Cell Count
    ('LBXHGB', 'HGB'),      # Hemoglobin
    ('LBXHCT', 'HCT'),      # Hematocrit
    ('LBXMCVSI', 'MCV'),    # Mean Corpuscular Volume
    ('LBXMCHSI', 'MCH'),    # Mean Corpuscular Hemoglobin
    ('LBXMC', 'MCHC'),      # Mean Corpuscular Hemoglobin Concentration
    ('LBXRDW', 'RDW'),      # Red Cell Distribution Width
    ('LBXPLTSI', 'PLT'),    # Platelet Count
    ('LBXMPSI', 'MPV'),     # Mean Platelet Volume
)
column_mapping = dict(_short_name_pairs)

# Relabel the columns, then print the frame summary to verify the new names
df = df.rename(columns=column_mapping)
df.info()

In [None]:
# Remove exact duplicate rows, then every row that still has a missing value
df = df.drop_duplicates()
df = df.dropna()

# 3 . E D A
## 3.1 Describe DataFrame of CBC

- **WBC, NE#, LY#, MO#, EO#, BA#** — *White Blood Cells*
- **RBC, HGB, HCT, MCV, MCH, MCHC, RDW** — *Red Blood Cells*
- **PLT, MPV** — *Platelets* (PCT and PDW are not available in this dataset)



In [None]:
# Summary statistics for the cleaned CBC columns; the bare name on the last
# line makes the notebook render the resulting table.
df_describe = df.describe()
df_describe

## 3.2 Splitting
 Split dataframe vertically based on features 

 ### Data Division Overview

- **Dataset**
  - **RBC Features**
    - RBC: Red Blood Cell Count
    - HGB: Hemoglobin Level
    - HCT: Hematocrit Percentage
    - MCV: Mean Corpuscular Volume
    - MCH: Mean Corpuscular Hemoglobin
    - MCHC: Mean Corpuscular Hemoglobin Concentration
    - RDW: Red Cell Distribution Width
    - PLT: Platelet Count

  - **WBC Features**
    - WBC: White Blood Cell Count
    - NE#: Neutrophil Count
    - LY#: Lymphocyte Count
    - MO#: Monocyte Count
    - EO#: Eosinophil Count
    - BA#: Basophil Count
    - NE%: Neutrophil Relative count
    - LY%: Lymphocyte Relative count
    - MO%: Monocyte Relative count
    - EO%: Eosinophil Relative count
    - BA%: Basophil Relative count
    - PLT: Platelet Count

  - **Platelet Features**
    - PLT: Platelet Count
    - MPV: Mean Platelet Volume


In [None]:
# Column groups for the vertical split (PLT is shared by all three groups)
rbc_features = ['RBC', 'HGB', 'HCT', 'MCV', 'MCH', 'MCHC', 'RDW', 'PLT']
wbc_features = [
    'WBC',
    'NE#', 'LY#', 'MO#', 'EO#', 'BA#',
    'NE%', 'LY%', 'MO%', 'EO%', 'BA%',
    'PLT',
]
plt_features = ['PLT', 'MPV']

# One sub-frame per feature group
df_rbc, df_wbc, df_plt = (
    df[group] for group in (rbc_features, wbc_features, plt_features)
)

# 4. RBC <img src="rbc.png" alt="Smiley face" width="30" height="30" style="vertical-align:middle;margin:0px 50px">
## 4.1 Splitting the data frame based on HGB and RBC
 
### Data Splitting Overview

In this analysis, the data has been divided into multiple sections based on specific criteria related to blood measurements:

#### 1. **Primary Data Splitting**
The data has been initially divided into three main groups based on Hemoglobin (HGB) and Red Blood Cells (RBC) levels:

- **Anemia DataFrame (`df_anemia`)**: Contains records where:
  - HGB < 12.0 or RBC < 3.8 (indicating anemia).

- **Greater RBC DataFrame (`df_RBC_GREATER`)**: Contains records where:
  - HGB > 17.5 or RBC > 5.9 (indicating elevated levels), and neither anemia criterion above is met.

- **Normal RBC DataFrame (`df_RBC_NORMAL`)**: Contains records with normal ranges:
  - HGB is within the interval [12.0, 17.5]
  - RBC is within the interval [3.8, 5.9]

#### 2. **Anemia Classification**
The `df_anemia` DataFrame is further classified into three types of anemia based on MCV (Mean Corpuscular Volume):

- **Microcytic Anemia**: MCV < 80
- **Normocytic Anemia**: 80 ≤ MCV ≤ 100
- **Macrocytic Anemia**: MCV > 100

This structured approach allows for a clearer analysis of the relationships between blood parameters and different types of anemia.



In [None]:

# Label one record with its anemia type, decided by the MCV value alone
def classify_anemia(row):
    """
    Return the anemia type for a record based on its 'MCV' entry.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with an 'MCV' key

    Returns:
        str: 'Microcytic' (MCV < 80), 'Normocytic' (80 <= MCV <= 100),
        'Macrocytic' (MCV > 100), or 'Unknown' when none of the comparisons
        holds (e.g. MCV is NaN).
    """
    mcv = row['MCV']

    if mcv < 80:
        return 'Microcytic'
    if mcv <= 100:  # already known not to be below 80
        return 'Normocytic'
    if mcv > 100:
        return 'Macrocytic'
    # Reached only when every comparison is False, as for NaN
    return 'Unknown'

# Splitting the data based on HGB and RBC values.
# .copy() gives df_anemia its own data, so adding the 'AnemiaType' column
# below does not trigger pandas' SettingWithCopyWarning.
df_anemia = df_rbc[(df_rbc['HGB'] < 12.0) | (df_rbc['RBC'] < 3.8)].copy()


# Define normal ranges (both bounds inclusive)
hgb_normal_range = pd.Interval(left=12.0, right=17.5, closed='both')  # Adjust right value based on gender if needed
rbc_normal_range = pd.Interval(left=3.8, right=5.9, closed='both')

# Vectorized membership masks; Series.between is inclusive on both ends by
# default, matching the closed intervals above, and avoids a Python-level
# call per row.
_hgb_normal = df_rbc['HGB'].between(hgb_normal_range.left, hgb_normal_range.right)
_rbc_normal = df_rbc['RBC'].between(rbc_normal_range.left, rbc_normal_range.right)
_hgb_high = df_rbc['HGB'] > hgb_normal_range.right
_rbc_high = df_rbc['RBC'] > rbc_normal_range.right

# Both values inside their normal range
df_RBC_NORMAL = df_rbc[_hgb_normal & _rbc_normal]

# At least one value elevated while the other is normal or elevated, so rows
# that also meet an anemia criterion stay out of this group.
df_RBC_GREATER = df_rbc[
    (_hgb_normal & _rbc_high) |
    (_rbc_normal & _hgb_high) |
    (_hgb_high & _rbc_high)
]


# Applying anemia classification to the df_anemia DataFrame
df_anemia['AnemiaType'] = df_anemia.apply(classify_anemia, axis=1)

# # Print results
# print("========================================= Anemia Data =========================================")
# display(df_anemia['AnemiaType'].value_counts())

# print("========================================= df_RBC_NORMAL Data =========================================")
# display(df_RBC_NORMAL.head())

# print("========================================= df_RBC_GREATER Data =========================================")
# display(df_RBC_GREATER.head())

# # Optional: Save these tables to CSV files if needed
# df_anemia.to_csv('anemia_data.csv', index=False)
# df_RBC_NORMAL.to_csv('df_RBC_NORMAL.csv', index=False)
# df_RBC_GREATER.to_csv('df_RBC_GREATER.csv', index=False)

In [None]:
# Show how many anemia records fall into each AnemiaType label
display(df_anemia['AnemiaType'].value_counts())

## 4.2 Anemia
### 4.2.1 Splitting the dataframe **(`df_anemia`)** based on the type of anemia :
<p align="center">
<h3>- Microcytic Anemia</h3>
<h3>- Normocytic Anemia</h3>
<h3>- Macrocytic Anemia</h3>

  <img src="anemiatype.gif" width="700" height="100" style="vertical-align:middle;margin:0px 50px">
</p>

In [None]:

# One sub-frame per anemia label
_anemia_label = df_anemia['AnemiaType']
df_microcytic = df_anemia.loc[_anemia_label.eq('Microcytic')]
df_normocytic = df_anemia.loc[_anemia_label.eq('Normocytic')]
df_macrocytic = df_anemia.loc[_anemia_label.eq('Macrocytic')]


## 4.2.1 describe Anemia Types : 

In [None]:

# A notebook only renders the value of a cell's last expression, so each
# summary is passed to display() explicitly; as bare expressions the first
# three tables were computed and then discarded.
print( '========================================= All Anemia =========================================')
display(df_anemia.describe())
print( '========================================= Microcytic =========================================')
display(df_microcytic.describe())
print( '========================================= Normocytic =========================================')
display(df_normocytic.describe())
print( '========================================= Macrocytic =========================================')
display(df_macrocytic.describe())

In [None]:
# Every numeric column of the anemia frame gets its own violin plot,
# grouped by anemia type
numeric_columns = list(df_anemia.select_dtypes(include=[np.number]).columns)

for feature in numeric_columns:
    _, ax = plt.subplots(figsize=(8, 6))
    sns.violinplot(x='AnemiaType', y=feature, data=df_anemia, ax=ax)
    ax.set_title(f'Violin Plot for {feature}')
    plt.show()


## 4.2.1 Comment:

### Post-analysis observations

1. **HGB and RBC distribution:**
- The distributions of *HGB* and *RBC* differ significantly across anemia types.
- This indicates that the type of anemia is associated with different levels of these blood-cell parameters.

2. **HCT and PLT variation:**
- Slight variation in *HCT* and *PLT* was observed across different anemia types.

3. **MCH and MCHC variations:**
- There is a significant difference in *MCH* values, although *MCHC* is unchanged.

4. **Macrocytic vs. Microcytic and Normocytic Anemia:**
- *Macrocytic anemia* shows significant differences in several features compared to *Microcytic* and *Normocytic* anemia types.
- Although the overall values may appear similar, the *violin plot* revealed distinct details and patterns, highlighting the uniqueness of macrocytic anemia.

In general, deeper insights will be revealed through correlation analysis (see section 4.2.2, *Correlation Anemia Types*).

## 4.2.2 Correlation Anemia Types : 

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def correlation_plots(correlation_matrices, titles, layout='single', fig_size=(7, 7)):
    """
    Plot correlation heatmaps for one or multiple matrices.

    Parameters:
    - correlation_matrices (list of DataFrames): List of correlation matrices to plot.
    - titles (list of str): One title per correlation matrix (extra titles are ignored).
    - layout (str): 'single' draws every matrix in its own figure, 'multi' draws
      them together in one figure with up to 3 heatmaps per row.
    - fig_size (tuple): Size of each plot (for 'single') or of entire figure (for 'multi').

    Raises:
    - ValueError: If `layout` is not 'single' or 'multi', or if there are
      fewer titles than matrices.

    Nothing is drawn when `correlation_matrices` is empty.
    """
    if layout not in ('single', 'multi'):
        # Previously an unknown layout silently produced no plot at all.
        raise ValueError(f"layout must be 'single' or 'multi', got {layout!r}")

    num_plots = len(correlation_matrices)
    if len(titles) < num_plots:
        # Fail before any figure is created instead of part-way through.
        raise ValueError(
            f"Expected at least {num_plots} titles, got {len(titles)}"
        )
    if num_plots == 0:
        # Nothing to draw; the grid arithmetic below would divide by zero.
        return

    heatmap_style = dict(annot=True, fmt=".2f", cmap='coolwarm', square=True)

    if layout == 'single':
        # Plot each matrix in a separate figure
        for matrix, title in zip(correlation_matrices, titles):
            plt.figure(figsize=fig_size)
            sns.heatmap(matrix, **heatmap_style)
            plt.title(title)
            plt.show()
        return

    # layout == 'multi': all matrices in a single figure with subplots
    cols = min(num_plots, 3)  # Up to 3 per row for better layout
    rows = (num_plots + cols - 1) // cols
    # squeeze=False always returns a 2-D array of axes, so one matrix needs
    # no special casing.
    fig, axes = plt.subplots(rows, cols, figsize=fig_size, squeeze=False)
    axes = axes.ravel()

    for ax, matrix, title in zip(axes, correlation_matrices, titles):
        sns.heatmap(matrix, ax=ax, **heatmap_style)
        ax.set_title(title)

    # Hide any grid cells left over in the last row
    for ax in axes[num_plots:]:
        ax.axis('off')

    plt.tight_layout()
    plt.show()



def scatter_plot(fet_1,fet_2 , DataFrame):
    """
    Scatter two columns against each other with a linear and a quadratic fit.

    Parameters:
    - fet_1 (str): Column plotted on the x axis.
    - fet_2 (str): Column plotted on the y axis.
    - DataFrame (pandas.DataFrame): Data containing both columns.
    """
    plt.figure(figsize=(10,6))

    # Draw the points once, together with the first-order (linear) fit.
    # regplot's `label` goes to the scatter when scatter=True, so the fit
    # line is labelled through line_kws to appear correctly in the legend.
    sns.regplot(data=DataFrame, x=fet_1, y=fet_2, scatter=True, order=1,
                line_kws={'label': 'Linear'})

    # Add the non-linear (second-order polynomial) curve; scatter=False keeps
    # the same points from being plotted a second time on top of the first.
    sns.regplot(data=DataFrame, x=fet_1, y=fet_2, scatter=False, order=2,
                label='Polynomial')

    plt.title(f'{fet_1} vs {fet_2}: Linear vs Non-linear Relationship')
    plt.legend()
    plt.show()

In [None]:
# Correlation matrices: the label column is dropped so that only the numeric
# features are correlated.
# All anemia records
correlation_matrix = df_anemia.drop(columns=['AnemiaType']).corr()
# Microcytic records
correlation_matrix_micro = df_microcytic.drop(columns=['AnemiaType']).corr()
# Normocytic records
correlation_matrix_normo = df_normocytic.drop(columns=['AnemiaType']).corr()
# Macrocytic records
correlation_matrix_macro = df_macrocytic.drop(columns=['AnemiaType']).corr()


# Overall heatmap in its own figure
correlation_plots(
    correlation_matrices=[correlation_matrix],
    titles=['Anemia'],
    layout='single',
    fig_size=(7, 7),
)

# Per-type heatmaps
_per_type_matrices = [
    correlation_matrix_macro,
    correlation_matrix_normo,
    correlation_matrix_micro,
]
_per_type_titles = [
    'Macrocytic Correlation',
    'Normocytic Correlation',
    'Microcytic Correlation',
]
correlation_plots(
    _per_type_matrices,
    _per_type_titles,
    layout='multi',  # draw the plots side by side in the same figure
    fig_size=(18, 6),  # size of the whole figure when using `multi`
)


In [None]:
def create_correlation_comparison(correlation_matrices, upper_threshold=0.5, lower_threshold=-0.4):
    """
    Compare strong feature correlations across the different anemia types.

    For every matrix, each feature pair above the diagonal whose correlation
    is greater than `upper_threshold` or smaller than `lower_threshold` is
    recorded. The pairs are then laid out one per row, with one column per
    anemia type.

    Parameters:
    correlation_matrices (dict): Maps each anemia type to its correlation matrix
    upper_threshold (float): Correlations above this value count as strong
    lower_threshold (float): Correlations below this value count as strong

    Returns:
    pd.DataFrame: A 'Relation' column ("<column feature>/<row feature>") plus
    one column per key of `correlation_matrices` (in sorted order) holding the
    correlation value, or 'No Strong Correlation' where that type has no
    strong correlation for the pair. The frame has zero rows when no type has
    any strong correlation.
    """
    # Every requested type gets a column, even one without strong pairs, so
    # callers can always sort or select by type name.
    type_columns = sorted(correlation_matrices)

    # Collect the strong relations of every anemia type
    all_relations = []
    for anemia_type, corr_matrix in correlation_matrices.items():
        # Keep only the strict upper triangle so each pair is seen once
        upper_mask = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        upper_triangle = corr_matrix.where(upper_mask)

        # Blank out everything that is not a strong correlation
        strong_correlations = upper_triangle[
            (upper_triangle > upper_threshold) |
            (upper_triangle < lower_threshold)
        ]

        # DataFrame.items() walks the columns, so the first label in the
        # relation name is the column feature and the second the row feature.
        for column_feature, column_values in strong_correlations.items():
            for row_feature, corr_value in column_values.dropna().items():
                all_relations.append({
                    'Relation': f"{column_feature}/{row_feature}",
                    'Anemia_Type': anemia_type,
                    'Correlation': corr_value
                })

    if not all_relations:
        # pivot() on a frame without columns would raise a KeyError; return
        # an empty table with the expected layout instead.
        return pd.DataFrame(columns=['Relation', *type_columns])

    relations_df = pd.DataFrame(all_relations)

    # One row per relation, one column per anemia type
    comparison_df = relations_df.pivot(
        index='Relation',
        columns='Anemia_Type',
        values='Correlation'
    )

    # Add the columns of types that had no strong correlation at all
    comparison_df = comparison_df.reindex(
        columns=pd.Index(type_columns, name='Anemia_Type')
    ).reset_index()

    # Mark the pairs that are not strongly correlated for a given type
    comparison_df = comparison_df.fillna('No Strong Correlation')

    return comparison_df

# Use the function: pair every anemia type with its correlation matrix
anemia_types = ['Microcytic', 'Normocytic', 'Macrocytic']
correlation_matrices = dict(zip(
    anemia_types,
    (correlation_matrix_micro, correlation_matrix_normo, correlation_matrix_macro),
))

comparison_df = create_correlation_comparison(correlation_matrices)

In [None]:
# Cells may hold the text 'No Strong Correlation'; coercing to numbers turns
# those into NaN for the purpose of ordering.
def _as_numbers(col):
    return pd.to_numeric(col, errors='coerce')


# The comparison table ordered by each type's correlations, strongest first
comparison_df_micro = (
    comparison_df
    .sort_values(by='Microcytic', ascending=False, key=_as_numbers)
    .reset_index(drop=True)
)
comparison_df_normo = (
    comparison_df
    .sort_values(by='Normocytic', ascending=False, key=_as_numbers)
    .reset_index(drop=True)
)
comparison_df_macro = (
    comparison_df
    .sort_values(by='Macrocytic', ascending=False, key=_as_numbers)
    .reset_index(drop=True)
)

# Display the sorted dataframes
for _heading, _table in (
    ("Sorted by Microcytic Correlations:", comparison_df_micro),
    ("\nSorted by Normocytic Correlations:", comparison_df_normo),
    ("\nSorted by Macrocytic Correlations:", comparison_df_macro),
):
    print(_heading)
    display(_table)


In [None]:
# Plot MCV against MCH over the full cleaned frame (not only the anemia rows)
scatter_plot('MCV','MCH' , df)