# Explore All Features - Comprehensive Feature Analysis

This notebook explores all available features in the PNS dataset to create a comprehensive feature set for endometriosis classification.


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import pandasql as pdsql
import json
import gc # Garbage collector


## Load Schema Mappings


In [2]:
# Load schema mappings
with open('../data/schema/data_mapping_2019.json', 'r', encoding='utf-8') as f:
    schema_2019 = json.load(f)
with open('../data/schema/data_mapping_2013.json', 'r', encoding='utf-8') as f:
    schema_2013 = json.load(f)

# Create a mapping of column names to descriptions
column_descriptions = {}
for item in schema_2019 + schema_2013:
    if item['key'] and item['description']:
        column_descriptions[item['key']] = item['description']

# Function to get column description
def get_column_description(col_name):
    return column_descriptions.get(col_name, f"No description available for {col_name}")


## Set style for plots


In [3]:
plt.style.use('seaborn-v0_8')  # Updated to use valid style name
sns.set_theme()  # Using seaborn's default theme


## Read the data


In [4]:
df = pd.read_parquet("../data/staged/PNS_union.parquet")
print(f"Total records: {len(df)}")
print(f"Total features: {len(df.columns)}")


Total records: 251250
Total features: 99


## Basic data information


In [5]:
display(df)

Unnamed: 0,V0020,C008,C009,R010,R011,R012,R025,R028,R031,R034,...,Q12106,Q12107,Q12108,Q121010,Q121011,Q121012,Q121013,Q121014,Q121015,Q12201
0,2013,37,1,99,<None>,0,0,0,0,0,...,,,,,,,,,,
1,2013,0,4,99,<None>,0,0,0,0,0,...,,,,,,,,,,
2,2013,16,1,99,<None>,0,0,0,0,0,...,,,,,,,,,,
3,2013,38,1,99,<None>,0,0,0,0,0,...,,,,,,,,,,
4,2013,32,2,0,<None>,0,12,0,1,0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251245,2019,32,4,0,<None>,0,11,0,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<None>
251246,2019,4,4,99,<None>,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<None>
251247,2019,42,4,99,<None>,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<None>
251248,2019,49,2,99,<None>,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<None>


In [6]:
print("Basic Data Information:")
print(df.info())


Basic Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251250 entries, 0 to 251249
Data columns (total 99 columns):
 #   Column   Non-Null Count   Dtype   
---  ------   --------------   -----   
 0   V0020    251250 non-null  object  
 1   C008     251250 non-null  int64   
 2   C009     251250 non-null  category
 3   R010     251250 non-null  int64   
 4   R011     251250 non-null  category
 5   R012     251250 non-null  int64   
 6   R025     251250 non-null  int64   
 7   R028     251250 non-null  int64   
 8   R031     251250 non-null  int64   
 9   R034     251250 non-null  int64   
 10  R037     251250 non-null  category
 11  W00101   251250 non-null  float64 
 12  W00102   251250 non-null  float64 
 13  W00103   251250 non-null  float64 
 14  W00201   251250 non-null  float64 
 15  W00202   251250 non-null  float64 
 16  W00203   251250 non-null  float64 
 17  P013     251250 non-null  int64   
 18  P015     251250 non-null  int64   
 19  P027     251250 non-

## Check for missing values


In [7]:
print("Missing Values Analysis:")
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100
missing_data = pd.DataFrame({
    'Missing Values': missing_values,
    'Percentage': missing_percentage,
    'Description': [get_column_description(col) for col in df.columns]
})
# Show only columns with missing values, sorted by percentage
print(missing_data[missing_data['Missing Values'] > 0].sort_values('Percentage', ascending=False))


Missing Values Analysis:
         Missing Values  Percentage  \
R00101           106310   42.312438   
S065             106310   42.312438   
Y002             106310   42.312438   
H001             106310   42.312438   
H003             106310   42.312438   
W001             106310   42.312438   
P00102           106310   42.312438   
P00103           106310   42.312438   
P00104           106310   42.312438   
P00402           106310   42.312438   
P00403           106310   42.312438   
P00404           106310   42.312438   
P00901           106310   42.312438   
P01101           106310   42.312438   
J00101           106310   42.312438   
J01101           106310   42.312438   
J01502           106310   42.312438   
J02702           106310   42.312438   
M01601           106310   42.312438   
N00101           106310   42.312438   
Q00201           106310   42.312438   
Q03001           106310   42.312438   
Q06306           106310   42.312438   
Q09301           106310   42.312438   


## Target variable analysis


In [8]:
# R011 is the endometriosis indicator according to the schema
has_endometriosis = df["R011"] == "3"

print("Endometriosis Distribution:")
endometriosis_counts = len(df[has_endometriosis])
print(f"Total cases: {endometriosis_counts}")
print(f"Prevalence Rate: {(endometriosis_counts / len(df)) * 100:.2f}%")

# Print the description of R011 from schema
print(f"Variable Description: {get_column_description('R011')}")


Endometriosis Distribution:
Total cases: 294
Prevalence Rate: 0.12%
Variable Description: Segundo o médico, qual o motivo da retirada do útero? 


## Comprehensive Feature Analysis


In [9]:
# Get all available features
all_features = df.columns.tolist()
print(f"Total available features: {len(all_features)}")

# Categorize features by type
numeric_features = df.select_dtypes(include=[np.int64, np.float64]).columns.tolist()
categorical_features = df.select_dtypes(include=['category']).columns.tolist()

print(f"\nNumeric features: {len(numeric_features)}")
print(f"Categorical features: {len(categorical_features)}")

# Show feature categories
print("\nNumeric features:")
for feat in numeric_features[:10]:  # Show first 10
    print(f"  {feat}: {get_column_description(feat)}")

print("\nCategorical features:")
for feat in categorical_features[:10]:  # Show first 10
    print(f"  {feat}: {get_column_description(feat)}")


Total available features: 99

Numeric features: 79
Categorical features: 19

Numeric features:
  C008: Idade do morador na data de referência
  R010: A sra já foi submetida a cirurgia para retirada do útero? 
  R012: Que idade a sra tinha quando foi submetida à cirurgia? 
  R025: Com que idade a sra ficou menstruada pela primeira vez?  
  R028: A sra já entrou na menopausa? 
  R031: Nos últimos 12 meses, a sra teve relações sexuais? 
  R034: A sra usa algum método para evitar a gravidez atualmente?
  W00101: Peso - 1ª pesagem (em kg)
  W00102: Peso - 2ª pesagem (em kg)
  W00103: Peso - Final (em kg)

Categorical features:
  C009: Cor ou raça
  R011: Segundo o médico, qual o motivo da retirada do útero? 
  R037: A sra e/ou seu companheiro já fizeram ou fazem algum tratamento para engravidar? 
  P052: E no passado, o(a) Sr(a) fumou algum produto do tabaco?
  J001: De um modo geral, como é o estado de saúde de________
  J039: Qual foi o principal atendimento de saúde que ___recebeu quando

## Feature Categories Analysis


In [10]:
# Analyze features by category (based on column prefixes)
feature_categories = {}

for col in df.columns:
    # Extract prefix (first few characters)
    prefix = col[:3] if len(col) >= 3 else col
    
    if prefix not in feature_categories:
        feature_categories[prefix] = []
    feature_categories[prefix].append(col)

print("Feature categories by prefix:")
for category, features in sorted(feature_categories.items()):
    print(f"\n{category}: {len(features)} features")
    for feat in features[:5]:  # Show first 5 features per category
        print(f"  {feat}: {get_column_description(feat)}")
    if len(features) > 5:
        print(f"  ... and {len(features) - 5} more")


Feature categories by prefix:

C00: 2 features
  C008: Idade do morador na data de referência
  C009: Cor ou raça

H00: 2 features
  H001: Quando foi a última vez que o(a) sr(a) consultou com um(a) médico(a)?
  H003: Por qual motivo o(a) sr(a) precisou consultar com um(a) médico(a)

J00: 3 features
  J001: De um modo geral, como é o estado de saúde de________
  J007: Algum médico já deu o diagnóstico de alguma doença crônica, física ou mental, ou doença de longa duração (de mais de 6 meses de duração) a ___
  J00101: Considerando saúde como um estado de bem-estar físico e mental, e não somente a ausência de doenças, como é o estado de saúde de _____________? 

J01: 3 features
  J014: Nas duas últimas semanas, ___ procurou algum lugar, serviço ou profissional de saúde para atendimento relacionado à própria saúde
  J01101: Quando ____ consultou um médico pela última vez
  J01502: Qual foi o motivo principal pelo qual ___ procurou atendimento relacionado à própria saúde nas duas últimas s

## Demographic Features Analysis


In [11]:
# Focus on demographic features (C series - household characteristics)
demographic_features = [col for col in df.columns if col.startswith('C')]
print(f"Demographic features (C series): {len(demographic_features)}")

print("\nDemographic features with descriptions:")
for feat in demographic_features:
    print(f"{feat}: {get_column_description(feat)}")
    print(f"  Unique values: {df[feat].nunique()}")
    print(f"  Missing values: {df[feat].isnull().sum()}")
    print()


Demographic features (C series): 2

Demographic features with descriptions:
C008: Idade do morador na data de referência
  Unique values: 110
  Missing values: 0

C009: Cor ou raça
  Unique values: 6
  Missing values: 0



## Health Status Features Analysis


In [12]:
# Focus on health status features (Q series - chronic conditions)
health_features = [col for col in df.columns if col.startswith('Q')]
print(f"Health status features (Q series): {len(health_features)}")

print("\nHealth features with descriptions:")
for feat in health_features:
    print(f"{feat}: {get_column_description(feat)}")
    print(f"  Unique values: {df[feat].nunique()}")
    print(f"  Missing values: {df[feat].isnull().sum()}")
    print()


Health status features (Q series): 39

Health features with descriptions:
Q003: Que idade o(a) Sr(a) tinha no primeiro diagnóstico de hipertensão arterial (pressão alta)? (0 =  Menos de 1 ano)
  Unique values: 96
  Missing values: 0

Q031: Que idade o(a) Sr(a) tinha no primeiro diagnóstico de diabetes? (0 =  Menos de 1 ano) (Branco = Não aplicável)
  Unique values: 94
  Missing values: 0

Q060: Algum médico já lhe deu o diagnóstico de colesterol alto? 
  Unique values: 2
  Missing values: 0

Q061: Que idade o(a) sr(a) tinha no primeiro diagnóstico de colesterol alto? (0 =  Menos de 1 ano)
  Unique values: 97
  Missing values: 0

Q064: Que idade o Sr(a) tinha no primeiro diagnóstico da doença do coração? (0 =  Menos de 1 ano)
  Unique values: 96
  Missing values: 0

Q068: Algum médico já lhe deu o diagnóstico de AVC (Acidente Vascular Cerebral) ou derrame?
  Unique values: 2
  Missing values: 0

Q070: Que idade o(a) Sr(a) tinha no primeiro diagnóstico do derrame (ou AVC)? (0 =  Menos de

## Reproductive Health Features Analysis


In [13]:
# Focus on reproductive health features (R series)
reproductive_features = [col for col in df.columns if col.startswith('R')]
print(f"Reproductive health features (R series): {len(reproductive_features)}")

print("\nReproductive health features with descriptions:")
for feat in reproductive_features:
    print(f"{feat}: {get_column_description(feat)}")
    print(f"  Unique values: {df[feat].nunique()}")
    print(f"  Missing values: {df[feat].isnull().sum()}")
    print()


Reproductive health features (R series): 9

Reproductive health features with descriptions:
R010: A sra já foi submetida a cirurgia para retirada do útero? 
  Unique values: 3
  Missing values: 0

R011: Segundo o médico, qual o motivo da retirada do útero? 
  Unique values: 8
  Missing values: 0

R012: Que idade a sra tinha quando foi submetida à cirurgia? 
  Unique values: 76
  Missing values: 0

R025: Com que idade a sra ficou menstruada pela primeira vez?  
  Unique values: 18
  Missing values: 0

R028: A sra já entrou na menopausa? 
  Unique values: 2
  Missing values: 0

R031: Nos últimos 12 meses, a sra teve relações sexuais? 
  Unique values: 2
  Missing values: 0

R034: A sra usa algum método para evitar a gravidez atualmente?
  Unique values: 2
  Missing values: 0

R037: A sra e/ou seu companheiro já fizeram ou fazem algum tratamento para engravidar? 
  Unique values: 4
  Missing values: 0

R00101: Quando foi a última vez que a Sra fez exame preventivo para câncer de colo do ú

## Feature Correlation Analysis


In [14]:
# Create target variable for correlation analysis
df['endometriosis_target'] = (df['R011'] == '3').astype(int)

# Select numeric features for correlation analysis
numeric_df = df.select_dtypes(include=[np.number])
print(f"Numeric features for correlation analysis: {len(numeric_df.columns)}")

# Calculate correlations with target
correlations_with_target = numeric_df.corr()['endometriosis_target'].sort_values(ascending=False)

print("\nTop correlations with endometriosis:")
print(correlations_with_target.head(10))

print("\nBottom correlations with endometriosis:")
print(correlations_with_target.tail(10))


Numeric features for correlation analysis: 80

Top correlations with endometriosis:
endometriosis_target    1.000000
R012                    0.192034
P00901                  0.057272
P00103                  0.056348
P00104                  0.055011
P00403                  0.053913
P00402                  0.053647
P00102                  0.052380
P00404                  0.051285
S065                    0.049272
Name: endometriosis_target, dtype: float64

Bottom correlations with endometriosis:
P05421    -0.000354
Q121011   -0.000365
Q121010   -0.000541
P05411    -0.000616
Q11701    -0.002082
Q11604    -0.002526
R034      -0.010974
R028      -0.011395
R031      -0.014879
R010      -0.051259
Name: endometriosis_target, dtype: float64


## Feature Importance Analysis


In [15]:
# Analyze feature importance using chi-square test for categorical features
from scipy.stats import chi2_contingency

categorical_features_analysis = []

for col in categorical_features:
    if col != 'R011':  # Skip target variable
        try:
            # Create contingency table
            contingency_table = pd.crosstab(df[col], df['endometriosis_target'])
            
            # Perform chi-square test
            chi2, p_value, dof, expected = chi2_contingency(contingency_table)
            
            categorical_features_analysis.append({
                'feature': col,
                'description': get_column_description(col),
                'chi2_statistic': chi2,
                'p_value': p_value,
                'unique_values': df[col].nunique(),
                'missing_values': df[col].isnull().sum()
            })
        except:
            continue

# Sort by chi-square statistic
categorical_importance = pd.DataFrame(categorical_features_analysis)
categorical_importance = categorical_importance.sort_values('chi2_statistic', ascending=False)

print("Top categorical features by chi-square statistic:")
print(categorical_importance.head(20))


Top categorical features by chi-square statistic:
   feature                                        description  chi2_statistic  \
17  Q12201  Que idade o(a) Sr(a) tinha no primeiro diagnós...      682.860725   
5     N001       Em geral, como o(a) Sr(a) avalia a sua saúde      626.280159   
2     P052  E no passado, o(a) Sr(a) fumou algum produto d...      565.895527   
8     H001  Quando foi a última vez que o(a) sr(a) consult...      527.083853   
6   R00101  Quando foi a última vez que a Sra fez exame pr...      513.918011   
16  N00101  Considerando saúde como um estado de bem-estar...      396.908115   
7     Y002     Nos últimos doze meses teve relações sexuais?       393.495203   
15  M01601  Nos últimos doze meses, com que frequência o(a...      390.683496   
9     H003  Por qual motivo o(a) sr(a) precisou consultar ...      371.870033   
4     J039  Qual foi o principal atendimento de saúde que ...       95.793550   
13  J01502  Qual foi o motivo principal pelo qual ___ proc.

## Create Comprehensive Feature Set


In [16]:
# Create a comprehensive feature set
print("Creating comprehensive feature set...")

# Filter features based on criteria:
# 1. Not too many missing values (< 50%)
# 2. Not too many unique values for categorical (< 100)
# 3. Significant correlation or chi-square statistic

selected_features = []

# Add numeric features with good correlation
numeric_candidates = correlations_with_target[abs(correlations_with_target) > 0.01].index.tolist()
selected_features.extend(numeric_candidates)

# Add categorical features with significant chi-square
significant_categorical = categorical_importance[categorical_importance['p_value'] < 0.05]['feature'].tolist()
selected_features.extend(significant_categorical)

# Add important demographic and health features
important_features = ['C008', 'R012', 'R025'] + health_features + reproductive_features
selected_features.extend(important_features)

# Remove duplicates and target variable
selected_features = list(set(selected_features))
if 'endometriosis_target' in selected_features:
    selected_features.remove('endometriosis_target')

print(f"Selected {len(selected_features)} features for comprehensive analysis")
print("\nSelected features:")
for feat in selected_features:
    print(f"{feat}: {get_column_description(feat)}")


Creating comprehensive feature set...
Selected 88 features for comprehensive analysis

Selected features:
Q121011: Diagnóstico de outro câncer? Bexiga
R00101: Quando foi a última vez que a Sra fez exame preventivo para câncer de colo do útero
Q12108: Diagnóstico de outro câncer? Colo de útero (só para mulheres) 
J02702: Qual foi o principal atendimento de saúde que ___ recebeu?
R034: A sra usa algum método para evitar a gravidez atualmente?
P00402: O(A) Sr(a) sabe sua altura? (mesmo que seja valor aproximado) 
Q080: Que idade o(a) Sr(a) tinha no primeiro diagnóstico de artrite ou reumatismo? (0 =  Menos de 1 ano) (Branco = Não aplicável)
Q074: Algum médico já lhe deu o diagnóstico de asma (ou bronquite asmática)? 
Q084: O(a) Sr(a) tem algum problema crônico de coluna, como dor crônica nas costas ou no pescoço, lombalgia, dor ciática, problemas nas vértebras ou disco? 
R010: A sra já foi submetida a cirurgia para retirada do útero? 
Q12201: Que idade o(a) Sr(a) tinha no primeiro diagnós

## Feature Preprocessing


In [26]:
# Create processed dataset with selected features
df_comprehensive = df[selected_features + ['R011']].copy()

print(f"Comprehensive dataset shape: {df_comprehensive.shape}")
print(f"Features: {len(selected_features)}")
print(f"Target variable: R011")

# Handle missing values
print("\nHandling missing values...")
missing_before = df_comprehensive.isnull().sum().sum()
print(f"Missing values before: {missing_before}")

# For numeric features, fill with median
numeric_cols = df_comprehensive.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if col != 'R011':
        df_comprehensive[col] = df_comprehensive[col].fillna(df_comprehensive[col].median())

# For categorical features, fill with mode
categorical_cols = df_comprehensive.select_dtypes(include=['category']).columns
for col in categorical_cols:
    if col != 'R011':
        df_comprehensive[col] = df_comprehensive[col].fillna(df_comprehensive[col].mode()[0] if len(df_comprehensive[col].mode()) > 0 else 'Unknown')

missing_after = df_comprehensive.isnull().sum().sum()
print(f"Missing values after: {missing_after}")


Comprehensive dataset shape: (251250, 89)
Features: 88
Target variable: R011

Handling missing values...
Missing values before: 4252400
Missing values after: 0


## Feature Encoding


In [27]:
# Check for duplicate columns and drop one if R011 is duplicated
if 'R011' in df_comprehensive.columns:
    # Get column names and check for duplicates
    col_names = df_comprehensive.columns.tolist()
    if col_names.count('R011') > 1:
        # Drop the duplicate R011 column (keep only the first one)
        df_comprehensive = df_comprehensive.loc[:, ~df_comprehensive.columns.duplicated()]
        print("Dropped duplicate R011 column")

Dropped duplicate R011 column


In [28]:
display(df_comprehensive)

Unnamed: 0,Q121011,R00101,Q12108,J02702,R034,P00402,Q080,Q074,Q084,R010,...,P00901,C008,Q061,R031,Q11604,R011,P013,Q120,P00102,Q068
0,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,37,0,0,0.0,<None>,0,0,0.0,0
1,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,0,0,0,0.0,<None>,0,0,0.0,0
2,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,16,0,0,0.0,<None>,0,0,0.0,0
3,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,38,0,0,0.0,<None>,0,0,0.0,0
4,0.0,<None>,0.0,<None>,0,0.0,0,0,0,0,...,0.0,32,0,1,0.0,<None>,2,0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
251245,0.0,4,0.0,<None>,0,1.0,0,0,0,0,...,2.0,32,0,1,0.0,<None>,0,0,1.0,0
251246,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,4,0,0,0.0,<None>,0,0,0.0,0
251247,0.0,<None>,0.0,01,0,0.0,0,0,0,99,...,0.0,42,0,0,0.0,<None>,0,0,0.0,0
251248,0.0,<None>,0.0,<None>,0,0.0,0,0,0,99,...,0.0,49,0,0,0.0,<None>,0,0,0.0,0


In [30]:
# Encode categorical features
print("Encoding categorical features...")

# Create dummy variables for categorical features
categorical_features_to_encode = [col for col in selected_features if col in categorical_cols]
print(f"Categorical features to encode: {len(categorical_features_to_encode)}")

# Apply one-hot encoding
df_encoded = pd.get_dummies(df_comprehensive, columns=categorical_features_to_encode, drop_first=True).copy()

# Create binary target variable
# First, drop the original R011 column to avoid duplicate labels
# df_encoded = df_encoded.drop('R011', axis=1)
df_encoded['endometriosis_target'] = df_comprehensive['R011'].apply(lambda x: 1 if str(x) == '3' else 0)

print(f"Final dataset shape: {df_encoded.shape}")
print(f"Features after encoding: {len(df_encoded.columns) - 2}")  # -2 for R011 and target


Encoding categorical features...
Categorical features to encode: 19
Final dataset shape: (251250, 261)
Features after encoding: 259


## Save Comprehensive Feature Set


In [31]:
# Save the comprehensive feature set
df_encoded.to_parquet("../data/processed/PNS_comprehensive_features.parquet", index=False)

print("Comprehensive feature set saved successfully!")
print(f"\nDataset summary:")
print(f"- Total records: {len(df_encoded)}")
print(f"- Total features: {len(df_encoded.columns) - 2}")
print(f"- Endometriosis cases: {df_encoded['endometriosis_target'].sum()}")
print(f"- Prevalence rate: {(df_encoded['endometriosis_target'].sum() / len(df_encoded)) * 100:.2f}%")


Comprehensive feature set saved successfully!

Dataset summary:
- Total records: 251250
- Total features: 259
- Endometriosis cases: 294
- Prevalence rate: 0.12%


## Feature Summary Report


In [32]:
# Create a comprehensive feature summary
feature_summary = []

for col in df_encoded.columns:
    if col not in ['R011', 'endometriosis_target']:
        feature_summary.append({
            'feature': col,
            'data_type': str(df_encoded[col].dtype),
            'unique_values': df_encoded[col].nunique(),
            'mean': df_encoded[col].mean() if df_encoded[col].dtype in ['int64', 'float64'] else None,
            'std': df_encoded[col].std() if df_encoded[col].dtype in ['int64', 'float64'] else None,
            'min': df_encoded[col].min() if df_encoded[col].dtype in ['int64', 'float64'] else None,
            'max': df_encoded[col].max() if df_encoded[col].dtype in ['int64', 'float64'] else None
        })

feature_summary_df = pd.DataFrame(feature_summary)
print("Feature Summary:")
print(feature_summary_df.head(20))

# Save feature summary
feature_summary_df.to_csv("../data/processed/comprehensive_feature_summary.csv", index=False)
print("\nFeature summary saved to comprehensive_feature_summary.csv")


Feature Summary:
    feature data_type  unique_values       mean        std  min    max
0   Q121011   float64              2   0.000060   0.007726  0.0    1.0
1    Q12108   float64              2   0.001134   0.033661  0.0    1.0
2      R034     int64              2   0.093218   0.290738  0.0    1.0
3    P00402   float64              2   0.156100   0.362950  0.0    1.0
4      Q080     int64             97   1.553393   8.912042  0.0   98.0
5      Q074     int64              2   0.018030   0.133060  0.0    1.0
6      Q084     int64              2   0.070901   0.256661  0.0    1.0
7      R010     int64              3  69.055853  45.443470  0.0   99.0
8    W00102   float64            953   9.449317  23.878227  0.0  178.5
9    Q00201   float64              2   0.057580   0.232948  0.0    1.0
10     P035     int64              8   0.379705   1.254640  0.0    7.0
11   W00103   float64            947   9.766620  24.193451  0.0  178.0
12     Q092     int64              2   0.038746   0.192990  