In [1]:
import pandas as pd

In [6]:
# Load the BRSET labels CSV into `df`; the path is relative to the notebook's
# working directory, so the dataset folder must sit under ./data.
df = pd.read_csv('data/a-brazilian-multilabel-ophthalmological-dataset-brset-1.0.1/labels_brset.csv')

In [11]:
def analyze_dataframe(df, return_string=True):
    """
    Summarize every column of a DataFrame: dtype, non-null count, number of
    distinct values, and null count.

    Parameters:
    df (pandas.DataFrame): Input DataFrame to analyze
    return_string (bool): If True (default), return the summary rendered as
        plain text via ``DataFrame.to_string(index=False)``; if False, return
        the summary DataFrame itself.

    Returns:
    str or pandas.DataFrame: One row per input column with the fields
        'Column', 'Dtype', 'Total Values' (non-null count), 'Unique Values'
        (distinct non-null values), 'Null Values', and 'Unique %'
        (Unique Values / Total Values * 100, rounded to 2 decimals).
    """
    # Column-wise pandas reductions replace a per-column Python loop.
    # .to_numpy() drops the column-name index so the rows line up by position
    # with the 'Column' list instead of being re-aligned by label.
    summary_df = pd.DataFrame({
        'Column': list(df.columns),
        'Dtype': [str(dtype) for dtype in df.dtypes],
        'Total Values': df.count().to_numpy(),
        'Unique Values': df.nunique().to_numpy(),
        'Null Values': df.isnull().sum().to_numpy(),
    })

    # Share of the non-null entries that are distinct.
    summary_df['Unique %'] = (
        summary_df['Unique Values'] / summary_df['Total Values'] * 100
    ).round(2)

    if return_string:
        # 'Column' is already a regular column, so no index juggling is needed.
        return summary_df.to_string(index=False)
    return summary_df

# Example usage: with the default return_string=True the summary comes back as
# preformatted text, so it is printed rather than displayed as a DataFrame.
summary = analyze_dataframe(df)
print(summary)

                  Column   Dtype  Total Values  Unique Values  Null Values  Unique %
                image_id  object         16266          16266            0    100.00
              patient_id   int64         16266           8524            0     52.40
                  camera  object         16266              2            0      0.01
             patient_age float64         10820             92         5446      0.85
           comorbidities  object          8029            213         8237      2.65
         diabetes_time_y  object          1910             55        14356      2.88
                insuline  object          1714              2        14552      0.12
             patient_sex   int64         16266              2            0      0.01
                exam_eye   int64         16266              2            0      0.01
                diabetes  object         16266              2            0      0.01
             nationality  object         16266              1    

In [13]:
# 16266 is the row count reported in the column summary; halving it is
# presumably a sanity check of how many patients there would be if each had
# exactly two images (the summary reports 8524 unique patient_id values) — TODO confirm intent.
16266/2

8133.0

In [12]:
# Group by patient_id and check for inconsistencies
def _find_patient_inconsistencies(df, columns):
    """Return {patient_id: {column: unique values}} for each column that varies within a patient."""
    inconsistencies = {}
    for patient_id, patient_data in df.groupby('patient_id'):
        for col in columns:
            unique_values = patient_data[col].unique()
            if len(unique_values) > 1:
                inconsistencies.setdefault(patient_id, {})[col] = unique_values
    return inconsistencies


def check_patient_consistency(df, columns=None):
    """
    Print every patient whose rows disagree on the value of a checked column.

    Rows are grouped by 'patient_id'; a column is reported for a patient when
    it holds more than one distinct value across that patient's rows. Missing
    values count as a value of their own (``Series.unique`` keeps NaN), so a
    patient with one missing and one present entry is reported.

    Parameters:
    df (pandas.DataFrame): Table containing a 'patient_id' column.
    columns (str or iterable of str, optional): Columns to check. Defaults to
        every column except 'patient_id' and 'image_id'. Pass an explicit
        list to restrict the check to columns expected to be constant per
        patient; columns that legitimately differ between a patient's rows
        (e.g. 'exam_eye') otherwise flag nearly every patient.

    Returns:
    None: The report is written to stdout.
    """
    if columns is None:
        # Default: everything except the grouping key and the per-row image id
        columns = [col for col in df.columns if col not in ['patient_id', 'image_id']]
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character
        columns = [columns]

    inconsistencies = _find_patient_inconsistencies(df, columns)

    # Print results
    if not inconsistencies:
        print("No inconsistencies found. All patients have consistent data across their records.")
        return

    print("Found inconsistencies for the following patients:\n")
    for patient_id, flagged in inconsistencies.items():
        print(f"Patient ID: {patient_id}")
        for col, values in flagged.items():
            print(f"  - {col}: {values}")
        print()

# Run the consistency check on the loaded labels; the report is printed to the
# cell output rather than returned.
check_patient_consistency(df)

Found inconsistencies for the following patients:

Patient ID: 1
  - exam_eye: [1 2]
  - optic_disc: ['1' '2']

Patient ID: 2
  - exam_eye: [1 2]

Patient ID: 3
  - exam_eye: [1 2]

Patient ID: 4
  - exam_eye: [1 2]

Patient ID: 5
  - exam_eye: [1 2]

Patient ID: 6
  - exam_eye: [1 2]

Patient ID: 7
  - exam_eye: [1 2]

Patient ID: 8
  - exam_eye: [1 2]

Patient ID: 9
  - exam_eye: [1 2]

Patient ID: 10
  - exam_eye: [1 2]
  - optic_disc: ['1' '2']
  - macula: [1 2]
  - scar: [0 1]
  - nevus: [0 1]
  - increased_cup_disc: [0 1]

Patient ID: 11
  - exam_eye: [1 2]

Patient ID: 13
  - exam_eye: [1 2]
  - macula: [2 1]
  - DR_SDRG: [3 1]
  - macular_edema: [1 0]

Patient ID: 14
  - exam_eye: [1 2]

Patient ID: 15
  - exam_eye: [1 2]
  - macular_edema: [0 1]

Patient ID: 16
  - exam_eye: [1 2]

Patient ID: 17
  - exam_eye: [1 2]
  - optic_disc: ['1' '2']
  - macula: [1 2]
  - macular_edema: [0 1]

Patient ID: 18
  - exam_eye: [1 2]

Patient ID: 19
  - exam_eye: [1 2]

Patient ID: 20
  - ex