<a href="https://colab.research.google.com/github/meghana-0211/clinical-data/blob/main/novaritis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import re
from typing import List, Dict, Any
import category_encoders as ce
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

class ClinicalTrialsPreprocessor:
    """
    Load, merge and preprocess clinical-trial tables that share an NCT ID.

    The main table is left-joined with facilities, withdrawals, eligibility
    and reported-events tables on a standardized ``Clean_NCT_Number`` key,
    after which missing values are imputed, object columns are target
    encoded against ``Enrollment`` and numeric columns are standardized.
    """

    # Name of the standardized join-key column added to every table.
    _NCT_KEY = 'Clean_NCT_Number'

    def __init__(self,
                 usecase_csv: str,
                 facilities_txt: str,
                 drop_withdrawals_txt: str,
                 eligibilities_txt: str,
                 reported_events_txt: str):
        """
        Advanced preprocessor for clinical trials data

        Args:
            usecase_csv (str): Path to main usecase CSV
            facilities_txt (str): Path to facilities text file
            drop_withdrawals_txt (str): Path to withdrawals text file
            eligibilities_txt (str): Path to eligibilities text file
            reported_events_txt (str): Path to reported events text file

        Raises:
            ValueError: If any of the files cannot be read.
        """
        # Flexible delimiter and column detection
        self.df_main = self._read_file(usecase_csv)
        self.df_facilities = self._read_file(facilities_txt)
        self.df_withdrawals = self._read_file(drop_withdrawals_txt)
        self.df_eligibility = self._read_file(eligibilities_txt)
        self.df_reported_events = self._read_file(reported_events_txt)

        # Preprocessing metadata
        self.preprocessing_log = {}
        self.feature_metadata = {}

    def _read_file(self, filepath: str, delimiters=('|', '\t', ',', ';')) -> pd.DataFrame:
        """
        Read a delimited text file, detecting which delimiter it uses.

        A wrong delimiter usually does not fail: pandas simply returns a
        single column holding each whole line. Each candidate is therefore
        probed on the first few rows and the first one that splits the header
        into more than one column is used for the full read. Only if no
        candidate produces several columns is the file loaded as a
        single-column table.

        Args:
            filepath (str): Path to the file
            delimiters: Candidate delimiters, tried in order of priority

        Returns:
            pd.DataFrame: Loaded dataframe

        Raises:
            ValueError: If the file cannot be read with any delimiter
                (missing, empty or unparsable file).
        """
        single_column_delimiter = None
        last_error = None

        for delimiter in delimiters:
            try:
                # Cheap probe: avoids parsing a huge file once per candidate.
                probe = pd.read_csv(filepath, sep=delimiter, nrows=5)
            except (OSError, ValueError) as exc:
                # ParserError, EmptyDataError and UnicodeDecodeError are all
                # ValueError subclasses; missing files raise OSError.
                last_error = exc
                continue

            if probe.shape[1] > 1:
                try:
                    return pd.read_csv(filepath, sep=delimiter, low_memory=False)
                except (OSError, ValueError) as exc:
                    last_error = exc
            elif single_column_delimiter is None:
                single_column_delimiter = delimiter

        if single_column_delimiter is not None:
            # Genuine one-column file (or unknown delimiter): best effort.
            try:
                return pd.read_csv(filepath, sep=single_column_delimiter, low_memory=False)
            except (OSError, ValueError) as exc:
                last_error = exc

        raise ValueError(
            f"Could not read file {filepath} with standard delimiters"
        ) from last_error

    def _find_nct_column(self, df: pd.DataFrame) -> str:
        """
        Find the NCT ID column in a dataframe

        Candidate names are checked in priority order, so an explicit
        ``nct_id`` column wins over a generic ``id`` column.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            str: Name of the NCT ID column

        Raises:
            ValueError: If none of the known column names is present.
        """
        possible_columns = ['nct_id', 'NCT_ID', 'nct number', 'NCT Number', 'NCT number', 'id', 'Unnamed: 1']
        for col in possible_columns:
            if col in df.columns:
                return col
        raise ValueError(f"No NCT ID column found. Available columns: {list(df.columns)}")

    def clean_nct_id(self, id_str: str) -> str:
        """
        Standardize an identifier to the ``NCT`` + 8-digit form.

        Args:
            id_str (str): Input ID (string or number)

        Returns:
            str: Cleaned and standardized NCT ID, or NaN when the input is
            missing or contains no digits at all.
        """
        if pd.isna(id_str):
            return np.nan

        # A numeric ID parsed as float (123.0) must not contribute the
        # trailing "0" of its decimal part to the digits.
        if isinstance(id_str, float) and id_str.is_integer():
            id_str = int(id_str)

        # Keep digits only
        numeric_part = re.sub(r'\D', '', str(id_str).strip())

        # Without digits there is no ID to standardize; returning NaN avoids
        # fabricating a shared "NCT00000000" key that would join junk rows.
        if not numeric_part:
            return np.nan

        # Pad to 8 digits if needed
        return f'NCT{numeric_part.zfill(8)}'

    def _preprocess_nct_ids(self):
        """
        Add a standardized ``Clean_NCT_Number`` column to every dataframe.

        The dataframes are modified in place. A dataframe without a
        recognizable NCT ID column is reported and left unchanged.
        """
        dataframes = [
            self.df_main,
            self.df_facilities,
            self.df_withdrawals,
            self.df_eligibility,
            self.df_reported_events
        ]

        for df in dataframes:
            try:
                nct_col = self._find_nct_column(df)
            except ValueError:
                print(f"Could not find NCT ID column in dataframe: {list(df.columns)}")
                continue
            df[self._NCT_KEY] = df[nct_col].apply(self.clean_nct_id)

    def extract_advanced_features(self) -> pd.DataFrame:
        """
        Extract advanced features from the eligibility ``criteria`` text

        Returns:
            pd.DataFrame: One row per eligibility row (same index) with the
            columns ``inclusion_count``, ``exclusion_count``,
            ``has_age_restriction`` and ``has_health_condition``. Rows with
            missing criteria hold NaN. An empty dataframe is returned when
            there is no ``criteria`` column.
        """
        # Compile once instead of once per row.
        inclusion_re = re.compile(r'\*\s*Inclusion', re.IGNORECASE)
        exclusion_re = re.compile(r'\*\s*Exclusion', re.IGNORECASE)
        age_re = re.compile(r'(age|years)', re.IGNORECASE)
        condition_re = re.compile(r'(condition|disease|syndrome)', re.IGNORECASE)

        def parse_criteria(criteria: str) -> Dict[str, Any]:
            """
            Parse inclusion/exclusion criteria

            Args:
                criteria (str): Criteria text

            Returns:
                Dict of feature values (empty for missing text)
            """
            if pd.isna(criteria):
                return {}

            text = str(criteria)
            return {
                'inclusion_count': len(inclusion_re.findall(text)),
                'exclusion_count': len(exclusion_re.findall(text)),
                'has_age_restriction': bool(age_re.search(text)),
                'has_health_condition': bool(condition_re.search(text))
            }

        try:
            criteria_column = self.df_eligibility['criteria']
        except KeyError:
            print("No 'criteria' column found in eligibility dataframe")
            return pd.DataFrame()

        # Building the frame from a list of dicts is far cheaper than
        # expanding every row through ``.apply(pd.Series)``.
        parsed_rows = [parse_criteria(value) for value in criteria_column]
        return pd.DataFrame(parsed_rows, index=criteria_column.index)

    def _attach_eligibility_features(self, merged_df: pd.DataFrame,
                                     features: pd.DataFrame) -> pd.DataFrame:
        """
        Join eligibility text features onto the merged table by NCT ID.

        ``features`` shares its index with ``self.df_eligibility``, not with
        ``merged_df``, so the two cannot be concatenated side by side; the
        trial key is attached to the features and used for a left join.

        Args:
            merged_df (pd.DataFrame): Merged trial table
            features (pd.DataFrame): Output of ``extract_advanced_features``

        Returns:
            pd.DataFrame: ``merged_df`` with the feature columns added, or
            ``merged_df`` unchanged when the key is unavailable.
        """
        key = self._NCT_KEY
        if key not in merged_df.columns or key not in self.df_eligibility.columns:
            print(f"Cannot align eligibility features: '{key}' column missing")
            return merged_df

        keyed = features.copy()
        keyed[key] = self.df_eligibility[key]
        # One feature row per trial keeps the left join from adding rows.
        keyed = keyed.dropna(subset=[key]).drop_duplicates(subset=key, keep='first')
        return merged_df.merge(keyed, on=key, how='left')

    def handle_missing_values(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Impute missing values: median for numeric columns, most frequent
        value for object columns.

        The dataframe is modified in place and also returned. Columns that
        contain no values at all are left untouched, because the imputer
        drops such columns from its output by default and the assignment
        back into the dataframe would fail on the shape mismatch.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            pd.DataFrame: Dataframe with handled missing values
        """
        # Log missing values before processing
        missing_before = df.isnull().sum()

        # Only columns with at least one observed value can be imputed.
        has_values = df.notna().any()
        numeric_cols = [
            col for col in df.select_dtypes(include=['int64', 'float64']).columns
            if has_values[col]
        ]
        categorical_cols = [
            col for col in df.select_dtypes(include=['object']).columns
            if has_values[col]
        ]

        # Impute missing values (skipping empty selections, which the
        # imputer rejects)
        if numeric_cols:
            numeric_imputer = SimpleImputer(strategy='median')
            df[numeric_cols] = numeric_imputer.fit_transform(df[numeric_cols])
        if categorical_cols:
            categorical_imputer = SimpleImputer(strategy='most_frequent')
            df[categorical_cols] = categorical_imputer.fit_transform(df[categorical_cols])

        # Log missing values after processing
        missing_after = df.isnull().sum()

        # Store imputation metadata
        self.preprocessing_log['missing_values'] = {
            'before': missing_before,
            'after': missing_after
        }

        return df

    def encode_categorical_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Target-encode object columns against the ``Enrollment`` column.

        Encoding is best effort: if the target column is missing or the
        encoder fails, the problem is printed and the input is returned
        unchanged.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            pd.DataFrame: Dataframe with encoded categorical features
        """
        # Identify categorical columns
        categorical_cols = list(df.select_dtypes(include=['object']).columns)
        if not categorical_cols:
            return df

        if 'Enrollment' not in df.columns:
            print("Error in target encoding: target column 'Enrollment' not found")
            return df

        # Target encoding for high cardinality features
        try:
            encoder = ce.TargetEncoder(cols=categorical_cols)
            encoded_df = encoder.fit_transform(df, df['Enrollment'])
        except Exception as e:
            # Deliberate best-effort boundary: keep the pipeline running.
            print(f"Error in target encoding: {e}")
            encoded_df = df

        return encoded_df

    def normalize_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Standardize numeric columns (zero mean, unit variance).

        The dataframe is modified in place and also returned.

        Args:
            df (pd.DataFrame): Input dataframe

        Returns:
            pd.DataFrame: Normalized dataframe
        """
        # Select numeric columns
        numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns

        # The scaler rejects empty input, so there is nothing to do.
        if len(numeric_cols) == 0 or df.empty:
            return df

        # Standard scaling
        scaler = StandardScaler()
        df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

        return df

    def merge_with_additional_data(self) -> pd.DataFrame:
        """
        Left-join the auxiliary tables onto the main table by NCT ID.

        Only the listed columns that actually exist in each auxiliary table
        are brought over. A table that lacks the key or cannot be merged is
        reported by name and skipped. The resulting shape is recorded in
        ``preprocessing_log['merged_shape']``.

        Returns:
            pd.DataFrame: Merged dataframe
        """
        # Preprocess NCT IDs
        self._preprocess_nct_ids()

        key = self._NCT_KEY
        merged_df = self.df_main.copy()

        # (label, table, columns to bring over, join type)
        merge_configs = [
            ('facilities', self.df_facilities,
             ['name', 'city', 'state', 'country'], 'left'),
            ('withdrawals', self.df_withdrawals,
             ['reason', 'count'], 'left'),
            ('eligibility', self.df_eligibility,
             ['gender', 'minimum_age', 'maximum_age'], 'left'),
            ('reported_events', self.df_reported_events,
             ['event_type', 'subjects_affected'], 'left')
        ]

        if key not in merged_df.columns:
            print(f"Skipping merges: '{key}' column missing from main dataframe")
            merge_configs = []

        # NOTE(review): a left join against a table holding several rows per
        # trial repeats the main row once per match - confirm this row
        # multiplication is intended.
        for source_name, txt_df, selected_cols, merge_type in merge_configs:
            if key not in txt_df.columns:
                print(f"Skipping merge with {source_name}: '{key}' column missing")
                continue

            available_cols = [col for col in selected_cols if col in txt_df.columns]
            if not available_cols:
                continue

            try:
                merged_df = merged_df.merge(
                    txt_df[[key] + available_cols],
                    on=key,
                    how=merge_type
                )
            except (KeyError, ValueError, TypeError) as e:
                # Report the table by name; formatting the dataframe itself
                # would dump its whole repr into the message.
                print(f"Error merging {source_name}: {e}")

        self.preprocessing_log['merged_shape'] = merged_df.shape
        return merged_df

    def preprocess(self) -> pd.DataFrame:
        """
        Comprehensive preprocessing pipeline

        Steps: merge sources, attach eligibility text features, impute
        missing values, target-encode object columns, standardize numerics.

        Returns:
            pd.DataFrame: Fully preprocessed dataframe
        """
        # Merge data sources
        merged_df = self.merge_with_additional_data()

        # Extract advanced features and align them by trial, not by position
        advanced_features = self.extract_advanced_features()
        if not advanced_features.empty:
            merged_df = self._attach_eligibility_features(merged_df, advanced_features)

        # Handle missing values
        merged_df = self.handle_missing_values(merged_df)

        # Encode categorical features
        merged_df = self.encode_categorical_features(merged_df)

        # Normalize features
        merged_df = self.normalize_features(merged_df)

        return merged_df

    def generate_data_report(self) -> Dict[str, Any]:
        """
        Generate comprehensive data preprocessing report

        ``merged_shape``, ``missing_values`` and the cleaned ID count are
        only available after the corresponding pipeline steps have run;
        before that they are ``None`` / ``{}``.

        Returns:
            Dict with preprocessing metadata
        """
        main_columns = self.df_main.columns

        report = {
            'initial_shape': self.df_main.shape,
            'merged_shape': self.preprocessing_log.get('merged_shape'),
            'missing_values': self.preprocessing_log.get('missing_values', {}),
            'unique_nct_ids': {
                'original': self.df_main['NCT Number'].nunique() if 'NCT Number' in main_columns else 'N/A',
                'cleaned': self.df_main[self._NCT_KEY].nunique() if self._NCT_KEY in main_columns else None
            },
            'feature_stats': {}
        }

        return report

# Main processing function
def process_clinical_trials(
    usecase_csv: str,
    facilities_txt: str,
    drop_withdrawals_txt: str,
    eligibilities_txt: str,
    reported_events_txt: str
) -> pd.DataFrame:
    """
    Run the full preprocessing pipeline and print its report.

    Builds a ClinicalTrialsPreprocessor from the five input files, runs
    ``preprocess()``, then prints every entry of ``generate_data_report()``
    as ``key: value`` under a "Preprocessing Report:" heading.

    Args:
        usecase_csv (str): Path to main CSV
        facilities_txt (str): Path to facilities text file
        drop_withdrawals_txt (str): Path to withdrawals text file
        eligibilities_txt (str): Path to eligibilities text file
        reported_events_txt (str): Path to reported events text file

    Returns:
        pd.DataFrame: Preprocessed dataframe
    """
    source_paths = (
        usecase_csv,
        facilities_txt,
        drop_withdrawals_txt,
        eligibilities_txt,
        reported_events_txt,
    )
    pipeline = ClinicalTrialsPreprocessor(*source_paths)

    # The report is built after preprocessing so it can include logged metadata.
    result = pipeline.preprocess()
    summary = pipeline.generate_data_report()

    print("Preprocessing Report:")
    for field, detail in summary.items():
        print(f"{field}: {detail}")

    return result



In [None]:
# Notebook cell: installs the category_encoders package that the first cell imports as `ce`.
pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category_encoders
Successfully installed category_encoders-2.8.0


In [None]:
from google.colab import drive

# Make the Drive folder holding the input files available under /content/drive.
drive.mount('/content/drive')

# Run the whole pipeline on the five source files stored in Drive.
processed_data = process_clinical_trials(
    usecase_csv='/content/drive/MyDrive/usecase_1_.csv',
    facilities_txt='/content/drive/MyDrive/facilities.txt',
    drop_withdrawals_txt='/content/drive/MyDrive/drop_withdrawals.txt',
    eligibilities_txt='/content/drive/MyDrive/eligibilities.txt',
    reported_events_txt='/content/drive/MyDrive/reported_events.txt',
)

# Persist the result next to the inputs, without the dataframe index.
processed_data.to_csv(
    '/content/drive/MyDrive/advanced_processed_clinical_trials.csv',
    index=False,
)

Mounted at /content/drive
Could not find NCT ID column in dataframe: [',Unnamed: 0,NCT Number,Study Title,Study URL,Acronym,Study Status,Brief Summary,Study Results,Conditions,Interventions,Primary Outcome Measures,Secondary Outcome Measures,Other Outcome Measures,Sponsor,Collaborators,Sex,Age,Phases,Enrollment,Funder Type,Study Type,Study Design,Other IDs,Start Date,Primary Completion Date,Completion Date,First Posted,Results First Posted,Last Update Posted,Locations,Study Documents']
Error merging                id       nct_id      status  \
0        39182239  NCT02696421  RECRUITING   
1        39182240  NCT01324414         NaN   
2        39182241  NCT02595814         NaN   
3        39182242  NCT02595814         NaN   
4        39182243  NCT02595814         NaN   
...           ...          ...         ...   
3085459  39182234  NCT05770505         NaN   
3085460  39182235  NCT00127790         NaN   
3085461  39182236  NCT02577705         NaN   
3085462  39182237  NCT01518205     