In [2]:
import pandas as pd # Just medication+examination
from mlxtend.frequent_patterns import apriori, association_rules

def main():
    """Mine association rules between examination measurements and medication codes.

    Reads the medication and examination CSV files, bins the listed
    examination columns into ordinal labels, turns both tables into 0/1
    indicator columns, outer-merges them on ``SEQN`` and prints the frequent
    itemsets (support >= 0.4) and the association rules (lift >= 1.0).
    Intermediate frames are printed along the way; nothing is returned.
    """
    # File paths
    medication_file = "/kaggle/input/finalproject/Raw - Selected Dataset(medications).csv"
    examination_file = "/kaggle/input/finalproject/Raw - Selected Dataset(examination).csv"

    # Load the datasets
    medication_data = pd.read_csv(medication_file)
    examination_data = pd.read_csv(examination_file)

    print("Initial Data Loaded from Medication Dataset:")
    print(medication_data.head())
    print("\nInitial Data Loaded from Examination Dataset:")
    print(examination_data.head())

    # Keep only the identifier and the three medication-code columns
    medication_data = medication_data[['SEQN', 'RXDRSC1_1', 'RXDRSC1_2', 'RXDRSC1_3']]
    print("\nMedication Data After Selecting Relevant Columns:")
    print(medication_data.head())

    # Function to categorize continuous variables in examination data
    def categorize_column(df, column, bins, labels):
        # include_lowest=True keeps rows whose value equals the first bin edge;
        # without it pd.cut uses a half-open first interval and turns them into NaN.
        df[column] = pd.cut(df[column], bins=bins, labels=labels, include_lowest=True)
        return df

    # Define continuous columns for categorization based on provided ranges.
    # Values above the last edge still fall outside every bin and become NaN.
    continuous_columns = {
        'BPXSY1': [72, 100, 120, 140, 160],  # Blood Pressure Systolic 1
        'BPXDI1': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 1
        'BPXSY2': [0, 50, 100, 120, 140],  # Blood Pressure Systolic 2
        'BPXDI2': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 2
        'BPXSY3': [76, 100, 120, 140, 160],  # Blood Pressure Systolic 3
        'BPXDI3': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 3
        'BMXWT': [3.2, 50, 70, 90, 110],  # Weight
        'BMXHT': [78.3, 150, 160, 170, 180],  # Height
        'BMXBMI': [12.3, 18.5, 24.9, 29.9, 40],  # Body Mass Index
        'BMDBMIC': [1, 2, 3, 4],  # BMI Class (Children)
        'BMXLEG': [24.8, 30, 40, 50],  # Leg Length
        'BMXARML': [9.4, 20, 30, 40],  # Upper Arm Length
        'BMXARMC': [11.2, 20, 30, 40],  # Arm Circumference
        'BMXWAIST': [40, 60, 80, 100],  # Waist Circumference
        'BMXSAD1': [9.5, 10, 20, 30],  # Sagittal Abdominal Diameter 1
        'BMXSAD2': [9.7, 10, 20, 30]  # Sagittal Abdominal Diameter 2
    }

    all_labels = ['Low', 'Medium', 'High', 'Very High']
    for column, bins in continuous_columns.items():
        # n bin edges define n-1 intervals, so take that many labels
        labels = all_labels[:len(bins) - 1]
        examination_data = categorize_column(examination_data, column, bins, labels)

    print("\nExamination Data After Categorizing Continuous Variables:")
    print(examination_data.head())

    # Convert the examination data to binary. Columns are collected in a dict
    # and the frame is built once, instead of inserting them one at a time.
    examination_columns = {'SEQN': examination_data['SEQN']}  # Keep SEQN for merging
    for column in examination_data.columns:
        if column == 'SEQN':  # Don't modify the SEQN column
            continue
        for label in all_labels:
            new_column = f'{column}_{label.replace(" ", "")}'
            examination_columns[new_column] = (examination_data[column] == label).astype(int)
    examination_data_bin = pd.DataFrame(examination_columns)

    print("\nExamination Data After Binary Conversion:")
    print(examination_data_bin.head())

    # Convert medication data to binary
    medication_columns = {'SEQN': medication_data['SEQN']}  # Keep SEQN for merging
    for column in medication_data.columns[1:]:  # Skip SEQN
        for unique_value in medication_data[column].dropna().unique():
            code = str(unique_value)  # tolerate non-string codes
            new_column = f'{column}_{code.replace(" ", "").replace(".", "")}'
            # NOTE(review): this is a substring test, so a row holding "E11.2"
            # also sets the "E11" indicator - confirm that roll-up is intended.
            medication_columns[new_column] = medication_data[column].apply(
                lambda x, code=code: 1 if code in str(x) else 0
            )
    medication_data_bin = pd.DataFrame(medication_columns)

    print("\nMedication Data After Binary Conversion:")
    print(medication_data_bin.head())

    # Merge both datasets on 'SEQN'
    merged_data = pd.merge(examination_data_bin, medication_data_bin, on="SEQN", how="outer")
    print("\nMerged Data:")
    print(merged_data.head())

    # Handle NaN values by replacing them with 0 (False)
    merged_data = merged_data.fillna(0)

    # Run Apriori on the merged data. The outer merge + fillna leaves float
    # 0.0/1.0 columns; cast to bool, the input type mlxtend recommends.
    transactions = merged_data.drop(columns='SEQN').astype(bool)
    frequent_itemsets = apriori(transactions, min_support=0.4, use_colnames=True)
    print("\nFrequent Itemsets:")
    print(frequent_itemsets)

    # Generate and print association rules
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    print("\nGenerated Association Rules:")
    print(rules)

# Run the pipeline only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Initial Data Loaded from Medication Dataset:
    SEQN RXDRSC1_1 RXDRSC1_2 RXDRSC1_3  Unnamed: 4 Unnamed: 5  \
0  73557       E11       NaN       NaN         NaN        NaN   
1  73558       E11     E11.2     E78.0         NaN     Column   
2  73559       E11       E11     E78.0         NaN       SEQN   
3  73561       I10       I10       NaN         NaN    RXDRSC1   
4  73562       I10     E78.0       I10         NaN        NaN   

                Unnamed: 6   Unnamed: 7     Unnamed: 8  
0                      NaN          NaN            NaN  
1                  Remarks         Type        Remarks  
2               Individual   Identifier            NaN  
3  Medication-Disease code  Categorical  E11, E78, I10  
4                      NaN          NaN            NaN  

Initial Data Loaded from Examination Dataset:
    SEQN  BPXSY1  BPXDI1  BPXSY2  BPXDI2  BPXSY3  BPXDI3  BMXWT  BMXHT  \
0  73557   122.0    72.0   114.0    76.0   102.0    74.0   78.3  171.3   
1  73558   156.0    62.0   



In [12]:
import pandas as pd # Just questionnaire and medication
from mlxtend.frequent_patterns import apriori, association_rules

# Function to load datasets
def load_data(questionnaire_path, medication_path):
    """Read the questionnaire and medication CSV sources.

    Args:
        questionnaire_path: Path or file-like object passed to ``pd.read_csv``.
        medication_path: Path or file-like object passed to ``pd.read_csv``.

    Returns:
        A ``(questionnaire_data, medication_data)`` tuple of DataFrames.
    """
    frames = [pd.read_csv(source) for source in (questionnaire_path, medication_path)]
    return frames[0], frames[1]

# Function to categorize continuous variables
def categorize_column(df, column, bins, labels):
    """Replace ``df[column]`` with its binned, labelled version.

    The column is overwritten in place on ``df`` using ``pd.cut`` with
    ``include_lowest=True``; values outside the bin edges become NaN.

    Returns:
        The same ``df`` object that was passed in.
    """
    binned = pd.cut(x=df[column], bins=bins, labels=labels, include_lowest=True)
    df[column] = binned
    return df

# Function to preprocess questionnaire data
def preprocess_questionnaire_data(questionnaire_data, continuous_columns):
    """Bin every listed column that is present in the questionnaire frame.

    Args:
        questionnaire_data: DataFrame to categorize.
        continuous_columns: Mapping of column name to a list of bin edges.
            Each interval is labelled ``"<left>-<right>"`` from its edges.

    Returns:
        The frame returned by ``categorize_column`` after the last binned
        column (the input frame itself if no listed column is present).
    """
    for column, bins in continuous_columns.items():
        labels = [f'{left}-{right}' for left, right in zip(bins[:-1], bins[1:])]
        if column not in questionnaire_data.columns:
            continue  # columns absent from the frame are skipped
        questionnaire_data = categorize_column(questionnaire_data, column, bins, labels)
    return questionnaire_data

# Function to convert data to binary format
def convert_to_binary(df, id_column):
    """One-hot encode every non-id column of ``df`` into 0/1 indicator columns.

    For each non-null unique value of each column an integer column named
    ``"<column>_<value>"`` is produced (spaces removed from the value text,
    ``-`` replaced by ``_``). The id column is carried over unchanged as the
    first column.

    Returns:
        A new DataFrame: the id column followed by all indicator columns.
    """
    id_frame = pd.DataFrame({id_column: df[id_column]})

    indicators = []
    feature_columns = [name for name in df.columns if name != id_column]
    for feature in feature_columns:
        series = df[feature]
        for value in series.dropna().unique():
            suffix = str(value).replace(" ", "").replace("-", "_")
            indicators.append((series == value).astype(int).rename(f'{feature}_{suffix}'))

    # Single concat so the result is assembled in one go
    return pd.concat([id_frame] + indicators, axis=1)

# Function to run Apriori algorithm and generate rules
def run_apriori(data, min_support=0.6, min_lift=1.0):
    """Find frequent itemsets in ``data`` and derive lift-filtered rules.

    Args:
        data: One-hot encoded transaction DataFrame passed to ``apriori``.
        min_support: Minimum support passed to ``apriori``.
        min_lift: Minimum lift passed to ``association_rules``.

    Returns:
        A ``(frequent_itemsets, rules)`` tuple.
    """
    itemsets = apriori(data, min_support=min_support, use_colnames=True)
    return itemsets, association_rules(itemsets, metric="lift", min_threshold=min_lift)

def main():
    """Mine association rules between questionnaire answers and medication codes.

    Loads both CSV files, bins the listed questionnaire columns, one-hot
    encodes both tables, outer-merges them on ``SEQN`` and prints the frequent
    itemsets and association rules returned by ``run_apriori``.
    Intermediate frames are printed along the way; nothing is returned.
    """
    # File paths
    questionnaire_file = "/kaggle/input/finalproject/Raw - Selected Dataset(questionnarie).csv"
    medication_file = "/kaggle/input/finalproject/Raw - Selected Dataset(medications).csv"

    # Load the datasets
    questionnaire_data, medication_data = load_data(questionnaire_file, medication_file)
    print("Initial Questionnaire Data:")
    print(questionnaire_data.head())

    print("\nInitial Medication Data:")
    print(medication_data.head())

    # Keep only the identifier and the medication-code columns. The raw CSV
    # also carries 'Unnamed: N' annotation columns (remarks/type notes) that
    # would otherwise be one-hot encoded as if they were data.
    medication_data = medication_data[['SEQN', 'RXDRSC1_1', 'RXDRSC1_2', 'RXDRSC1_3']]

    # Define continuous columns for categorization (column -> bin edges)
    continuous_columns = {
        'ALQ120Q': [0, 30, 90, 180, 365],
        'ALQ130': [1, 5, 10, 15, 20, 25],
        'DID040': [1, 20, 40, 60, 80],
        'PAD675': [10, 150, 300, 600, 900],
        'PAD680': [0, 300, 600, 900, 1200],
        'SMD030': [0, 18, 30, 45, 60, 76]
    }

    # Preprocess questionnaire data
    questionnaire_data = preprocess_questionnaire_data(questionnaire_data, continuous_columns)
    print("\nQuestionnaire Data After Categorization:")
    print(questionnaire_data.head())

    # Convert datasets to binary
    questionnaire_data_bin = convert_to_binary(questionnaire_data, 'SEQN')
    medication_data_bin = convert_to_binary(medication_data, 'SEQN')

    print("\nBinary Questionnaire Data:")
    print(questionnaire_data_bin.head())

    print("\nBinary Medication Data:")
    print(medication_data_bin.head())

    # Merge the binary datasets; rows missing from one side get 0 indicators
    merged_data = pd.merge(questionnaire_data_bin, medication_data_bin, on="SEQN", how="outer").fillna(0)
    print("\nMerged Binary Data:")
    print(merged_data.head())

    # Run Apriori and print results. The outer merge + fillna leaves float
    # 0.0/1.0 columns; cast to bool, the input type mlxtend recommends.
    transactions = merged_data.drop(columns='SEQN').astype(bool)
    frequent_itemsets, rules = run_apriori(transactions)
    print("\nFrequent Itemsets:")
    print(frequent_itemsets)

    print("\nAssociation Rules:")
    print(rules)

# Run the pipeline only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()



Initial Questionnaire Data:
    SEQN  ALQ101  ALQ120Q  ALQ120U  ALQ130  BPD035  BPQ040A  DIQ010  DID040  \
0  73557     1.0      1.0      3.0     1.0    62.0      1.0     1.0    62.0   
1  73558     1.0      7.0      1.0     4.0    53.0      2.0     1.0    23.0   
2  73559     1.0      0.0      NaN     NaN    40.0      1.0     1.0    57.0   
3  73560     NaN      NaN      NaN     NaN     NaN      NaN     2.0     NaN   
4  73561     1.0      0.0      NaN     NaN    55.0      1.0     2.0     NaN   

   CBQ505  ...  HEQ010  IND235  MCQ010  MCQ160C  PFQ054  PFQ057  PAD675  \
0     1.0  ...     2.0     4.0     2.0      2.0     2.0     2.0     NaN   
1     1.0  ...     2.0     5.0     1.0      2.0     2.0     2.0     NaN   
2     2.0  ...     2.0    10.0     2.0      2.0     2.0     2.0   180.0   
3     NaN  ...     2.0     9.0     2.0      NaN     NaN     NaN     NaN   
4     1.0  ...     2.0    11.0     2.0      2.0     2.0     2.0     NaN   

   PAD680  SMD030  SMQ040  
0   600.0    17.0 



In [3]:
import pandas as pd # Medication and Demographic
from mlxtend.frequent_patterns import apriori, association_rules

def main():
    """Mine association rules between demographic attributes and medication codes.

    Reads the medication and demographic CSV files, bins the listed
    demographic columns into ordinal labels, turns both tables into 0/1
    indicator columns, outer-merges them on ``SEQN`` and prints the frequent
    itemsets (support >= 0.2) and the association rules (lift >= 1.0).
    Intermediate frames are printed along the way; nothing is returned.
    """
    # File paths for medication and demographic data
    medication_file = "/kaggle/input/finalproject/Raw - Selected Dataset(medications).csv"
    demographic_file = "/kaggle/input/finalproject/Raw - Selected Dataset(demographic).csv"

    # Load the datasets
    medication_data = pd.read_csv(medication_file)
    demographic_data = pd.read_csv(demographic_file)

    print("Initial Data Loaded from Medication Dataset:")
    print(medication_data.head())
    print("\nInitial Data Loaded from Demographic Dataset:")
    print(demographic_data.head())

    # Keep only the identifier and the three medication-code columns
    medication_data = medication_data[['SEQN', 'RXDRSC1_1', 'RXDRSC1_2', 'RXDRSC1_3']]
    print("\nMedication Data After Selecting Relevant Columns:")
    print(medication_data.head())

    # Function to categorize continuous variables in demographic data
    def categorize_column(df, column, bins, labels):
        # include_lowest=True keeps rows whose value equals the first bin edge
        # (e.g. age 0 or ratio 0); without it pd.cut turns them into NaN.
        df[column] = pd.cut(df[column], bins=bins, labels=labels, include_lowest=True)
        return df

    # Define continuous columns for categorization based on provided ranges.
    # Values above the last edge still fall outside every bin and become NaN.
    continuous_columns = {
        'RIDAGEYR': [0, 30, 50, 70, 80],  # Age categorization
        'INDFMPIR': [0, 1, 2, 3, 4],      # Poverty-income ratio
        # NOTE(review): NHANES documents WTINT2YR / WTMEC2YR as interview and
        # exam sample weights, not income - confirm these bins are intended.
        'WTINT2YR': [1000, 5000, 10000, 20000],
        'WTMEC2YR': [1000, 5000, 10000, 20000]
    }

    all_labels = ['Low', 'Medium', 'High', 'Very High']
    for column, bins in continuous_columns.items():
        # n bin edges define n-1 intervals, so take that many labels
        labels = all_labels[:len(bins) - 1]
        demographic_data = categorize_column(demographic_data, column, bins, labels)

    print("\nDemographic Data After Categorizing Continuous Variables:")
    print(demographic_data.head())

    # Convert the demographic data to binary. Columns are collected in a dict
    # and the frame is built once, instead of inserting them one at a time.
    demographic_columns = {'SEQN': demographic_data['SEQN']}  # Keep SEQN for merging
    for column in demographic_data.columns:
        if column == 'SEQN':  # Don't modify the SEQN column
            continue
        for label in all_labels:
            new_column = f'{column}_{label.replace(" ", "")}'
            demographic_columns[new_column] = (demographic_data[column] == label).astype(int)
    demographic_data_bin = pd.DataFrame(demographic_columns)

    print("\nDemographic Data After Binary Conversion:")
    print(demographic_data_bin.head())

    # Convert medication data to binary
    medication_columns = {'SEQN': medication_data['SEQN']}  # Keep SEQN for merging
    for column in medication_data.columns[1:]:  # Skip SEQN
        for unique_value in medication_data[column].dropna().unique():
            code = str(unique_value)  # tolerate non-string codes
            new_column = f'{column}_{code.replace(" ", "").replace(".", "")}'
            # NOTE(review): this is a substring test, so a row holding "E11.2"
            # also sets the "E11" indicator - confirm that roll-up is intended.
            medication_columns[new_column] = medication_data[column].apply(
                lambda x, code=code: 1 if code in str(x) else 0
            )
    medication_data_bin = pd.DataFrame(medication_columns)

    print("\nMedication Data After Binary Conversion:")
    print(medication_data_bin.head())

    # Merge both datasets on 'SEQN'
    merged_data = pd.merge(demographic_data_bin, medication_data_bin, on="SEQN", how="outer")
    print("\nMerged Data:")
    print(merged_data.head())

    # Handle NaN values by replacing them with 0 (False)
    merged_data = merged_data.fillna(0)

    # Run Apriori on the merged data. The outer merge + fillna leaves float
    # 0.0/1.0 columns; cast to bool, the input type mlxtend recommends.
    transactions = merged_data.drop(columns='SEQN').astype(bool)
    frequent_itemsets = apriori(transactions, min_support=0.2, use_colnames=True)
    print("\nFrequent Itemsets:")
    print(frequent_itemsets)

    # Generate and print association rules
    rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
    print("\nGenerated Association Rules:")
    print(rules)

# Run the pipeline only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()


Initial Data Loaded from Medication Dataset:
    SEQN RXDRSC1_1 RXDRSC1_2 RXDRSC1_3  Unnamed: 4 Unnamed: 5  \
0  73557       E11       NaN       NaN         NaN        NaN   
1  73558       E11     E11.2     E78.0         NaN     Column   
2  73559       E11       E11     E78.0         NaN       SEQN   
3  73561       I10       I10       NaN         NaN    RXDRSC1   
4  73562       I10     E78.0       I10         NaN        NaN   

                Unnamed: 6   Unnamed: 7     Unnamed: 8  
0                      NaN          NaN            NaN  
1                  Remarks         Type        Remarks  
2               Individual   Identifier            NaN  
3  Medication-Disease code  Categorical  E11, E78, I10  
4                      NaN          NaN            NaN  

Initial Data Loaded from Demographic Dataset:
      SEQN  RIAGENDR  RIDAGEYR  RIDRETH1  INDFMPIR  INDHHIN2  DMDEDUC2  \
0  73557.0       1.0      69.0       4.0      0.84       4.0       3.0   
1  73558.0       1.0      5



In [10]:
import pandas as pd # All combined
from mlxtend.frequent_patterns import apriori, association_rules

# Function to load datasets
def load_data(medication_path, demographic_path, examination_path, questionnaire_path):
    """Read the four CSV sources into DataFrames.

    Each argument is passed straight to ``pd.read_csv``.

    Returns:
        A ``(medication, demographic, examination, questionnaire)`` tuple of
        DataFrames, in the same order as the arguments.
    """
    sources = (medication_path, demographic_path, examination_path, questionnaire_path)
    return tuple(pd.read_csv(source) for source in sources)

# Function to categorize continuous variables
def categorize_column(df, column, bins, labels):
    """Overwrite ``df[column]`` with labelled bins and return ``df``.

    Binning uses ``pd.cut`` with ``include_lowest=True``, so a value equal to
    the first edge lands in the first bin. Values outside the edges become
    NaN. The frame is modified in place.
    """
    cut_kwargs = {"bins": bins, "labels": labels, "include_lowest": True}
    df[column] = pd.cut(df[column], **cut_kwargs)
    return df

# Function to preprocess demographic data (categorize continuous variables)
def preprocess_demographic_data(demographic_data, continuous_columns):
    """Bin every listed column that is present in the given frame.

    Args:
        demographic_data: DataFrame to categorize.
        continuous_columns: Mapping of column name to a list of bin edges.
            Each interval is labelled ``"<left>-<right>"`` from its edges.

    Returns:
        The frame returned by ``categorize_column`` after the last binned
        column (the input frame itself if no listed column is present).
    """
    for column, bins in continuous_columns.items():
        labels = [f'{left}-{right}' for left, right in zip(bins[:-1], bins[1:])]
        if column not in demographic_data.columns:
            continue  # columns absent from the frame are skipped
        demographic_data = categorize_column(demographic_data, column, bins, labels)
    return demographic_data

# Function to convert data to binary format
def convert_to_binary(df, id_column):
    """Expand each non-id column of ``df`` into 0/1 indicator columns.

    One integer column named ``"<column>_<value>"`` is created per non-null
    unique value (spaces stripped from the value text, ``-`` replaced by
    ``_``). The id column is kept as the first column of the result.

    Returns:
        A new DataFrame: the id column followed by all indicator columns.
    """
    def indicator_name(column, value):
        # Column label for the indicator of `value` in `column`
        return f'{column}_{str(value).replace(" ", "").replace("-", "_")}'

    indicators = [
        (df[column] == value).astype(int).rename(indicator_name(column, value))
        for column in df.columns
        if column != id_column
        for value in df[column].dropna().unique()
    ]

    id_frame = pd.DataFrame({id_column: df[id_column]})
    # Single concat so the result is assembled in one go
    return pd.concat([id_frame] + indicators, axis=1)

# Function to run Apriori algorithm and generate rules
def run_apriori(data, min_support=0.6, min_lift=1.0):
    """Run Apriori on ``data`` and build association rules filtered by lift.

    Args:
        data: One-hot encoded transaction DataFrame passed to ``apriori``.
        min_support: Minimum support passed to ``apriori``.
        min_lift: Minimum lift passed to ``association_rules``.

    Returns:
        A ``(frequent_itemsets, rules)`` tuple.
    """
    itemsets = apriori(data, use_colnames=True, min_support=min_support)
    lift_rules = association_rules(itemsets, metric="lift", min_threshold=min_lift)
    return itemsets, lift_rules

def main():
    """Mine association rules across all four datasets combined.

    Loads the medication, demographic, examination and questionnaire CSV
    files, bins every listed continuous column in whichever table holds it,
    one-hot encodes all four tables, outer-merges them on ``SEQN`` and prints
    the frequent itemsets and association rules returned by ``run_apriori``.
    Nothing is returned.
    """
    # File paths
    medication_file = "/kaggle/input/finalproject/Raw - Selected Dataset(medications).csv"
    demographic_file = "/kaggle/input/finalproject/Raw - Selected Dataset(demographic).csv"
    examination_file = "/kaggle/input/finalproject/Raw - Selected Dataset(examination).csv"
    questionnaire_file = "/kaggle/input/finalproject/Raw - Selected Dataset(questionnarie).csv"

    # Load the datasets
    medication_data, demographic_data, examination_data, questionnaire_data = load_data(
        medication_file, demographic_file, examination_file, questionnaire_file
    )

    # Print initial data loaded
    print("Initial Data Loaded from Medication Dataset:")
    print(medication_data.head())
    print("\nInitial Data Loaded from Demographic Dataset:")
    print(demographic_data.head())
    print("\nInitial Data Loaded from Examination Dataset:")
    print(examination_data.head())
    print("\nInitial Data Loaded from Questionnaire Dataset:")
    print(questionnaire_data.head())

    # Keep only the identifier and the medication-code columns. The raw CSV
    # also carries 'Unnamed: N' annotation columns (remarks/type notes) that
    # would otherwise be one-hot encoded as if they were data.
    medication_data = medication_data[['SEQN', 'RXDRSC1_1', 'RXDRSC1_2', 'RXDRSC1_3']]

    # Define continuous columns for categorization (column -> bin edges)
    continuous_columns = {
        'BPXSY1': [72, 100, 120, 140, 160],  # Blood Pressure Systolic 1
        'BPXDI1': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 1
        'BPXSY2': [0, 50, 100, 120, 140],  # Blood Pressure Systolic 2
        'BPXDI2': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 2
        'BPXSY3': [76, 100, 120, 140, 160],  # Blood Pressure Systolic 3
        'BPXDI3': [0, 40, 60, 80, 100],  # Blood Pressure Diastolic 3
        'BMXWT': [3.2, 50, 70, 90, 110],  # Weight
        'BMXHT': [78.3, 150, 160, 170, 180],  # Height
        'BMXBMI': [12.3, 18.5, 24.9, 29.9, 40],  # Body Mass Index
        'BMDBMIC': [1, 2, 3, 4],  # BMI Class (Children)
        'BMXLEG': [24.8, 30, 40, 50],  # Leg Length
        'BMXARML': [9.4, 20, 30, 40],  # Upper Arm Length
        'BMXARMC': [11.2, 20, 30, 40],  # Arm Circumference
        'BMXWAIST': [40, 60, 80, 100],  # Waist Circumference
        'BMXSAD1': [9.5, 10, 20, 30],  # Sagittal Abdominal Diameter 1
        'BMXSAD2': [9.7, 10, 20, 30],  # Sagittal Abdominal Diameter 2
        'ALQ120Q': [0, 30, 90, 180, 365],
        'ALQ130': [1, 5, 10, 15, 20, 25],
        'DID040': [1, 20, 40, 60, 80],
        'PAD675': [10, 150, 300, 600, 900],
        'PAD680': [0, 300, 600, 900, 1200],
        'SMD030': [0, 18, 30, 45, 60, 76],
        'RIDAGEYR': [0, 30, 50, 70, 80],       # Example: Age categorization
        'INDFMPIR': [0, 1, 2, 3, 4],           # Example: Poverty ratio categorization
        # NOTE(review): NHANES documents WTINT2YR / WTMEC2YR as interview and
        # exam sample weights, not income - confirm these bins are intended.
        'WTINT2YR': [1000, 5000, 10000, 20000],
        'WTMEC2YR': [1000, 5000, 10000, 20000]
    }

    # Categorize continuous variables in every table that holds them. The
    # helper skips columns a frame does not contain, so the full mapping can be
    # passed for each table. Previously only the demographic frame was binned,
    # leaving examination/questionnaire measurements to be one-hot encoded per
    # raw value even though bins are defined for them above.
    demographic_data = preprocess_demographic_data(demographic_data, continuous_columns)
    examination_data = preprocess_demographic_data(examination_data, continuous_columns)
    questionnaire_data = preprocess_demographic_data(questionnaire_data, continuous_columns)

    # Convert datasets to binary format
    demographic_data_bin = convert_to_binary(demographic_data, 'SEQN')
    medication_data_bin = convert_to_binary(medication_data, 'SEQN')
    examination_data_bin = convert_to_binary(examination_data, 'SEQN')
    questionnaire_data_bin = convert_to_binary(questionnaire_data, 'SEQN')

    # Merge the binary datasets
    merged_data = pd.merge(demographic_data_bin, medication_data_bin, on="SEQN", how="outer")
    merged_data = pd.merge(merged_data, examination_data_bin, on="SEQN", how="outer")
    merged_data = pd.merge(merged_data, questionnaire_data_bin, on="SEQN", how="outer")

    # Fill NaN values with 0
    merged_data = merged_data.fillna(0)

    # Run Apriori and print results. The outer merges + fillna leave float
    # 0.0/1.0 columns; cast to bool, the input type mlxtend recommends.
    transactions = merged_data.drop(columns='SEQN').astype(bool)
    frequent_itemsets, rules = run_apriori(transactions)
    print("\nFrequent Itemsets:")
    print(frequent_itemsets)

    print("\nAssociation Rules:")
    print(rules)

# Run the pipeline only when this code executes as the top-level module
# (__name__ == "__main__"), not when it is imported.
if __name__ == "__main__":
    main()



Initial Data Loaded from Medication Dataset:
    SEQN RXDRSC1_1 RXDRSC1_2 RXDRSC1_3  Unnamed: 4 Unnamed: 5  \
0  73557       E11       NaN       NaN         NaN        NaN   
1  73558       E11     E11.2     E78.0         NaN     Column   
2  73559       E11       E11     E78.0         NaN       SEQN   
3  73561       I10       I10       NaN         NaN    RXDRSC1   
4  73562       I10     E78.0       I10         NaN        NaN   

                Unnamed: 6   Unnamed: 7     Unnamed: 8  
0                      NaN          NaN            NaN  
1                  Remarks         Type        Remarks  
2               Individual   Identifier            NaN  
3  Medication-Disease code  Categorical  E11, E78, I10  
4                      NaN          NaN            NaN  

Initial Data Loaded from Demographic Dataset:
      SEQN  RIAGENDR  RIDAGEYR  RIDRETH1  INDFMPIR  INDHHIN2  DMDEDUC2  \
0  73557.0       1.0      69.0       4.0      0.84       4.0       3.0   
1  73558.0       1.0      5




Frequent Itemsets:
    support                              itemsets
0  0.848627                          (DIQ010_2.0)
1  0.813880                          (HEQ010_2.0)
2  0.789211                          (MCQ010_2.0)
3  0.726723              (DIQ010_2.0, HEQ010_2.0)
4  0.716932              (MCQ010_2.0, DIQ010_2.0)
5  0.677385              (MCQ010_2.0, HEQ010_2.0)
6  0.606546  (MCQ010_2.0, DIQ010_2.0, HEQ010_2.0)

Association Rules:
                 antecedents               consequents  antecedent support  \
0               (DIQ010_2.0)              (HEQ010_2.0)            0.848627   
1               (HEQ010_2.0)              (DIQ010_2.0)            0.813880   
2               (MCQ010_2.0)              (DIQ010_2.0)            0.789211   
3               (DIQ010_2.0)              (MCQ010_2.0)            0.848627   
4               (MCQ010_2.0)              (HEQ010_2.0)            0.789211   
5               (HEQ010_2.0)              (MCQ010_2.0)            0.813880   
6   (MCQ010_2.