## Convert the .txt file to .csv file

In [11]:
import csv

def process_input_file(input_file, output_file):
    # Open input file for reading
    with open(input_file, 'r') as f:
        lines = f.readlines()

    # Initialize variables
    current_drug = {}
    data = []

    # Process each line in the input file
    for line in lines:
        if line.startswith('TTDDRUID'):
            current_drug['TTDDRUID'] = line.split('\t')[1].strip()
        elif line.startswith('DRUGNAME'):
            current_drug['DRUGNAME'] = line.split('\t')[1].strip()
        elif line.startswith('INDICATI'):
            parts = line.split('\t')[1:]
            if len(parts) >= 3:  # Check if there are enough elements
                indication = parts[0].strip()
                icd = parts[1].strip()  # Directly take the second element after splitting and strip whitespace
                clinical_status = parts[2].strip()
                current_drug.setdefault('INDICATI', []).append(indication)
                current_drug.setdefault('ICD-11', []).append(icd)
                current_drug.setdefault('CLINICAL_STATUS', []).append(clinical_status)

            # Append to current_drug dictionary
            current_drug.setdefault('INDICATI', []).append(indication)
            current_drug.setdefault('ICD-11', []).append(icd)
            current_drug.setdefault('CLINICAL_STATUS', []).append(clinical_status)
        
        # If we encounter a blank line, it means we have finished processing a drug
        elif line == '\n':
            # Combine multiple indications into a single string separated by ';'
            current_drug['INDICATI'] = ';'.join(current_drug['INDICATI'])
            current_drug['ICD-11'] = ';'.join(current_drug['ICD-11'])
            current_drug['CLINICAL_STATUS'] = ';'.join(current_drug['CLINICAL_STATUS'])
            
            # Append current_drug to data list
            data.append(current_drug)
            
            # Reset current_drug dictionary
            current_drug = {}

    # Write data to CSV file
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['TTDDRUID', 'DRUGNAME', 'INDICATI', 'ICD-11', 'CLINICAL_STATUS']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write header
        writer.writeheader()
        
        # Write rows
        for drug in data:
            writer.writerow(drug)

# Example usage
input_file = 'P1-05-Drug_disease_filtered.txt'
output_file = 'P1-05-Drug_disease_filtered.csv'
process_input_file(input_file, output_file)

In [5]:
import pandas as pd
df_path = 'P1-05-Drug_disease_filtered.csv'
df = pd.read_csv(df_path, sep=',')

In [6]:
df

Unnamed: 0,TTDDRUID,DRUGNAME,INDICATI,ICD-11,CLINICAL_STATUS
0,DZB84T,Maralixibat,Pruritus;Pruritus;Progressive familial intrahe...,ICD-11: EC90;ICD-11: EC90;ICD-11: 5C58.03;ICD-...,Approved;Approved;Phase 3;Phase 3;Phase 2;Phase 2
1,DZA90G,BNT162b2,Coronavirus Disease 2019 (COVID-19);Coronaviru...,ICD-11: 1D6Y;ICD-11: 1D6Y,Approved;Approved
2,DZ8DF0,Nedosiran,Primary hyperoxaluria type 1;Primary hyperoxal...,ICD-11: 5C51.20;ICD-11: 5C51.20;ICD-11: 5C51.2...,Approved;Approved;Phase 2;Phase 2
3,DY49OT,Andexxa,Reversal of anticoagulation;Reversal of antico...,ICD-11: N.A.;ICD-11: N.A.,Approved;Approved
4,DY38JR,Astodrimer,Bacterial vaginosis;Bacterial vaginosis,ICD-11: MF3A;ICD-11: MF3A,Approved;Approved
...,...,...,...,...,...
23713,D00AJS,AIK11,Non-insulin dependent diabetes;Non-insulin dep...,ICD-11: 5A11;ICD-11: 5A11,Investigative;Investigative
23714,D00AIZ,SND-159,Mycobacterium infection;Mycobacterium infection,ICD-11: 1B10-1B21;ICD-11: 1B10-1B21,Investigative;Investigative
23715,D00AIS,AP-11014,Solid tumour/cancer;Solid tumour/cancer,ICD-11: 2A00-2F9Z;ICD-11: 2A00-2F9Z,Investigative;Investigative
23716,D00AHV,SKL-PD,Central nervous system disease;Central nervous...,ICD-11: 8A04-8D87;ICD-11: 8A04-8D87,Investigative;Investigative


### To create a separate file for all clinical_status

In [13]:
import csv
import os

def process_input_file(input_file):
    # Open input file for reading
    with open(input_file, 'r') as f:
        lines = f.readlines()

    # Initialize variables
    current_drug = {}
    data = {}

    # Process each line in the input file
    for line in lines:
        if line.startswith('TTDDRUID'):
            current_drug['TTDDRUID'] = line.split('\t')[1].strip()
        elif line.startswith('DRUGNAME'):
            current_drug['DRUGNAME'] = line.split('\t')[1].strip()
        elif line.startswith('INDICATI'):
            parts = line.split('\t')[1:]
            if len(parts) >= 3:  # Check if there are enough elements
                indication = parts[0].strip()
                icd = parts[1].strip()  # Directly take the second element after splitting and strip whitespace
                clinical_status = parts[2].strip()
                
                # Append to data dictionary
                if clinical_status not in data:
                    data[clinical_status] = []
                data[clinical_status].append(current_drug.copy())

        # If we encounter a blank line, it means we have finished processing a drug
        elif line == '\n':
            # Reset current_drug dictionary
            current_drug = {}

    # Write data to separate CSV files for each clinical status
    for status, drugs in data.items():
        output_filename = f"P1-05-Drug_disease_{status.replace(' ', '_').replace('/', '_')}.csv"
        output_dir = 'output'
        os.makedirs(output_dir, exist_ok=True)
        output_path = os.path.join(output_dir, output_filename)
        with open(output_path, 'w', newline='') as csvfile:
            fieldnames = ['TTDDRUID', 'DRUGNAME', 'INDICATI', 'ICD-11', 'CLINICAL_STATUS']
            writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
            
            # Write header
            writer.writeheader()
            
            # Write rows
            writer.writerows(drugs)

# Example usage
input_file = 'P1-05-Drug_disease_filtered.txt'
process_input_file(input_file)

In [14]:
df_path = 'P1-05-Drug_disease_filtered.csv'
df = pd.read_csv(df_path, sep=',')

In [15]:
df

Unnamed: 0,TTDDRUID,DRUGNAME,INDICATI,ICD-11,CLINICAL_STATUS
0,DZB84T,Maralixibat,Pruritus;Pruritus;Progressive familial intrahe...,ICD-11: EC90;ICD-11: EC90;ICD-11: 5C58.03;ICD-...,Approved;Approved;Phase 3;Phase 3;Phase 2;Phase 2
1,DZA90G,BNT162b2,Coronavirus Disease 2019 (COVID-19);Coronaviru...,ICD-11: 1D6Y;ICD-11: 1D6Y,Approved;Approved
2,DZ8DF0,Nedosiran,Primary hyperoxaluria type 1;Primary hyperoxal...,ICD-11: 5C51.20;ICD-11: 5C51.20;ICD-11: 5C51.2...,Approved;Approved;Phase 2;Phase 2
3,DY49OT,Andexxa,Reversal of anticoagulation;Reversal of antico...,ICD-11: N.A.;ICD-11: N.A.,Approved;Approved
4,DY38JR,Astodrimer,Bacterial vaginosis;Bacterial vaginosis,ICD-11: MF3A;ICD-11: MF3A,Approved;Approved
...,...,...,...,...,...
23713,D00AJS,AIK11,Non-insulin dependent diabetes;Non-insulin dep...,ICD-11: 5A11;ICD-11: 5A11,Investigative;Investigative
23714,D00AIZ,SND-159,Mycobacterium infection;Mycobacterium infection,ICD-11: 1B10-1B21;ICD-11: 1B10-1B21,Investigative;Investigative
23715,D00AIS,AP-11014,Solid tumour/cancer;Solid tumour/cancer,ICD-11: 2A00-2F9Z;ICD-11: 2A00-2F9Z,Investigative;Investigative
23716,D00AHV,SKL-PD,Central nervous system disease;Central nervous...,ICD-11: 8A04-8D87;ICD-11: 8A04-8D87,Investigative;Investigative


In [2]:
import csv

def process_input_file(input_file, output_file):
    # Open input file for reading
    with open(input_file, 'r') as f:
        lines = f.readlines()

    # Initialize variables
    current_drug = {}
    data = []

    # Process each line in the input file
    for line in lines:
        if line.startswith('TTDDRUID'):
            current_drug['TTDDRUID'] = line.split('\t')[1].strip()
        elif line.startswith('DRUGNAME'):
            current_drug['DRUGNAME'] = line.split('\t')[1].strip()
        elif line.startswith('INDICATI'):
            parts = line.split('\t')[1:]
            if len(parts) >= 3:  # Check if there are enough elements
                indication = parts[0].strip()
                if len(parts[1].split(':')) > 1:  # Check if there's a colon to split
                    icd = parts[1].split(':')[1].strip()  # Extract ICD code directly
                else:
                    icd = ""
                clinical_status = parts[2].strip()
                current_drug['INDICATI'] = indication
                current_drug['ICD-11'] = icd
                current_drug['CLINICAL_STATUS'] = clinical_status
                
                # Append current_drug to data list
                data.append(current_drug.copy())
                
                # Reset current_drug dictionary
                current_drug = {}

    # Write data to CSV file
    with open(output_file, 'w', newline='') as csvfile:
        fieldnames = ['TTDDRUID', 'DRUGNAME', 'INDICATI', 'ICD-11', 'CLINICAL_STATUS']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        
        # Write header
        writer.writeheader()
        
        # Write rows
        for drug in data:
            writer.writerow(drug)

# Example usage
input_file = 'P1-05-Drug_disease_filtered.txt'
output_file = 'P1-05-Drug_disease_filtered_new.csv'
process_input_file(input_file, output_file)

In [3]:
import pandas as pd
df_path = 'P1-05-Drug_disease_filtered_new.csv'
df = pd.read_csv(df_path, sep=',')

In [4]:
df

Unnamed: 0,TTDDRUID,DRUGNAME,INDICATI,ICD-11,CLINICAL_STATUS
0,DZB84T,Maralixibat,Pruritus,EC90,Approved
1,,,Progressive familial intrahepatic cholestasis,5C58.03,Phase 3
2,,,Alagille syndrome,LB20.0Y,Phase 2
3,DZA90G,BNT162b2,Coronavirus Disease 2019 (COVID-19),1D6Y,Approved
4,DZ8DF0,Nedosiran,Primary hyperoxaluria type 1,5C51.20,Approved
...,...,...,...,...,...
30309,D00AJS,AIK11,Non-insulin dependent diabetes,5A11,Investigative
30310,D00AIZ,SND-159,Mycobacterium infection,1B10-1B21,Investigative
30311,D00AIS,AP-11014,Solid tumour/cancer,2A00-2F9Z,Investigative
30312,D00AHV,SKL-PD,Central nervous system disease,8A04-8D87,Investigative
