In [2]:
import re
import warnings
import pandas as pd
# from modules.transform_mod import process_raw_data

# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas/deprecation warnings are hidden as well.
warnings.filterwarnings("ignore")
# Lift pandas' display limits so frames are shown without row or column truncation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [3]:
def normalize_strings(df, transformations):
    """
    Normalize strings in the specified columns of a DataFrame.

    The rules are applied to a copy, so the frame passed in is left untouched
    and the calling cell can be re-run without compounding its own output.

    Parameters:
        df (pandas.DataFrame): DataFrame containing the columns to be normalized.
        transformations (dict): Maps a column name to an ordered list of
            (pattern, replacement) pairs. Each pattern is a regular expression;
            each replacement is either a replacement string or a callable that
            receives the match object. Rules are applied in list order, so
            later rules see the output of earlier ones.

    Returns:
        pandas.DataFrame: A new DataFrame with the listed columns normalized.
        Columns named in `transformations` but absent from `df` are ignored,
        as are columns that contain no values at all.
    """
    normalized = df.copy()

    for column, regex_list in transformations.items():
        if column not in normalized.columns:
            continue

        # A column without a single value has nothing to rewrite. Such a column
        # is typically loaded as float64, on which the .str accessor raises
        # AttributeError, so skip it instead of failing.
        if normalized[column].isna().all():
            continue

        for rule in regex_list:
            pattern, replacement = rule[0], rule[1]
            # regex=True is required both for pattern syntax and for callable replacements.
            normalized[column] = normalized[column].str.replace(pattern, replacement, regex=True)

    return normalized

#### Transforming column string values
The following dataframe columns will be transformed using regular expressions.

* Strip leading and trailing whitespace: (r'^\s+|\s+$', '')

* Replace any sequence of whitespace characters with a single space: (r'\s+', ' ')
* Remove spaces between numbers and 'mg': (r'(\d+(\.\d+)?)\s*mg', r'\1mg')
* Convert all letters to lowercase: (r'([a-zA-Z]+)', lambda x: x.group(0).lower())
* Remove whitespace around slashes: (r'\s*([/])\s*', r'\1')
* Remove whitespace around hyphens: (r'\s*-\s*', '-')
* Replace '&' with 'and': (r'&', 'and')
* Remove '\r\n': (r'\r\n', '')
* Replace '{' with '(': (r'\{', '(')
* Replace '}' with ')': (r'\}', ')')
* Replace any brace-delimited span '{...}' (including an empty '{}') with '()': (r'\{.*?\}', '()')

In [4]:
# Atomic (pattern, replacement) rules. Every rule set below is assembled from
# these, so each regular expression is written exactly once.
_LOWERCASE = (r'([a-zA-Z]+)', lambda x: x.group(0).lower())   # lowercase ASCII letter runs
_STRIP = (r'^\s+|\s+$', '')                                    # trim leading/trailing whitespace
_COLLAPSE_WS = (r'\s+', ' ')                                   # whitespace run -> single space
_JOIN_MG = (r'(\d+(\.\d+)?)\s*mg', r'\1mg')                    # '500 mg' -> '500mg'
_TIGHT_SLASH = (r'\s*([/])\s*', r'\1')                         # 'a / b' -> 'a/b'
_TIGHT_HYPHEN = (r'\s*-\s*', '-')                              # 'a - b' -> 'a-b'
_AMPERSAND = (r'&', 'and')
_DROP_CRLF = (r'\r\n', '')
_OPEN_BRACE = (r'\{', '(')
_CLOSE_BRACE = (r'\}', ')')
_BRACED_SPAN = (r'\{.*?\}', '()')

# Ordered rule sets. Rules run top to bottom, so the order is part of the behaviour.
_DRUG_NAME_RULES = (_STRIP, _COLLAPSE_WS, _JOIN_MG)

_PLAIN_TEXT_RULES = (_LOWERCASE, _STRIP, _COLLAPSE_WS)

_GENERIC_NAME_RULES = (
    _LOWERCASE, _STRIP, _COLLAPSE_WS, _JOIN_MG, _TIGHT_SLASH, _TIGHT_HYPHEN,
)

# NOTE(review): in the three rule sets below, _DROP_CRLF runs after _COLLAPSE_WS,
# which has already turned any '\r\n' into a space, and _BRACED_SPAN runs after
# both brace rules, which have already removed every '{' and '}'. Neither rule
# can match at its current position. They are kept in place so the rule lists
# stay exactly as authored; confirm the intended order before moving or removing them.
_BRACED_CLASS_RULES = (
    _LOWERCASE, _STRIP, _COLLAPSE_WS, _AMPERSAND, _DROP_CRLF,
    _OPEN_BRACE, _CLOSE_BRACE, _BRACED_SPAN, _TIGHT_HYPHEN, _TIGHT_SLASH,
)

_THERAPEUTIC_CLASS_RULES = (
    _LOWERCASE, _STRIP, _COLLAPSE_WS, _DROP_CRLF, _TIGHT_HYPHEN, _TIGHT_SLASH,
)

_SIDE_EFFECT_RULES = (
    _LOWERCASE, _STRIP, _COLLAPSE_WS, _TIGHT_SLASH, _TIGHT_HYPHEN,
    _AMPERSAND, _DROP_CRLF,
)

# Column -> rule list mappings, one per intermediate table. Every column gets
# its own list object so editing one column's rules never affects another.

# Drugs
transformations_1 = {
    'drug_name': list(_DRUG_NAME_RULES),
}

# Generic names
transformations_2 = {
    f'generic_name_{i}': list(_GENERIC_NAME_RULES) for i in range(1, 6)
}

# Use cases
transformations_3 = {
    f'use_case_{i}': list(_PLAIN_TEXT_RULES) for i in range(1, 3)
}

# Drug classes
transformations_4 = {
    'chemical_class': list(_BRACED_CLASS_RULES),
    'therapeutic_class': list(_THERAPEUTIC_CLASS_RULES),
    'action_class': list(_BRACED_CLASS_RULES),
}

# Drug state (use case + generic names + therapeutic class)
transformations_5 = {
    'use_case_1': list(_BRACED_CLASS_RULES),
    **{f'generic_name_{i}': list(_GENERIC_NAME_RULES) for i in range(1, 6)},
    'therapeutic_class': list(_THERAPEUTIC_CLASS_RULES),
}

# Side effects
transformations_6 = {
    f'side_effect_{i}': list(_SIDE_EFFECT_RULES) for i in range(1, 6)
}

#### Preprocess Source Data

In [None]:
# Inputs for the raw-data preprocessing step: source CSV, the file-name prefix,
# and the folder for intermediate files.
# NOTE(review): with the process_raw_data call below commented out, this cell
# only assigns these three variables.
file_path = '../data/raw/medicine_dataset.csv'
file_name = 'Med_data'
interm_folder = '../data/interm'
# process_raw_data(file_path, interm_folder, file_name)

#### Apply unique transformations to the preprocessed data

In [5]:
# Read the six intermediate CSV extracts into their frames.
# Targets and paths are paired by position: the n-th name receives the n-th file.
(
    Drugs,
    Generic_Names,
    Side_Effects,
    Use_Case,
    Drug_Class,
    Drug_State,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/interm/Med_data_1_drugs.csv',
        '../data/interm/Med_data_2_generic_names.csv',
        '../data/interm/Med_data_3_side_effects.csv',
        '../data/interm/Med_data_4_use_case.csv',
        '../data/interm/Med_data_5_drug_class.csv',
        '../data/interm/Med_data_6_drug_state.csv',
    )
)

#### Transform Drugs

In [6]:
# Apply the normalization function to the dataframe column
Drugs_normalized = normalize_strings(Drugs, transformations_1)

In [7]:
Drugs_normalized.head(45)

Unnamed: 0,drug_id,drug_name,habit_forming
0,1,augmentin 625 duo tablet,No
1,2,azithral 500 tablet,No
2,3,ascoril ls syrup,No
3,4,allegra 120mg tablet,No
4,5,avil 25 tablet,No
5,6,allegra-m tablet,No
6,7,amoxyclav 625 tablet,No
7,8,azee 500 tablet,No
8,9,atarax 25mg tablet,No
9,10,ascoril d plus syrup sugar free,No


#### Transform Generic Names

- One thing that could be done to further clean the Generic Names is to include 'mg' after any number that is not already followed by it.

- Another way would be to compare the string with the one in the adjacent column: if 'mg' is present in the adjacent column, then the string is transformed to include it as well.

In [8]:
Generic_Names_normalized = normalize_strings(Generic_Names, transformations_2)

In [9]:
# Keep only the rows that carry at least one generic name; a row whose five
# generic_name_* cells are all NaN (drug_id present, nothing else) is discarded.
Generic_Names_normalized = Generic_Names_normalized.loc[
    Generic_Names_normalized[
        [
            "generic_name_1",
            "generic_name_2",
            "generic_name_3",
            "generic_name_4",
            "generic_name_5",
        ]
    ]
    .notna()
    .any(axis=1)
]

# Renumber the surviving rows from 0.
Generic_Names_cleaned = Generic_Names_normalized.reset_index(drop=True)

In [10]:
Generic_Names_cleaned.head(40)

Unnamed: 0,drug_id,generic_name_1,generic_name_2,generic_name_3,generic_name_4,generic_name_5
0,1,penciclav 500mg/125mg tablet,moxikind-cv 625 tablet,moxiforce-cv 625 tablet,fightox 625 tablet,novamox cv 625mg tablet
1,2,zithrocare 500mg tablet,azax 500 tablet,zady 500 tablet,cazithro 500mg tablet,trulimax 500mg tablet
2,3,solvin ls syrup,ambrodil-lx syrup,zerotuss xp syrup,capex ls syrup,broxum ls syrup
3,4,lcfex tablet,etofex 120mg tablet,nexofex 120mg tablet,fexise 120mg tablet,histafree 120 tablet
4,5,eralet 25mg tablet,,,,
5,6,emlukast-fx tablet,lcfex-mont tablet,fixar 10mg/120mg tablet,histakind-m tablet,histafree-m tablet
6,7,penciclav 500mg/125mg tablet,moxikind-cv 625 tablet,moxiforce-cv 625 tablet,fightox 625 tablet,novamox cv 625mg tablet
7,8,zithrocare 500mg tablet,azax 500 tablet,zady 500 tablet,cazithro 500mg tablet,trulimax 500mg tablet
8,9,hd zine 25mg tablet,hyzox 25 tablet,hizet 25mg tablet,hydil 25mg tablet,zyzine 25mg tablet
9,10,arnikof d syrup,cofsolve-d syrup,tucin d syrup,akof-d syrup sugar free,krisbro d syrup


#### Transform Use Case

In [11]:
Use_Case_normalized = normalize_strings(Use_Case, transformations_3)

In [12]:
Use_Case_normalized.head(40)

Unnamed: 0,drug_id,use_case_1,use_case_2
0,1,treatment of bacterial infections,
1,2,treatment of bacterial infections,
2,3,treatment of cough with mucus,
3,4,treatment of sneezing and runny nose due to al...,treatment of allergic conditions
4,5,treatment of allergic conditions,
5,6,treatment of sneezing and runny nose due to al...,
6,7,treatment of bacterial infections,
7,8,treatment of bacterial infections,
8,9,treatment of anxiety,treatment of skin conditions with inflammation...
9,10,treatment of dry cough,


#### Transform Drug Class

In [13]:
Drug_Class_normalized = normalize_strings(Drug_Class, transformations_4)

In [14]:
Drug_Class_normalized.head(40)

Unnamed: 0,drug_id,chemical_class,therapeutic_class,action_class
0,1,,anti infectives,
1,2,macrolides,anti infectives,macrolides
2,3,,respiratory,
3,4,diphenylmethane derivative,respiratory,h1 antihistaminics (second generation)
4,5,pyridines derivatives,respiratory,h1 antihistaminics (first generation)
5,6,,respiratory,
6,7,,anti infectives,
7,8,macrolides,anti infectives,macrolides
8,9,piperazine derivative,respiratory,h1 antihistaminics (first generation)
9,10,,respiratory,


#### Transform Drug_State

In [15]:
def extract_generic_names(row):
    """
    Extracts generic names from generic_name columns in a DataFrame row.

    The generic name is taken to be the first whitespace-separated word of
    each of the columns generic_name_1 .. generic_name_5.

    Args:
        row (pd.Series): A pandas Series representing a row of a DataFrame.
            It must contain the labels generic_name_1 .. generic_name_5.

    Returns:
        str or None: Comma-separated string of the extracted generic names, in
        column order. None when every generic_name_* value in the row is
        null. Values that are not strings, or that are empty or
        whitespace-only strings, contribute nothing to the result.

    Example:
        If the row has values
        'generic_name_1': 'penciclav 500mg/125mg tablet',
        'generic_name_2': 'moxikind-cv 625 tablet',
        the function will return 'penciclav,moxikind-cv'.
    """
    # Check if all generic_name columns are NaN
    if row.filter(like='generic_name_').isnull().all():
        return None

    generic_names = []
    # Iterate through the generic_name columns
    for i in range(1, 6):
        generic_name = row[f'generic_name_{i}']
        # NaN cells are floats, so the string check also skips missing values
        if not isinstance(generic_name, str):
            continue
        words = generic_name.split()
        # An empty or whitespace-only cell splits into no words; indexing [0]
        # on it would raise IndexError, so skip it instead.
        if words:
            generic_names.append(words[0])

    # Join the extracted generic names into a comma-separated string
    return ','.join(generic_names)

In [16]:
# Administration methods recognised in a generic name.
_ADMINISTRATION_METHODS = (
    'tablet', 'syrup', 'capsule', 'injection',
    'suspension', 'iv', 'ointment', 'drop', 'cream',
    'inhaler', 'kit', 'oral drops', 'nasal spray', 'lotion',
    'spray', 'eye gel', 'oral gel', 'eye drop',
    'eye/ear drops', 'dispopen', 'flextouch', 'soap',
    'oral solution', 'shampoo', 'expectorant',
)

# Compiled once at definition time instead of on every call. Each method is
# escaped so it is always matched literally, and the alternation is wrapped in
# word boundaries so that e.g. 'iv' is not found inside a longer word.
_ADMINISTRATION_METHOD_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(method) for method in _ADMINISTRATION_METHODS) + r')\b',
    flags=re.IGNORECASE,
)


def extract_administration_method(generic_name):
    """
    This function extracts the administration method from a given generic name.

    Parameters:
    generic_name (str): The generic name of a medication.

    Returns:
    str: The extracted administration method, exactly as it is written in
    `generic_name` (matching is case-insensitive, the returned text keeps the
    input's casing). If no known method is found, or if `generic_name` is not
    a string (for example NaN), the input is returned unchanged.

    The method is the leftmost whole-word occurrence of any entry in the list
    of known administration methods.
    """
    # Non-string values (e.g. NaN for a missing name) are passed through untouched
    if not isinstance(generic_name, str):
        return generic_name

    match = _ADMINISTRATION_METHOD_PATTERN.search(generic_name)
    if match:
        return match.group(0)

    # No known method in the name: hand the original text back
    return generic_name

In [17]:
Drug_State.head(20)

Unnamed: 0,drug_id,use_case_1,generic_name_1,generic_name_2,generic_name_3,generic_name_4,generic_name_5,therapeutic_class
0,1,Treatment of Bacterial infections,Penciclav 500 mg/125 mg Tablet,Moxikind-CV 625 Tablet,Moxiforce-CV 625 Tablet,Fightox 625 Tablet,Novamox CV 625mg Tablet,ANTI INFECTIVES
1,2,Treatment of Bacterial infections,Zithrocare 500mg Tablet,Azax 500 Tablet,Zady 500 Tablet,Cazithro 500mg Tablet,Trulimax 500mg Tablet,ANTI INFECTIVES
2,3,Treatment of Cough with mucus,Solvin LS Syrup,Ambrodil-LX Syrup,Zerotuss XP Syrup,Capex LS Syrup,Broxum LS Syrup,RESPIRATORY
3,4,Treatment of Sneezing and runny nose due to al...,Lcfex Tablet,Etofex 120mg Tablet,Nexofex 120mg Tablet,Fexise 120mg Tablet,Histafree 120 Tablet,RESPIRATORY
4,5,Treatment of Allergic conditions,Eralet 25mg Tablet,,,,,RESPIRATORY
5,6,Treatment of Sneezing and runny nose due to al...,Emlukast-FX Tablet,LCFEX-Mont Tablet,Fixar 10mg/120mg Tablet,Histakind-M Tablet,Histafree-M Tablet,RESPIRATORY
6,7,Treatment of Bacterial infections,Penciclav 500 mg/125 mg Tablet,Moxikind-CV 625 Tablet,Moxiforce-CV 625 Tablet,Fightox 625 Tablet,Novamox CV 625mg Tablet,ANTI INFECTIVES
7,8,Treatment of Bacterial infections,Zithrocare 500mg Tablet,Azax 500 Tablet,Zady 500 Tablet,Cazithro 500mg Tablet,Trulimax 500mg Tablet,ANTI INFECTIVES
8,9,Treatment of Anxiety,HD Zine 25mg Tablet,Hyzox 25 Tablet,Hizet 25mg Tablet,Hydil 25mg Tablet,Zyzine 25mg Tablet,RESPIRATORY
9,10,Treatment of Dry cough,Arnikof D Syrup,Cofsolve-D Syrup,Tucin D Syrup,Akof-D Syrup Sugar Free,Krisbro D Syrup,RESPIRATORY


In [18]:
Drug_State_normalized = normalize_strings(Drug_State, transformations_5)

In [19]:
# Create the new column 'generic_profile'
Drug_State_normalized['generic_profile'] = Drug_State_normalized.apply(extract_generic_names, axis=1)

In [20]:
# Apply the function to the generic_name_1 column
Drug_State_normalized['administration_method'] = Drug_State_normalized['generic_name_1'].apply(extract_administration_method)

In [21]:
Drug_State_normalized.head(20)

Unnamed: 0,drug_id,use_case_1,generic_name_1,generic_name_2,generic_name_3,generic_name_4,generic_name_5,therapeutic_class,generic_profile,administration_method
0,1,treatment of bacterial infections,penciclav 500mg/125mg tablet,moxikind-cv 625 tablet,moxiforce-cv 625 tablet,fightox 625 tablet,novamox cv 625mg tablet,anti infectives,"penciclav,moxikind-cv,moxiforce-cv,fightox,nov...",tablet
1,2,treatment of bacterial infections,zithrocare 500mg tablet,azax 500 tablet,zady 500 tablet,cazithro 500mg tablet,trulimax 500mg tablet,anti infectives,"zithrocare,azax,zady,cazithro,trulimax",tablet
2,3,treatment of cough with mucus,solvin ls syrup,ambrodil-lx syrup,zerotuss xp syrup,capex ls syrup,broxum ls syrup,respiratory,"solvin,ambrodil-lx,zerotuss,capex,broxum",syrup
3,4,treatment of sneezing and runny nose due to al...,lcfex tablet,etofex 120mg tablet,nexofex 120mg tablet,fexise 120mg tablet,histafree 120 tablet,respiratory,"lcfex,etofex,nexofex,fexise,histafree",tablet
4,5,treatment of allergic conditions,eralet 25mg tablet,,,,,respiratory,eralet,tablet
5,6,treatment of sneezing and runny nose due to al...,emlukast-fx tablet,lcfex-mont tablet,fixar 10mg/120mg tablet,histakind-m tablet,histafree-m tablet,respiratory,"emlukast-fx,lcfex-mont,fixar,histakind-m,hista...",tablet
6,7,treatment of bacterial infections,penciclav 500mg/125mg tablet,moxikind-cv 625 tablet,moxiforce-cv 625 tablet,fightox 625 tablet,novamox cv 625mg tablet,anti infectives,"penciclav,moxikind-cv,moxiforce-cv,fightox,nov...",tablet
7,8,treatment of bacterial infections,zithrocare 500mg tablet,azax 500 tablet,zady 500 tablet,cazithro 500mg tablet,trulimax 500mg tablet,anti infectives,"zithrocare,azax,zady,cazithro,trulimax",tablet
8,9,treatment of anxiety,hd zine 25mg tablet,hyzox 25 tablet,hizet 25mg tablet,hydil 25mg tablet,zyzine 25mg tablet,respiratory,"hd,hyzox,hizet,hydil,zyzine",tablet
9,10,treatment of dry cough,arnikof d syrup,cofsolve-d syrup,tucin d syrup,akof-d syrup sugar free,krisbro d syrup,respiratory,"arnikof,cofsolve-d,tucin,akof-d,krisbro",syrup


In [22]:
# Drop the generic_name columns
columns_to_drop = ['generic_name_1', 'generic_name_2',
                'generic_name_3', 'generic_name_4', 'generic_name_5']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell is a no-op rather than a KeyError.
Drug_State_normalized = (
    Drug_State_normalized
    .drop(columns=columns_to_drop, errors='ignore')
    .reset_index(drop=True)
)

In [23]:
# Old column name -> new column name for the drug-state table.
cols = {
    'use_case_1': 'case_definition',
    'therapeutic_class': 'therapeutic_type'
}

# Rebind rather than rename in place. rename() skips labels that are not
# present, so re-running this cell leaves the frame unchanged.
Drug_State_normalized = Drug_State_normalized.rename(columns=cols)

In [24]:
Drug_State_normalized.head(20)

Unnamed: 0,drug_id,case_definition,therapeutic_type,generic_profile,administration_method
0,1,treatment of bacterial infections,anti infectives,"penciclav,moxikind-cv,moxiforce-cv,fightox,nov...",tablet
1,2,treatment of bacterial infections,anti infectives,"zithrocare,azax,zady,cazithro,trulimax",tablet
2,3,treatment of cough with mucus,respiratory,"solvin,ambrodil-lx,zerotuss,capex,broxum",syrup
3,4,treatment of sneezing and runny nose due to al...,respiratory,"lcfex,etofex,nexofex,fexise,histafree",tablet
4,5,treatment of allergic conditions,respiratory,eralet,tablet
5,6,treatment of sneezing and runny nose due to al...,respiratory,"emlukast-fx,lcfex-mont,fixar,histakind-m,hista...",tablet
6,7,treatment of bacterial infections,anti infectives,"penciclav,moxikind-cv,moxiforce-cv,fightox,nov...",tablet
7,8,treatment of bacterial infections,anti infectives,"zithrocare,azax,zady,cazithro,trulimax",tablet
8,9,treatment of anxiety,respiratory,"hd,hyzox,hizet,hydil,zyzine",tablet
9,10,treatment of dry cough,respiratory,"arnikof,cofsolve-d,tucin,akof-d,krisbro",syrup


#### Transform Side Effects

In [25]:
Side_Effects.head(20)

Unnamed: 0,drug_id,side_effect_1,side_effect_2,side_effect_3,side_effect_4,side_effect_5
0,1,Vomiting,Nausea,Diarrhea,,
1,2,Vomiting,Nausea,Abdominal pain,Diarrhea,
2,3,Nausea,Vomiting,Diarrhea,Upset stomach,Stomach pain
3,4,Headache,Drowsiness,Dizziness,Nausea,
4,5,Sleepiness,Dryness in mouth,,,
5,6,Nausea,Diarrhea,Vomiting,Skin rash,Flu-like symptoms
6,7,Vomiting,Nausea,Diarrhea,,
7,8,Vomiting,Nausea,Abdominal pain,Diarrhea,
8,9,Sedation,Nausea,Vomiting,Upset stomach,Constipation
9,10,Nausea,Vomiting,Loss of appetite,Headache,


In [26]:
Side_Effects_normalized = normalize_strings(Side_Effects, transformations_6)

In [27]:
Side_Effects_normalized.head(20)

Unnamed: 0,drug_id,side_effect_1,side_effect_2,side_effect_3,side_effect_4,side_effect_5
0,1,vomiting,nausea,diarrhea,,
1,2,vomiting,nausea,abdominal pain,diarrhea,
2,3,nausea,vomiting,diarrhea,upset stomach,stomach pain
3,4,headache,drowsiness,dizziness,nausea,
4,5,sleepiness,dryness in mouth,,,
5,6,nausea,diarrhea,vomiting,skin rash,flu-like symptoms
6,7,vomiting,nausea,diarrhea,,
7,8,vomiting,nausea,abdominal pain,diarrhea,
8,9,sedation,nausea,vomiting,upset stomach,constipation
9,10,nausea,vomiting,loss of appetite,headache,
